PostgreSQL Source Code  git master
nbtpage.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"

Functions

static BTMetaPageData * _bt_getmeta (Relation rel, Buffer metabuf)
 
static void _bt_log_reuse_page (Relation rel, BlockNumber blkno, FullTransactionId safexid)
 
static void _bt_delitems_delete (Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
static char * _bt_delitems_update (BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
 
static bool _bt_mark_page_halfdead (Relation rel, Buffer leafbuf, BTStack stack)
 
static bool _bt_unlink_halfdead_page (Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
 
static bool _bt_lock_subtree_parent (Relation rel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
 
static void _bt_pendingfsm_add (BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
void _bt_upgrademetapage (Page page)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
static int _bt_delitems_cmp (const void *a, const void *b)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
static bool _bt_leftsib_splitflag (Relation rel, BlockNumber leftsib, BlockNumber target)
 
static bool _bt_rightsib_halfdeadflag (Relation rel, BlockNumber leafrightsib)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 

Function Documentation

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 794 of file nbtpage.c.

795 {
796  Page page = BufferGetPage(buf);
797 
798  /*
799  * ReadBuffer verifies that every newly-read page passes
800  * PageHeaderIsValid, which means it either contains a reasonably sane
801  * page header or is all-zero. We have to defend against the all-zero
802  * case, however.
803  */
804  if (PageIsNew(page))
805  ereport(ERROR,
806  (errcode(ERRCODE_INDEX_CORRUPTED),
807  errmsg("index \"%s\" contains unexpected zero page at block %u",
808  RelationGetRelationName(rel),
809  BufferGetBlockNumber(buf)),
810  errhint("Please REINDEX it.")));
811 
812  /*
813  * Additionally check that the special area looks sane.
814  */
815  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
816  ereport(ERROR,
817  (errcode(ERRCODE_INDEX_CORRUPTED),
818  errmsg("index \"%s\" contains corrupted page at block %u",
819  RelationGetRelationName(rel),
820  BufferGetBlockNumber(buf)),
821  errhint("Please REINDEX it.")));
822 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2811
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:285
Pointer Page
Definition: bufpage.h:78
static bool PageIsNew(Page page)
Definition: bufpage.h:230
static uint16 PageGetSpecialSize(Page page)
Definition: bufpage.h:313
#define MAXALIGN(LEN)
Definition: c.h:795
int errhint(const char *fmt,...)
Definition: elog.c:1316
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static char * buf
Definition: pg_test_fsync.c:67
#define RelationGetRelationName(relation)
Definition: rel.h:537

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
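
The calling pattern is pin first, then lock, then check. A minimal sketch, modeled on the ReadBuffer/_bt_lockbuf/_bt_checkpage sequence that _bt_getbuf() itself uses (rel and blkno are assumed to be a valid nbtree index relation and an existing block number):

    Buffer      buf;

    buf = ReadBuffer(rel, blkno);       /* pin the block */
    _bt_lockbuf(rel, buf, BT_READ);     /* share-lock it */
    _bt_checkpage(rel, buf);            /* ereport(ERROR) on zeroed/corrupt pages */

    /* ... examine the page ... */

    _bt_relbuf(rel, buf);               /* drop lock and pin together */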

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1105 of file nbtpage.c.

1106 {
1107  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1108  if (!ConditionalLockBuffer(buf))
1109  return false;
1110 
1111  if (!RelationUsesLocalBuffers(rel))
1112  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1113 
1114  return true;
1115 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4272
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), and _bt_search_insert().
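
This is the non-blocking variant of _bt_lockbuf(), for callers that cannot risk waiting on a page that someone else (possibly the caller itself) already has locked. A sketch of the fallback pattern, mirroring the FSM-page loop in _bt_getbuf():

    buf = ReadBuffer(rel, blkno);
    if (_bt_conditionallockbuf(rel, buf))
    {
        /* got the exclusive lock; safe to inspect and maybe recycle the page */
    }
    else
    {
        /* lock unavailable; assume the page is in use and just drop the pin */
        ReleaseBuffer(buf);
    }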

◆ _bt_delitems_cmp()

static int _bt_delitems_cmp ( const void *  a,
const void *  b 
)
static

Definition at line 1475 of file nbtpage.c.

1476 {
1477  TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1478  TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1479 
1480  if (indexdelete1->id > indexdelete2->id)
1481  return 1;
1482  if (indexdelete1->id < indexdelete2->id)
1483  return -1;
1484 
1485  Assert(false);
1486 
1487  return 0;
1488 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69
Assert(fmt[strlen(fmt) - 1] !='\n')

References a, Assert(), b, and TM_IndexDelete::id.

Referenced by _bt_delitems_delete_check().
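
The comparator orders TM_IndexDelete entries by their id field alone; the Assert(false) documents the expectation that no two entries ever share an id. Its only caller uses it to restore leaf-page-wise order after the tableam call has reordered the array:

    qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
          _bt_delitems_cmp);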

◆ _bt_delitems_delete()

static void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
TransactionId  snapshotConflictHorizon,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)
static

Definition at line 1296 of file nbtpage.c.

1300 {
1301  Page page = BufferGetPage(buf);
1302  BTPageOpaque opaque;
1303  bool needswal = RelationNeedsWAL(rel);
1304  char *updatedbuf = NULL;
1305  Size updatedbuflen = 0;
1306  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1307 
1308  /* Shouldn't be called unless there's something to do */
1309  Assert(ndeletable > 0 || nupdatable > 0);
1310 
1311  /* Generate new versions of posting lists without deleted TIDs */
1312  if (nupdatable > 0)
1313  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1314  updatedoffsets, &updatedbuflen,
1315  needswal);
1316 
1317  /* No ereport(ERROR) until changes are logged */
1318  START_CRIT_SECTION();
1319 
1320  /* Handle updates and deletes just like _bt_delitems_vacuum */
1321  for (int i = 0; i < nupdatable; i++)
1322  {
1323  OffsetNumber updatedoffset = updatedoffsets[i];
1324  IndexTuple itup;
1325  Size itemsz;
1326 
1327  itup = updatable[i]->itup;
1328  itemsz = MAXALIGN(IndexTupleSize(itup));
1329  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1330  itemsz))
1331  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1332  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1333  }
1334 
1335  if (ndeletable > 0)
1336  PageIndexMultiDelete(page, deletable, ndeletable);
1337 
1338  /*
1339  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1340  * this point. The VACUUM command alone controls vacuum cycle IDs.
1341  */
1342  opaque = BTPageGetOpaque(page);
1343 
1344  /*
1345  * Clear the BTP_HAS_GARBAGE page flag.
1346  *
1347  * This flag indicates the presence of LP_DEAD items on the page (though
1348  * not reliably). Note that we only rely on it with pg_upgrade'd
1349  * !heapkeyspace indexes.
1350  */
1351  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1352 
1353  MarkBufferDirty(buf);
1354 
1355  /* XLOG stuff */
1356  if (needswal)
1357  {
1358  XLogRecPtr recptr;
1359  xl_btree_delete xlrec_delete;
1360 
1361  xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
1362  xlrec_delete.ndeleted = ndeletable;
1363  xlrec_delete.nupdated = nupdatable;
1364 
1365  XLogBeginInsert();
1366  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1367  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1368 
1369  if (ndeletable > 0)
1370  XLogRegisterBufData(0, (char *) deletable,
1371  ndeletable * sizeof(OffsetNumber));
1372 
1373  if (nupdatable > 0)
1374  {
1375  XLogRegisterBufData(0, (char *) updatedoffsets,
1376  nupdatable * sizeof(OffsetNumber));
1377  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1378  }
1379 
1380  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1381 
1382  PageSetLSN(page, recptr);
1383  }
1384 
1385  END_CRIT_SECTION();
1386 
1387  /* can't leak memory here */
1388  if (updatedbuf != NULL)
1389  pfree(updatedbuf);
1390  /* free tuples allocated within _bt_delitems_update() */
1391  for (int i = 0; i < nupdatable; i++)
1392  pfree(updatable[i]->itup);
1393 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1631
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1405
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
size_t Size
Definition: c.h:589
#define PANIC
Definition: elog.h:42
int i
Definition: isn.c:73
Pointer Item
Definition: item.h:17
#define IndexTupleSize(itup)
Definition: itup.h:70
#define MaxIndexTuplesPerPage
Definition: itup.h:165
void pfree(void *pointer)
Definition: mcxt.c:1436
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1416
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define SizeOfBtreeDelete
Definition: nbtxlog.h:245
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:34
uint16 OffsetNumber
Definition: off.h:24
#define RelationNeedsWAL(relation)
Definition: rel.h:628
uint16 btpo_flags
Definition: nbtree.h:67
IndexTuple itup
Definition: nbtree.h:912
TransactionId snapshotConflictHorizon
Definition: nbtxlog.h:232
uint16 ndeleted
Definition: nbtxlog.h:233
uint16 nupdated
Definition: nbtxlog.h:234
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:351
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:451
void XLogRegisterBufData(uint8 block_id, char *data, uint32 len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
#define REGBUF_STANDARD
Definition: xloginsert.h:34

References _bt_delitems_update(), Assert(), BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog(), END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_delete::ndeleted, xl_btree_delete::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeDelete, xl_btree_delete::snapshotConflictHorizon, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delitems_delete_check().

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1529 of file nbtpage.c.

1531 {
1532  Page page = BufferGetPage(buf);
1533  TransactionId snapshotConflictHorizon;
1534  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1535  int ndeletable = 0,
1536  nupdatable = 0;
1537  OffsetNumber deletable[MaxIndexTuplesPerPage];
1538  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1539 
1540  /* Use tableam interface to determine which tuples to delete first */
1541  snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1542 
1543  /* Should not WAL-log snapshotConflictHorizon unless it's required */
1544  if (!XLogStandbyInfoActive())
1545  snapshotConflictHorizon = InvalidTransactionId;
1546 
1547  /*
1548  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1549  * needs to do to physically delete index tuples from the page.
1550  *
1551  * Must sort deltids array to restore leaf-page-wise order (original order
1552  * before call to tableam). This is the order that the loop expects.
1553  *
1554  * Note that deltids array might be a lot smaller now. It might even have
1555  * no entries at all (with bottom-up deletion caller), in which case there
1556  * is nothing left to do.
1557  */
1558  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1559  _bt_delitems_cmp);
1560  if (delstate->ndeltids == 0)
1561  {
1562  Assert(delstate->bottomup);
1563  return;
1564  }
1565 
1566  /* We definitely have to delete at least one index tuple (or one TID) */
1567  for (int i = 0; i < delstate->ndeltids; i++)
1568  {
1569  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1570  OffsetNumber idxoffnum = dstatus->idxoffnum;
1571  ItemId itemid = PageGetItemId(page, idxoffnum);
1572  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1573  int nestedi,
1574  nitem;
1575  BTVacuumPosting vacposting;
1576 
1577  Assert(OffsetNumberIsValid(idxoffnum));
1578 
1579  if (idxoffnum == postingidxoffnum)
1580  {
1581  /*
1582  * This deltid entry is a TID from a posting list tuple that has
1583  * already been completely processed
1584  */
1585  Assert(BTreeTupleIsPosting(itup));
1586  Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1587  &delstate->deltids[i].tid) < 0);
1588  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1589  &delstate->deltids[i].tid) >= 0);
1590  continue;
1591  }
1592 
1593  if (!BTreeTupleIsPosting(itup))
1594  {
1595  /* Plain non-pivot tuple */
1596  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1597  if (dstatus->knowndeletable)
1598  deletable[ndeletable++] = idxoffnum;
1599  continue;
1600  }
1601 
1602  /*
1603  * itup is a posting list tuple whose lowest deltids entry (which may
1604  * or may not be for the first TID from itup) is considered here now.
1605  * We should process all of the deltids entries for the posting list
1606  * together now, though (not just the lowest). Remember to skip over
1607  * later itup-related entries during later iterations of outermost
1608  * loop.
1609  */
1610  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1611  nestedi = i; /* Initialize for first itup deltids entry */
1612  vacposting = NULL; /* Describes final action for itup */
1613  nitem = BTreeTupleGetNPosting(itup);
1614  for (int p = 0; p < nitem; p++)
1615  {
1616  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1617  int ptidcmp = -1;
1618 
1619  /*
1620  * This nested loop reuses work across ptid TIDs taken from itup.
1621  * We take advantage of the fact that both itup's TIDs and deltids
1622  * entries (within a single itup/posting list grouping) must both
1623  * be in ascending TID order.
1624  */
1625  for (; nestedi < delstate->ndeltids; nestedi++)
1626  {
1627  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1628  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1629 
1630  /* Stop once we get past all itup related deltids entries */
1631  Assert(tdstatus->idxoffnum >= idxoffnum);
1632  if (tdstatus->idxoffnum != idxoffnum)
1633  break;
1634 
1635  /* Skip past non-deletable itup related entries up front */
1636  if (!tdstatus->knowndeletable)
1637  continue;
1638 
1639  /* Entry is first partial ptid match (or an exact match)? */
1640  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1641  if (ptidcmp >= 0)
1642  {
1643  /* Greater than or equal (partial or exact) match... */
1644  break;
1645  }
1646  }
1647 
1648  /* ...exact ptid match to a deletable deltids entry? */
1649  if (ptidcmp != 0)
1650  continue;
1651 
1652  /* Exact match for deletable deltids entry -- ptid gets deleted */
1653  if (vacposting == NULL)
1654  {
1655  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1656  nitem * sizeof(uint16));
1657  vacposting->itup = itup;
1658  vacposting->updatedoffset = idxoffnum;
1659  vacposting->ndeletedtids = 0;
1660  }
1661  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1662  }
1663 
1664  /* Final decision on itup, a posting list tuple */
1665 
1666  if (vacposting == NULL)
1667  {
1668  /* No TIDs to delete from itup -- do nothing */
1669  }
1670  else if (vacposting->ndeletedtids == nitem)
1671  {
1672  /* Straight delete of itup (to delete all TIDs) */
1673  deletable[ndeletable++] = idxoffnum;
1674  /* Turns out we won't need granular information */
1675  pfree(vacposting);
1676  }
1677  else
1678  {
1679  /* Delete some (but not all) TIDs from itup */
1680  Assert(vacposting->ndeletedtids > 0 &&
1681  vacposting->ndeletedtids < nitem);
1682  updatable[nupdatable++] = vacposting;
1683  }
1684  }
1685 
1686  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1687  _bt_delitems_delete(rel, buf, snapshotConflictHorizon,
1688  deletable, ndeletable, updatable, nupdatable);
1689 
1690  /* be tidy */
1691  for (int i = 0; i < nupdatable; i++)
1692  pfree(updatable[i]);
1693 }
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
unsigned short uint16
Definition: c.h:489
uint32 TransactionId
Definition: c.h:636
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
IndexTupleData * IndexTuple
Definition: itup.h:53
void * palloc(Size size)
Definition: mcxt.c:1210
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1296
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1475
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:445
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:917
uint16 ndeletedtids
Definition: nbtree.h:916
OffsetNumber updatedoffset
Definition: nbtree.h:913
ItemPointerData t_tid
Definition: itup.h:37
TM_IndexStatus * status
Definition: tableam.h:247
TM_IndexDelete * deltids
Definition: tableam.h:246
ItemPointerData tid
Definition: tableam.h:205
bool knowndeletable
Definition: tableam.h:212
OffsetNumber idxoffnum
Definition: tableam.h:211
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1353
#define InvalidTransactionId
Definition: transam.h:31
#define XLogStandbyInfoActive()
Definition: xlog.h:118

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert(), TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().

◆ _bt_delitems_update()

static char * _bt_delitems_update ( BTVacuumPosting updatable,
int  nupdatable,
OffsetNumber updatedoffsets,
Size updatedbuflen,
bool  needswal 
)
static

Definition at line 1416 of file nbtpage.c.

1419 {
1420  char *updatedbuf = NULL;
1421  Size buflen = 0;
1422 
1423  /* Shouldn't be called unless there's something to do */
1424  Assert(nupdatable > 0);
1425 
1426  for (int i = 0; i < nupdatable; i++)
1427  {
1428  BTVacuumPosting vacposting = updatable[i];
1429  Size itemsz;
1430 
1431  /* Replace work area IndexTuple with updated version */
1432  _bt_update_posting(vacposting);
1433 
1434  /* Keep track of size of xl_btree_update for updatedbuf in passing */
1435  itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1436  buflen += itemsz;
1437 
1438  /* Build updatedoffsets buffer in passing */
1439  updatedoffsets[i] = vacposting->updatedoffset;
1440  }
1441 
1442  /* XLOG stuff */
1443  if (needswal)
1444  {
1445  Size offset = 0;
1446 
1447  /* Allocate, set final size for caller */
1448  updatedbuf = palloc(buflen);
1449  *updatedbuflen = buflen;
1450  for (int i = 0; i < nupdatable; i++)
1451  {
1452  BTVacuumPosting vacposting = updatable[i];
1453  Size itemsz;
1454  xl_btree_update update;
1455 
1456  update.ndeletedtids = vacposting->ndeletedtids;
1457  memcpy(updatedbuf + offset, &update.ndeletedtids,
1458  SizeOfBtreeUpdate);
1459  offset += SizeOfBtreeUpdate;
1460 
1461  itemsz = update.ndeletedtids * sizeof(uint16);
1462  memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1463  offset += itemsz;
1464  }
1465  }
1466 
1467  return updatedbuf;
1468 }
void _bt_update_posting(BTVacuumPosting vacposting)
Definition: nbtdedup.c:924
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:260
uint16 ndeletedtids
Definition: nbtxlog.h:255

References _bt_update_posting(), Assert(), BTVacuumPostingData::deletetids, i, BTVacuumPostingData::ndeletedtids, xl_btree_update::ndeletedtids, palloc(), SizeOfBtreeUpdate, and BTVacuumPostingData::updatedoffset.

Referenced by _bt_delitems_delete(), and _bt_delitems_vacuum().

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1166 of file nbtpage.c.

1169 {
1170  Page page = BufferGetPage(buf);
1171  BTPageOpaque opaque;
1172  bool needswal = RelationNeedsWAL(rel);
1173  char *updatedbuf = NULL;
1174  Size updatedbuflen = 0;
1175  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1176 
1177  /* Shouldn't be called unless there's something to do */
1178  Assert(ndeletable > 0 || nupdatable > 0);
1179 
1180  /* Generate new version of posting lists without deleted TIDs */
1181  if (nupdatable > 0)
1182  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1183  updatedoffsets, &updatedbuflen,
1184  needswal);
1185 
1186  /* No ereport(ERROR) until changes are logged */
1187  START_CRIT_SECTION();
1188 
1189  /*
1190  * Handle posting tuple updates.
1191  *
1192  * Deliberately do this before handling simple deletes. If we did it the
1193  * other way around (i.e. WAL record order -- simple deletes before
1194  * updates) then we'd have to make compensating changes to the 'updatable'
1195  * array of offset numbers.
1196  *
1197  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1198  * happens to already be set. It's important that we not interfere with
1199  * any future simple index tuple deletion operations.
1200  */
1201  for (int i = 0; i < nupdatable; i++)
1202  {
1203  OffsetNumber updatedoffset = updatedoffsets[i];
1204  IndexTuple itup;
1205  Size itemsz;
1206 
1207  itup = updatable[i]->itup;
1208  itemsz = MAXALIGN(IndexTupleSize(itup));
1209  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1210  itemsz))
1211  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1212  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1213  }
1214 
1215  /* Now handle simple deletes of entire tuples */
1216  if (ndeletable > 0)
1217  PageIndexMultiDelete(page, deletable, ndeletable);
1218 
1219  /*
1220  * We can clear the vacuum cycle ID since this page has certainly been
1221  * processed by the current vacuum scan.
1222  */
1223  opaque = BTPageGetOpaque(page);
1224  opaque->btpo_cycleid = 0;
1225 
1226  /*
1227  * Clear the BTP_HAS_GARBAGE page flag.
1228  *
1229  * This flag indicates the presence of LP_DEAD items on the page (though
1230  * not reliably). Note that we only rely on it with pg_upgrade'd
1231  * !heapkeyspace indexes. That's why clearing it here won't usually
1232  * interfere with simple index tuple deletion.
1233  */
1234  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1235 
1236  MarkBufferDirty(buf);
1237 
1238  /* XLOG stuff */
1239  if (needswal)
1240  {
1241  XLogRecPtr recptr;
1242  xl_btree_vacuum xlrec_vacuum;
1243 
1244  xlrec_vacuum.ndeleted = ndeletable;
1245  xlrec_vacuum.nupdated = nupdatable;
1246 
1247  XLogBeginInsert();
1248  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1249  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1250 
1251  if (ndeletable > 0)
1252  XLogRegisterBufData(0, (char *) deletable,
1253  ndeletable * sizeof(OffsetNumber));
1254 
1255  if (nupdatable > 0)
1256  {
1257  XLogRegisterBufData(0, (char *) updatedoffsets,
1258  nupdatable * sizeof(OffsetNumber));
1259  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1260  }
1261 
1262  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1263 
1264  PageSetLSN(page, recptr);
1265  }
1266 
1267  END_CRIT_SECTION();
1268 
1269  /* can't leak memory here */
1270  if (updatedbuf != NULL)
1271  pfree(updatedbuf);
1272  /* free tuples allocated within _bt_delitems_update() */
1273  for (int i = 0; i < nupdatable; i++)
1274  pfree(updatable[i]->itup);
1275 }
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:228
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:220
uint16 nupdated
Definition: nbtxlog.h:221

References _bt_delitems_update(), Assert(), BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog(), END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
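
A simplified sketch of the btvacuumpage() calling convention (the real caller also maintains vacuum statistics): both arrays are filled while scanning the page's items, and a single call then applies all changes at once:

    OffsetNumber deletable[MaxIndexTuplesPerPage];
    BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    int         ndeletable = 0;
    int         nupdatable = 0;

    /* ... examine each item, appending to deletable[] or updatable[] ... */

    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);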

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 871 of file nbtpage.c.

872 {
873  Buffer buf;
874 
875  if (blkno != P_NEW)
876  {
877  /* Read an existing block of the relation */
878  buf = ReadBuffer(rel, blkno);
879  _bt_lockbuf(rel, buf, access);
880  _bt_checkpage(rel, buf);
881  }
882  else
883  {
884  bool needLock;
885  Page page;
886 
887  Assert(access == BT_WRITE);
888 
889  /*
890  * First see if the FSM knows of any free pages.
891  *
892  * We can't trust the FSM's report unreservedly; we have to check that
893  * the page is still free. (For example, an already-free page could
894  * have been re-used between the time the last VACUUM scanned it and
895  * the time the VACUUM made its FSM updates.)
896  *
897  * In fact, it's worse than that: we can't even assume that it's safe
898  * to take a lock on the reported page. If somebody else has a lock
899  * on it, or even worse our own caller does, we could deadlock. (The
900  * own-caller scenario is actually not improbable. Consider an index
901  * on a serial or timestamp column. Nearly all splits will be at the
902  * rightmost page, so it's entirely likely that _bt_split will call us
903  * while holding a lock on the page most recently acquired from FSM. A
904  * VACUUM running concurrently with the previous split could well have
905  * placed that page back in FSM.)
906  *
907  * To get around that, we ask for only a conditional lock on the
908  * reported page. If we fail, then someone else is using the page,
909  * and we may reasonably assume it's not free. (If we happen to be
910  * wrong, the worst consequence is the page will be lost to use till
911  * the next VACUUM, which is no big problem.)
912  */
913  for (;;)
914  {
915  blkno = GetFreeIndexPage(rel);
916  if (blkno == InvalidBlockNumber)
917  break;
918  buf = ReadBuffer(rel, blkno);
919  if (_bt_conditionallockbuf(rel, buf))
920  {
921  page = BufferGetPage(buf);
922 
923  /*
924  * It's possible to find an all-zeroes page in an index. For
925  * example, a backend might successfully extend the relation
926  * one page and then crash before it is able to make a WAL
927  * entry for adding the page. If we find a zeroed page then
928  * reclaim it immediately.
929  */
930  if (PageIsNew(page))
931  {
932  /* Okay to use page. Initialize and return it. */
933  _bt_pageinit(page, BufferGetPageSize(buf));
934  return buf;
935  }
936 
937  if (BTPageIsRecyclable(page))
938  {
939  /*
940  * If we are generating WAL for Hot Standby then create a
941  * WAL record that will allow us to conflict with queries
942  * running on standby, in case they have snapshots older
943  * than safexid value
944  */
945  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
946  _bt_log_reuse_page(rel, blkno,
947  BTPageGetDeleteXid(page));
948 
949  /* Okay to use page. Re-initialize and return it. */
950  _bt_pageinit(page, BufferGetPageSize(buf));
951  return buf;
952  }
953  elog(DEBUG2, "FSM returned nonrecyclable page");
954  _bt_relbuf(rel, buf);
955  }
956  else
957  {
958  elog(DEBUG2, "FSM returned nonlockable page");
959  /* couldn't get lock, so just drop pin */
960  ReleaseBuffer(buf);
961  }
962  }
963 
964  /*
965  * Extend the relation by one page.
966  *
967  * We have to use a lock to ensure no one else is extending the rel at
968  * the same time, else we will both try to initialize the same new
969  * page. We can skip locking for new or temp relations, however,
970  * since no one else could be accessing them.
971  */
972  needLock = !RELATION_IS_LOCAL(rel);
973 
974  if (needLock)
975  LockRelationForExtension(rel, ExclusiveLock);
976 
977  buf = ReadBuffer(rel, P_NEW);
978 
979  /* Acquire buffer lock on new page */
980  _bt_lockbuf(rel, buf, BT_WRITE);
981 
982  /*
983  * Release the file-extension lock; it's now OK for someone else to
984  * extend the relation some more. Note that we cannot release this
985  * lock before we have buffer lock on the new page, or we risk a race
986  * condition against btvacuumscan --- see comments therein.
987  */
988  if (needLock)
989  UnlockRelationForExtension(rel, ExclusiveLock);
990 
991  /* Initialize the new page before returning it */
992  page = BufferGetPage(buf);
993  Assert(PageIsNew(page));
994  _bt_pageinit(page, BufferGetPageSize(buf));
995  }
996 
997  /* ref count and lock type are correct */
998  return buf;
999 }
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4005
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:704
#define P_NEW
Definition: bufmgr.h:105
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:271
#define DEBUG2
Definition: elog.h:29
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:431
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:481
#define ExclusiveLock
Definition: lockdefs.h:42
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1035
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1141
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:794
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1105
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1051
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
Definition: nbtpage.c:828
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:260
static bool BTPageIsRecyclable(Page page)
Definition: nbtree.h:291
#define BT_WRITE
Definition: nbtree.h:721
short access
Definition: preproc-type.c:36
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:648

References _bt_checkpage(), _bt_conditionallockbuf(), _bt_lockbuf(), _bt_log_reuse_page(), _bt_pageinit(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, elog(), ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockRelationForExtension(), P_NEW, PageIsNew(), ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_metaversion(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().
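
There are two calling modes, sketched below. Reading an existing block takes a real blkno plus BT_READ or BT_WRITE; allocating a fresh page passes the special P_NEW block number, which requires BT_WRITE (see the Assert above):

    /* read an existing block, returning it pinned and locked */
    buf = _bt_getbuf(rel, blkno, BT_READ);
    /* ... use the page ... */
    _bt_relbuf(rel, buf);

    /* obtain a new page, recycled from the FSM when possible */
    newbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);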

◆ _bt_getmeta()

static BTMetaPageData * _bt_getmeta ( Relation  rel,
Buffer  metabuf 
)
static

Definition at line 144 of file nbtpage.c.

145 {
146  Page metapg;
147  BTPageOpaque metaopaque;
148  BTMetaPageData *metad;
149 
150  metapg = BufferGetPage(metabuf);
151  metaopaque = BTPageGetOpaque(metapg);
152  metad = BTPageGetMeta(metapg);
153 
154  /* sanity-check the metapage */
155  if (!P_ISMETA(metaopaque) ||
156  metad->btm_magic != BTREE_MAGIC)
157  ereport(ERROR,
158  (errcode(ERRCODE_INDEX_CORRUPTED),
159  errmsg("index \"%s\" is not a btree",
160  RelationGetRelationName(rel))));
161 
162  if (metad->btm_version < BTREE_MIN_VERSION ||
163  metad->btm_version > BTREE_VERSION)
164  ereport(ERROR,
165  (errcode(ERRCODE_INDEX_CORRUPTED),
166  errmsg("version mismatch in index \"%s\": file version %d, "
167  "current version %d, minimal supported version %d",
168  RelationGetRelationName(rel),
169  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
170 
171  return metad;
172 }
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define P_ISMETA(opaque)
Definition: nbtree.h:223
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTREE_VERSION
Definition: nbtree.h:150
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105

References BTMetaPageData::btm_magic, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTREE_MAGIC, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), ereport, errcode(), errmsg(), ERROR, P_ISMETA, and RelationGetRelationName.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_metaversion().

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
int  access 
)

Definition at line 343 of file nbtpage.c.

344 {
345  Buffer metabuf;
346  Buffer rootbuf;
347  Page rootpage;
348  BTPageOpaque rootopaque;
349  BlockNumber rootblkno;
350  uint32 rootlevel;
351  BTMetaPageData *metad;
352 
353  /*
354  * Try to use previously-cached metapage data to find the root. This
355  * normally saves one buffer access per index search, which is a very
356  * helpful savings in bufmgr traffic and hence contention.
357  */
358  if (rel->rd_amcache != NULL)
359  {
360  metad = (BTMetaPageData *) rel->rd_amcache;
361  /* We shouldn't have cached it if any of these fail */
362  Assert(metad->btm_magic == BTREE_MAGIC);
363  Assert(metad->btm_version >= BTREE_MIN_VERSION);
364  Assert(metad->btm_version <= BTREE_VERSION);
365  Assert(!metad->btm_allequalimage ||
366  metad->btm_version > BTREE_NOVAC_VERSION);
367  Assert(metad->btm_root != P_NONE);
368 
369  rootblkno = metad->btm_fastroot;
370  Assert(rootblkno != P_NONE);
371  rootlevel = metad->btm_fastlevel;
372 
373  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
374  rootpage = BufferGetPage(rootbuf);
375  rootopaque = BTPageGetOpaque(rootpage);
376 
377  /*
378  * Since the cache might be stale, we check the page more carefully
379  * here than normal. We *must* check that it's not deleted. If it's
380  * not alone on its level, then we reject too --- this may be overly
381  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
382  * because that's not set in a "fast root".
383  */
384  if (!P_IGNORE(rootopaque) &&
385  rootopaque->btpo_level == rootlevel &&
386  P_LEFTMOST(rootopaque) &&
387  P_RIGHTMOST(rootopaque))
388  {
389  /* OK, accept cached page as the root */
390  return rootbuf;
391  }
392  _bt_relbuf(rel, rootbuf);
393  /* Cache is stale, throw it away */
394  if (rel->rd_amcache)
395  pfree(rel->rd_amcache);
396  rel->rd_amcache = NULL;
397  }
398 
399  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
400  metad = _bt_getmeta(rel, metabuf);
401 
402  /* if no root page initialized yet, do it */
403  if (metad->btm_root == P_NONE)
404  {
405  Page metapg;
406 
407  /* If access = BT_READ, caller doesn't want us to create root yet */
408  if (access == BT_READ)
409  {
410  _bt_relbuf(rel, metabuf);
411  return InvalidBuffer;
412  }
413 
414  /* trade in our read lock for a write lock */
415  _bt_unlockbuf(rel, metabuf);
416  _bt_lockbuf(rel, metabuf, BT_WRITE);
417 
418  /*
419  * Race condition: if someone else initialized the metadata between
420  * the time we released the read lock and acquired the write lock, we
421  * must avoid doing it again.
422  */
423  if (metad->btm_root != P_NONE)
424  {
425  /*
426  * Metadata initialized by someone else. In order to guarantee no
427  * deadlocks, we have to release the metadata page and start all
428  * over again. (Is that really true? But it's hardly worth trying
429  * to optimize this case.)
430  */
431  _bt_relbuf(rel, metabuf);
432  return _bt_getroot(rel, access);
433  }
434 
435  /*
436  * Get, initialize, write, and leave a lock of the appropriate type on
437  * the new root page. Since this is the first page in the tree, it's
438  * a leaf as well as the root.
439  */
440  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
441  rootblkno = BufferGetBlockNumber(rootbuf);
442  rootpage = BufferGetPage(rootbuf);
443  rootopaque = BTPageGetOpaque(rootpage);
444  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
445  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
446  rootopaque->btpo_level = 0;
447  rootopaque->btpo_cycleid = 0;
448  /* Get raw page pointer for metapage */
449  metapg = BufferGetPage(metabuf);
450 
451  /* NO ELOG(ERROR) till meta is updated */
452  START_CRIT_SECTION();
453 
454  /* upgrade metapage if needed */
455  if (metad->btm_version < BTREE_NOVAC_VERSION)
456  _bt_upgrademetapage(metapg);
457 
458  metad->btm_root = rootblkno;
459  metad->btm_level = 0;
460  metad->btm_fastroot = rootblkno;
461  metad->btm_fastlevel = 0;
462  metad->btm_last_cleanup_num_delpages = 0;
463  metad->btm_last_cleanup_num_heap_tuples = -1.0;
464 
465  MarkBufferDirty(rootbuf);
466  MarkBufferDirty(metabuf);
467 
468  /* XLOG stuff */
469  if (RelationNeedsWAL(rel))
470  {
471  xl_btree_newroot xlrec;
472  XLogRecPtr recptr;
473  xl_btree_metadata md;
474 
475  XLogBeginInsert();
476  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
477  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
478 
479  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
480  md.version = metad->btm_version;
481  md.root = rootblkno;
482  md.level = 0;
483  md.fastroot = rootblkno;
484  md.fastlevel = 0;
485  md.last_cleanup_num_delpages = 0;
486  md.allequalimage = metad->btm_allequalimage;
487 
488  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
489 
490  xlrec.rootblk = rootblkno;
491  xlrec.level = 0;
492 
493  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
494 
495  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
496 
497  PageSetLSN(rootpage, recptr);
498  PageSetLSN(metapg, recptr);
499  }
500 
501  END_CRIT_SECTION();
502 
503  /*
504  * swap root write lock for read lock. There is no danger of anyone
505  * else accessing the new root page while it's unlocked, since no one
506  * else knows where it is yet.
507  */
508  _bt_unlockbuf(rel, rootbuf);
509  _bt_lockbuf(rel, rootbuf, BT_READ);
510 
511  /* okay, metadata is correct, release lock on it without caching */
512  _bt_relbuf(rel, metabuf);
513  }
514  else
515  {
516  rootblkno = metad->btm_fastroot;
517  Assert(rootblkno != P_NONE);
518  rootlevel = metad->btm_fastlevel;
519 
520  /*
521  * Cache the metapage data for next time
522  */
523  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
524  sizeof(BTMetaPageData));
525  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
526 
527  /*
528  * We are done with the metapage; arrange to release it via first
529  * _bt_relandgetbuf call
530  */
531  rootbuf = metabuf;
532 
533  for (;;)
534  {
535  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
536  rootpage = BufferGetPage(rootbuf);
537  rootopaque = BTPageGetOpaque(rootpage);
538 
539  if (!P_IGNORE(rootopaque))
540  break;
541 
542  /* it's dead, Jim. step right one page */
543  if (P_RIGHTMOST(rootopaque))
544  elog(ERROR, "no live root page found in index \"%s\"",
545  RelationGetRelationName(rel));
546  rootblkno = rootopaque->btpo_next;
547  }
548 
549  if (rootopaque->btpo_level != rootlevel)
550  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
551  rootblkno, RelationGetRelationName(rel),
552  rootopaque->btpo_level, rootlevel);
553  }
554 
555  /*
556  * By here, we have a pin and read lock on the root page, and no lock set
557  * on the metadata page. Return the root page's buffer.
558  */
559  return rootbuf;
560 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBuffer
Definition: buf.h:25
unsigned int uint32
Definition: c.h:490
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1005
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1015
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:109
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:343
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:144
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:871
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1082
#define BTP_LEAF
Definition: nbtree.h:76
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTP_ROOT
Definition: nbtree.h:77
#define P_NONE
Definition: nbtree.h:212
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_READ
Definition: nbtree.h:720
#define P_IGNORE(opaque)
Definition: nbtree.h:225
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:339
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
BlockNumber btm_root
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint32 btpo_level
Definition: nbtree.h:66
void * rd_amcache
Definition: rel.h:227
MemoryContext rd_indexcxt
Definition: rel.h:202
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:336
BlockNumber rootblk
Definition: nbtxlog.h:335
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33

References _bt_getbuf(), _bt_getmeta(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog(), END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), and _bt_search().
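
A sketch of the caller's contract, as in _bt_search(): with access = BT_READ the function returns InvalidBuffer rather than creating a root for an empty index, so callers must check the result:

    Buffer      rootbuf;

    rootbuf = _bt_getroot(rel, BT_READ);
    if (!BufferIsValid(rootbuf))
        return NULL;            /* empty index: no root page exists yet */

    /* rootbuf is pinned and read-locked; the metapage is no longer locked */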

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 672 of file nbtpage.c.

673 {
674  BTMetaPageData *metad;
675 
676  if (rel->rd_amcache == NULL)
677  {
678  Buffer metabuf;
679 
680  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
681  metad = _bt_getmeta(rel, metabuf);
682 
683  /*
684  * If there's no root page yet, _bt_getroot() doesn't expect a cache
685  * to be made, so just stop here and report the index height is zero.
686  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
687  */
688  if (metad->btm_root == P_NONE)
689  {
690  _bt_relbuf(rel, metabuf);
691  return 0;
692  }
693 
694  /*
695  * Cache the metapage data for next time
696  */
697  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
698  sizeof(BTMetaPageData));
699  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
700  _bt_relbuf(rel, metabuf);
701  }
702 
703  /* Get cached page */
704  metad = (BTMetaPageData *) rel->rd_amcache;
705  /* We shouldn't have cached it if any of these fail */
706  Assert(metad->btm_magic == BTREE_MAGIC);
707  Assert(metad->btm_version >= BTREE_MIN_VERSION);
708  Assert(metad->btm_version <= BTREE_VERSION);
709  Assert(!metad->btm_allequalimage ||
710  metad->btm_version > BTREE_NOVAC_VERSION);
711  Assert(metad->btm_fastroot != P_NONE);
712 
713  return metad->btm_fastlevel;
714 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and get_relation_info().

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 577 of file nbtpage.c.

578 {
579  Buffer metabuf;
580  Page metapg;
581  BTPageOpaque metaopaque;
582  Buffer rootbuf;
583  Page rootpage;
584  BTPageOpaque rootopaque;
585  BlockNumber rootblkno;
586  uint32 rootlevel;
587  BTMetaPageData *metad;
588 
589  /*
590  * We don't try to use cached metapage data here, since (a) this path is
591  * not performance-critical, and (b) if we are here it suggests our cache
592  * is out-of-date anyway. In light of point (b), it's probably safest to
593  * actively flush any cached metapage info.
594  */
595  if (rel->rd_amcache)
596  pfree(rel->rd_amcache);
597  rel->rd_amcache = NULL;
598 
599  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
600  metapg = BufferGetPage(metabuf);
601  metaopaque = BTPageGetOpaque(metapg);
602  metad = BTPageGetMeta(metapg);
603 
604  if (!P_ISMETA(metaopaque) ||
605  metad->btm_magic != BTREE_MAGIC)
606  ereport(ERROR,
607  (errcode(ERRCODE_INDEX_CORRUPTED),
608  errmsg("index \"%s\" is not a btree",
609  RelationGetRelationName(rel))));
610 
611  if (metad->btm_version < BTREE_MIN_VERSION ||
612  metad->btm_version > BTREE_VERSION)
613  ereport(ERROR,
614  (errcode(ERRCODE_INDEX_CORRUPTED),
615  errmsg("version mismatch in index \"%s\": file version %d, "
616  "current version %d, minimal supported version %d",
617  RelationGetRelationName(rel),
618  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
619 
620  /* if no root page initialized yet, fail */
621  if (metad->btm_root == P_NONE)
622  {
623  _bt_relbuf(rel, metabuf);
624  return InvalidBuffer;
625  }
626 
627  rootblkno = metad->btm_root;
628  rootlevel = metad->btm_level;
629 
630  /*
631  * We are done with the metapage; arrange to release it via first
632  * _bt_relandgetbuf call
633  */
634  rootbuf = metabuf;
635 
636  for (;;)
637  {
638  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
639  rootpage = BufferGetPage(rootbuf);
640  rootopaque = BTPageGetOpaque(rootpage);
641 
642  if (!P_IGNORE(rootopaque))
643  break;
644 
645  /* it's dead, Jim. step right one page */
646  if (P_RIGHTMOST(rootopaque))
647  elog(ERROR, "no live root page found in index \"%s\"",
648  RelationGetRelationName(rel));
649  rootblkno = rootopaque->btpo_next;
650  }
651 
652  if (rootopaque->btpo_level != rootlevel)
653  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
654  rootblkno, RelationGetRelationName(rel),
655  rootopaque->btpo_level, rootlevel);
656 
657  return rootbuf;
658 }

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog(), ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 69 of file nbtpage.c.

71 {
72  BTMetaPageData *metad;
73  BTPageOpaque metaopaque;
74 
75  _bt_pageinit(page, BLCKSZ);
76 
77  metad = BTPageGetMeta(page);
78  metad->btm_magic = BTREE_MAGIC;
79  metad->btm_version = BTREE_VERSION;
80  metad->btm_root = rootbknum;
81  metad->btm_level = level;
82  metad->btm_fastroot = rootbknum;
83  metad->btm_fastlevel = level;
84  metad->btm_last_cleanup_num_delpages = 0;
85  metad->btm_last_cleanup_num_heap_tuples = -1.0;
86  metad->btm_allequalimage = allequalimage;
87 
88  metaopaque = BTPageGetOpaque(page);
89  metaopaque->btpo_flags = BTP_META;
90 
91  /*
92  * Set pd_lower just past the end of the metadata. This is essential,
93  * because without doing so, metadata will be lost if xlog.c compresses
94  * the page.
95  */
96  ((PageHeader) page)->pd_lower =
97  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
98 }
PageHeaderData * PageHeader
Definition: bufpage.h:170
#define BTP_META
Definition: nbtree.h:79

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
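
A simplified sketch of how btbuildempty() uses this to set up the metapage of an empty index (allocation and write-out details vary by version; the key point is that an empty index has root block P_NONE at level 0):

    Page        metapage = (Page) palloc(BLCKSZ);

    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
    /* ... write the page out as block BTREE_METAPAGE of the index ... */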

◆ _bt_leftsib_splitflag()

static bool _bt_leftsib_splitflag ( Relation  rel,
BlockNumber  leftsib,
BlockNumber  target 
)
static

Definition at line 1709 of file nbtpage.c.

1710 {
1711  Buffer buf;
1712  Page page;
1713  BTPageOpaque opaque;
1714  bool result;
1715 
1716  /* Easy case: No left sibling */
1717  if (leftsib == P_NONE)
1718  return false;
1719 
1720  buf = _bt_getbuf(rel, leftsib, BT_READ);
1721  page = BufferGetPage(buf);
1722  opaque = BTPageGetOpaque(page);
1723 
1724  /*
1725  * If the left sibling was concurrently split, so that its next-pointer
1726  * doesn't point to the current page anymore, the split that created
1727  * target must be completed. Caller can reasonably expect that there will
1728  * be a downlink to the target page that it can relocate using its stack.
1729  * (We don't allow splitting an incompletely split page again until the
1730  * previous split has been completed.)
1731  */
1732  result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1733  _bt_relbuf(rel, buf);
1734 
1735  return result;
1736 }
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetPage(), P_INCOMPLETE_SPLIT, and P_NONE.

Referenced by _bt_lock_subtree_parent(), and _bt_pagedel().

◆ _bt_lock_subtree_parent()

static bool _bt_lock_subtree_parent ( Relation  rel,
BlockNumber  child,
BTStack  stack,
Buffer subtreeparent,
OffsetNumber poffset,
BlockNumber topparent,
BlockNumber topparentrightsib 
)
static

Definition at line 2776 of file nbtpage.c.

2779 {
2780  BlockNumber parent,
2781  leftsibparent;
2782  OffsetNumber parentoffset,
2783  maxoff;
2784  Buffer pbuf;
2785  Page page;
2786  BTPageOpaque opaque;
2787 
2788  /*
2789  * Locate the pivot tuple whose downlink points to "child". Write lock
2790  * the parent page itself.
2791  */
2792  pbuf = _bt_getstackbuf(rel, stack, child);
2793  if (pbuf == InvalidBuffer)
2794  {
2795  /*
2796  * Failed to "re-find" a pivot tuple whose downlink matched our child
2797  * block number on the parent level -- the index must be corrupt.
2798  * Don't even try to delete the leafbuf subtree. Just report the
2799  * issue and press on with vacuuming the index.
2800  *
2801  * Note: _bt_getstackbuf() recovers from concurrent page splits that
2802  * take place on the parent level. Its approach is a near-exhaustive
2803  * linear search. This also gives it a surprisingly good chance of
2804  * recovering in the event of a buggy or inconsistent opclass. But we
2805  * don't rely on that here.
2806  */
2807  ereport(LOG,
2808  (errcode(ERRCODE_INDEX_CORRUPTED),
2809  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2810  RelationGetRelationName(rel), child)));
2811  return false;
2812  }
2813 
2814  parent = stack->bts_blkno;
2815  parentoffset = stack->bts_offset;
2816 
2817  page = BufferGetPage(pbuf);
2818  opaque = BTPageGetOpaque(page);
2819  maxoff = PageGetMaxOffsetNumber(page);
2820  leftsibparent = opaque->btpo_prev;
2821 
2822  /*
2823  * _bt_getstackbuf() completes page splits on returned parent buffer when
2824  * required.
2825  *
2826  * In general it's a bad idea for VACUUM to use up more disk space, which
2827  * is why page deletion does not finish incomplete page splits most of the
2828  * time. We allow this limited exception because the risk is much lower,
2829  * and the potential downside of not proceeding is much higher: A single
2830  * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2831  * prevent us from deleting hundreds of empty leaf pages from one level
2832  * down.
2833  */
2834  Assert(!P_INCOMPLETE_SPLIT(opaque));
2835 
2836  if (parentoffset < maxoff)
2837  {
2838  /*
2839  * Child is not the rightmost child in parent, so it's safe to delete
2840  * the subtree whose root/topparent is child page
2841  */
2842  *subtreeparent = pbuf;
2843  *poffset = parentoffset;
2844  return true;
2845  }
2846 
2847  /*
2848  * Child is the rightmost child of parent.
2849  *
2850  * Since it's the rightmost child of parent, deleting the child (or
2851  * deleting the subtree whose root/topparent is the child page) is only
2852  * safe when it's also possible to delete the parent.
2853  */
2854  Assert(parentoffset == maxoff);
2855  if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2856  {
2857  /*
2858  * Child isn't parent's only child, or parent is rightmost on its
2859  * entire level. Definitely cannot delete any pages.
2860  */
2861  _bt_relbuf(rel, pbuf);
2862  return false;
2863  }
2864 
2865  /*
2866  * Now make sure that the parent deletion is itself safe by examining the
2867  * child's grandparent page. Recurse, passing the parent page as the
2868  * child page (child's grandparent is the parent on the next level up). If
2869  * parent deletion is unsafe, then child deletion must also be unsafe (in
2870  * which case caller cannot delete any pages at all).
2871  */
2872  *topparent = parent;
2873  *topparentrightsib = opaque->btpo_next;
2874 
2875  /*
2876  * Release lock on parent before recursing.
2877  *
2878  * It's OK to release page locks on parent before recursive call locks
2879  * grandparent. An internal page can only acquire an entry if the child
2880  * is split, but that cannot happen as long as we still hold a lock on the
2881  * leafbuf page.
2882  */
2883  _bt_relbuf(rel, pbuf);
2884 
2885  /*
2886  * Before recursing, check that the left sibling of parent (if any) is not
2887  * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2888  * parent lock).
2889  *
2890  * Note: We deliberately avoid completing incomplete splits here.
2891  */
2892  if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2893  return false;
2894 
2895  /* Recurse to examine child page's grandparent page */
2896  return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2897  subtreeparent, poffset,
2898  topparent, topparentrightsib);
2899 }
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
#define LOG
Definition: elog.h:31
Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2307
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1709
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
Definition: nbtpage.c:2776
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
BlockNumber bts_blkno
Definition: nbtree.h:735
struct BTStackData * bts_parent
Definition: nbtree.h:737
OffsetNumber bts_offset
Definition: nbtree.h:736

References _bt_getstackbuf(), _bt_leftsib_splitflag(), _bt_relbuf(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetPage(), ereport, errcode(), errmsg_internal(), InvalidBuffer, LOG, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_mark_page_halfdead().
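
The recursion bottoms out at the first ancestor whose downlink to the subtree is not the rightmost entry on its page. The following sketch is a hypothetical caller modeled on _bt_mark_page_halfdead() below, not code from nbtpage.c; the demo_ function name is invented for illustration.

static bool
demo_first_phase(Relation rel, Buffer leafbuf, BTStack stack)
{
	Page		page = BufferGetPage(leafbuf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
	/* start with the degenerate one-page subtree: leafbuf is top parent */
	BlockNumber topparent = leafblkno;
	BlockNumber topparentrightsib = opaque->btpo_next;
	Buffer		subtreeparent;
	OffsetNumber poffset;

	if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
								 &subtreeparent, &poffset,
								 &topparent, &topparentrightsib))
		return false;			/* deletion unsafe; no parent-level locks held */

	/*
	 * subtreeparent is now write-locked, and poffset is the offset of the
	 * pivot tuple whose downlink points to topparent (the subtree root).
	 */
	_bt_relbuf(rel, subtreeparent);
	return true;
}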

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1051 of file nbtpage.c.

1052 {
1053  /* LockBuffer() asserts that pin is held by this backend */
1054  LockBuffer(buf, access);
1055 
1056  /*
1057  * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1058  * an nbtree error (e.g. a unique violation error). That won't cause
1059  * Valgrind false positives.
1060  *
1061  * The nbtree client requests are superimposed on top of the bufmgr.c
1062  * buffer pin client requests. In the event of an nbtree error the buffer
1063  * will certainly get marked as defined when the backend once again
1064  * acquires its first pin on the buffer. (Of course, if the backend never
1065  * touches the buffer again then it doesn't matter that it remains
1066  * non-accessible to Valgrind.)
1067  *
1068  * Note: When an IndexTuple C pointer gets computed using an ItemId read
1069  * from a page while a lock was held, the C pointer becomes unsafe to
1070  * dereference forever as soon as the lock is released. Valgrind can only
1071  * detect cases where the pointer gets dereferenced with no _current_
1072  * lock/pin held, though.
1073  */
1074  if (!RelationUsesLocalBuffers(rel))
1075  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1076 }
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4246

References buf, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readnextpage(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().
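
Together with _bt_unlockbuf(), this supports the common nbtree pattern of trading lock strength on an already-pinned buffer, as _bt_set_cleanup_info() does below. A minimal sketch, assuming buf is pinned and read-locked on entry (the demo_ helper is illustrative, not part of nbtpage.c):

static void
demo_upgrade_lock(Relation rel, Buffer buf)
{
	/* trade in our read lock for a write lock; the pin is kept throughout */
	_bt_unlockbuf(rel, buf);
	_bt_lockbuf(rel, buf, BT_WRITE);

	/*
	 * The page may have changed while it was unlocked: re-derive pointers
	 * and re-check any conditions established under the old lock.
	 */
}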

◆ _bt_log_reuse_page()

static void _bt_log_reuse_page ( Relation  rel,
BlockNumber  blkno,
FullTransactionId  safexid 
)
static

Definition at line 828 of file nbtpage.c.

829 {
830  xl_btree_reuse_page xlrec_reuse;
831 
832  /*
833  * Note that we don't register the buffer with the record, because this
834  * operation doesn't modify the page. This record only exists to provide a
835  * conflict point for Hot Standby.
836  */
837 
838  /* XLOG stuff */
839  xlrec_reuse.locator = rel->rd_locator;
840  xlrec_reuse.block = blkno;
841  xlrec_reuse.snapshotConflictHorizon = safexid;
842 
843  XLogBeginInsert();
844  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
845 
846  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
847 }
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:190
RelFileLocator rd_locator
Definition: rel.h:56
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186

References xl_btree_reuse_page::block, xl_btree_reuse_page::locator, RelationData::rd_locator, SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), and XLogRegisterData().

Referenced by _bt_getbuf().

◆ _bt_mark_page_halfdead()

static bool _bt_mark_page_halfdead ( Relation  rel,
Buffer  leafbuf,
BTStack  stack 
)
static

Definition at line 2087 of file nbtpage.c.

2088 {
2089  BlockNumber leafblkno;
2090  BlockNumber leafrightsib;
2091  BlockNumber topparent;
2092  BlockNumber topparentrightsib;
2093  ItemId itemid;
2094  Page page;
2095  BTPageOpaque opaque;
2096  Buffer subtreeparent;
2097  OffsetNumber poffset;
2098  OffsetNumber nextoffset;
2099  IndexTuple itup;
2100  IndexTupleData trunctuple;
2101 
2102  page = BufferGetPage(leafbuf);
2103  opaque = BTPageGetOpaque(page);
2104 
2105  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2106  P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2107  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2108 
2109  /*
2110  * Save info about the leaf page.
2111  */
2112  leafblkno = BufferGetBlockNumber(leafbuf);
2113  leafrightsib = opaque->btpo_next;
2114 
2115  /*
2116  * Before attempting to lock the parent page, check that the right sibling
2117  * is not in half-dead state. A half-dead right sibling would have no
2118  * downlink in the parent, which would be highly confusing later when we
2119  * delete the downlink. It would fail the "right sibling of target page
2120  * is also the next child in parent page" cross-check below.
2121  */
2122  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2123  {
2124  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2125  leafblkno, leafrightsib);
2126  return false;
2127  }
2128 
2129  /*
2130  * We cannot delete a page that is the rightmost child of its immediate
2131  * parent, unless it is the only child --- in which case the parent has to
2132  * be deleted too, and the same condition applies recursively to it. We
2133  * have to check this condition all the way up before trying to delete,
2134  * and lock the parent of the root of the to-be-deleted subtree (the
2135  * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2136  * for us. We remove the downlink to the "top parent" page (subtree root
2137  * page) from the subtree parent page below.
2138  *
2139  * Initialize topparent to be leafbuf page now. The final to-be-deleted
2140  * subtree is often a degenerate one page subtree consisting only of the
2141  * leafbuf page. When that happens, the leafbuf page is the final subtree
2142  * root page/top parent page.
2143  */
2144  topparent = leafblkno;
2145  topparentrightsib = leafrightsib;
2146  if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
2147  &subtreeparent, &poffset,
2148  &topparent, &topparentrightsib))
2149  return false;
2150 
2151  /*
2152  * Check that the parent-page index items we're about to delete/overwrite
2153  * in subtree parent page contain what we expect. This can fail if the
2154  * index has become corrupt for some reason. We want to throw any error
2155  * before entering the critical section --- otherwise it'd be a PANIC.
2156  */
2157  page = BufferGetPage(subtreeparent);
2158  opaque = BTPageGetOpaque(page);
2159 
2160 #ifdef USE_ASSERT_CHECKING
2161 
2162  /*
2163  * This is just an assertion because _bt_lock_subtree_parent should have
2164  * guaranteed tuple has the expected contents
2165  */
2166  itemid = PageGetItemId(page, poffset);
2167  itup = (IndexTuple) PageGetItem(page, itemid);
2168  Assert(BTreeTupleGetDownLink(itup) == topparent);
2169 #endif
2170 
2171  nextoffset = OffsetNumberNext(poffset);
2172  itemid = PageGetItemId(page, nextoffset);
2173  itup = (IndexTuple) PageGetItem(page, itemid);
2174  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2175  ereport(ERROR,
2176  (errcode(ERRCODE_INDEX_CORRUPTED),
2177  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2178  topparentrightsib, topparent,
2179  BTreeTupleGetDownLink(itup),
2180  BufferGetBlockNumber(subtreeparent),
2181  RelationGetRelationName(rel))));
2182 
2183  /*
2184  * Any insert which would have gone on the leaf block will now go to its
2185  * right sibling. In other words, the key space moves right.
2186  */
2187  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2188 
2189  /* No ereport(ERROR) until changes are logged */
2190  START_CRIT_SECTION();
2191 
2192  /*
2193  * Update parent of subtree. We want to delete the downlink to the top
2194  * parent page/root of the subtree, and the *following* key. Easiest way
2195  * is to copy the right sibling's downlink over the downlink that points
2196  * to top parent page, and then delete the right sibling's original pivot
2197  * tuple.
2198  *
2199  * Lanin and Shasha make the key space move left when deleting a page,
2200  * whereas the key space moves right here. That's why we cannot simply
2201  * delete the pivot tuple with the downlink to the top parent page. See
2202  * nbtree/README.
2203  */
2204  page = BufferGetPage(subtreeparent);
2205  opaque = BTPageGetOpaque(page);
2206 
2207  itemid = PageGetItemId(page, poffset);
2208  itup = (IndexTuple) PageGetItem(page, itemid);
2209  BTreeTupleSetDownLink(itup, topparentrightsib);
2210 
2211  nextoffset = OffsetNumberNext(poffset);
2212  PageIndexTupleDelete(page, nextoffset);
2213 
2214  /*
2215  * Mark the leaf page as half-dead, and stamp it with a link to the top
2216  * parent page. When the leaf page is also the top parent page, the link
2217  * is set to InvalidBlockNumber.
2218  */
2219  page = BufferGetPage(leafbuf);
2220  opaque = BTPageGetOpaque(page);
2221  opaque->btpo_flags |= BTP_HALF_DEAD;
2222 
2223  Assert(PageGetMaxOffsetNumber(page) == 1);
2224  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2225  trunctuple.t_info = sizeof(IndexTupleData);
2226  if (topparent != leafblkno)
2227  BTreeTupleSetTopParent(&trunctuple, topparent);
2228  else
2229  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2230 
2231  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2232  IndexTupleSize(&trunctuple)))
2233  elog(ERROR, "could not overwrite high key in half-dead page");
2234 
2235  /* Must mark buffers dirty before XLogInsert */
2236  MarkBufferDirty(subtreeparent);
2237  MarkBufferDirty(leafbuf);
2238 
2239  /* XLOG stuff */
2240  if (RelationNeedsWAL(rel))
2241  {
2242  xl_btree_mark_page_halfdead xlrec;
2243  XLogRecPtr recptr;
2244 
2245  xlrec.poffset = poffset;
2246  xlrec.leafblk = leafblkno;
2247  if (topparent != leafblkno)
2248  xlrec.topparent = topparent;
2249  else
2250  xlrec.topparent = InvalidBlockNumber;
2251 
2252  XLogBeginInsert();
2253  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2254  XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2255 
2256  page = BufferGetPage(leafbuf);
2257  opaque = BTPageGetOpaque(page);
2258  xlrec.leftblk = opaque->btpo_prev;
2259  xlrec.rightblk = opaque->btpo_next;
2260 
2261  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
2262 
2263  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2264 
2265  page = BufferGetPage(subtreeparent);
2266  PageSetLSN(page, recptr);
2267  page = BufferGetPage(leafbuf);
2268  PageSetLSN(page, recptr);
2269  }
2270 
2271  END_CRIT_SECTION();
2272 
2273  _bt_relbuf(rel, subtreeparent);
2274  return true;
2275 }
void PageIndexTupleDelete(Page page, OffsetNumber offnum)
Definition: bufpage.c:1052
#define MemSet(start, val, len)
Definition: c.h:1004
#define DEBUG1
Definition: elog.h:30
struct IndexTupleData IndexTupleData
static bool _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
Definition: nbtpage.c:1766
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTP_HALF_DEAD
Definition: nbtree.h:80
#define P_HIKEY
Definition: nbtree.h:368
static void BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
Definition: nbtree.h:627
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:563
#define P_ISROOT(opaque)
Definition: nbtree.h:221
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define SizeOfBtreeMarkPageHalfDead
Definition: nbtxlog.h:283
#define XLOG_BTREE_MARK_PAGE_HALFDEAD
Definition: nbtxlog.h:38
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3163
unsigned short t_info
Definition: itup.h:49

References _bt_lock_subtree_parent(), _bt_relbuf(), _bt_rightsib_halfdeadflag(), Assert(), BTP_HALF_DEAD, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), BTreeTupleSetDownLink(), BTreeTupleSetTopParent(), BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog(), END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, IndexTupleSize, InvalidBlockNumber, xl_btree_mark_page_halfdead::leafblk, xl_btree_mark_page_halfdead::leftblk, MarkBufferDirty(), MemSet, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageIndexTupleDelete(), PageIndexTupleOverwrite(), PageSetLSN(), xl_btree_mark_page_halfdead::poffset, PredicateLockPageCombine(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_mark_page_halfdead::rightblk, SizeOfBtreeMarkPageHalfDead, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_mark_page_halfdead::topparent, XLOG_BTREE_MARK_PAGE_HALFDEAD, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_pagedel().

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool heapkeyspace,
bool allequalimage 
)

Definition at line 736 of file nbtpage.c.

737 {
738  BTMetaPageData *metad;
739 
740  if (rel->rd_amcache == NULL)
741  {
742  Buffer metabuf;
743 
744  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
745  metad = _bt_getmeta(rel, metabuf);
746 
747  /*
748  * If there's no root page yet, _bt_getroot() doesn't expect a cache
749  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
750  * be changed to allow this case.)
751  */
752  if (metad->btm_root == P_NONE)
753  {
754  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
755  *allequalimage = metad->btm_allequalimage;
756 
757  _bt_relbuf(rel, metabuf);
758  return;
759  }
760 
761  /*
762  * Cache the metapage data for next time
763  *
764  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
765  * can change the nbtree version for an index without invalidating any
766  * local cache. This is okay because it can only happen when moving
767  * from version 2 to version 3, both of which are !heapkeyspace
768  * versions.
769  */
770  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
771  sizeof(BTMetaPageData));
772  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
773  _bt_relbuf(rel, metabuf);
774  }
775 
776  /* Get cached page */
777  metad = (BTMetaPageData *) rel->rd_amcache;
778  /* We shouldn't have cached it if any of these fail */
779  Assert(metad->btm_magic == BTREE_MAGIC);
780  Assert(metad->btm_version >= BTREE_MIN_VERSION);
781  Assert(metad->btm_version <= BTREE_VERSION);
782  Assert(!metad->btm_allequalimage ||
783  metad->btm_version > BTREE_NOVAC_VERSION);
784  Assert(metad->btm_fastroot != P_NONE);
785 
786  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
787  *allequalimage = metad->btm_allequalimage;
788 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_internal().
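
A minimal caller-side sketch (modeled on the callers above; the demo_ wrapper and comments are illustrative):

static void
demo_metaversion(Relation rel)
{
	bool		heapkeyspace;
	bool		allequalimage;

	/* first call reads the metapage; later calls hit rel->rd_amcache */
	_bt_metaversion(rel, &heapkeyspace, &allequalimage);

	if (heapkeyspace)
	{
		/* version 4+ index: heap TID acts as a tiebreaker key column */
	}
	if (allequalimage)
	{
		/* every column is safe for deduplication */
	}
}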

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState vstate 
)

Definition at line 1816 of file nbtpage.c.

1817 {
1818  BlockNumber rightsib;
1819  bool rightsib_empty;
1820  Page page;
1821  BTPageOpaque opaque;
1822 
1823  /*
1824  * Save original leafbuf block number from caller. Only deleted blocks
1825  * that are <= scanblkno are added to bulk delete stat's pages_deleted
1826  * count.
1827  */
1828  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1829 
1830  /*
1831  * "stack" is a search stack leading (approximately) to the target page.
1832  * It is initially NULL, but when iterating, we keep it to avoid
1833  * duplicated search effort.
1834  *
1835  * Also, when "stack" is not NULL, we have already checked that the
1836  * current page is not the right half of an incomplete split, i.e. the
1837  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1838  * when the current target page is to the right of caller's initial page
1839  * (the scanblkno page).
1840  */
1841  BTStack stack = NULL;
1842 
1843  for (;;)
1844  {
1845  page = BufferGetPage(leafbuf);
1846  opaque = BTPageGetOpaque(page);
1847 
1848  /*
1849  * Internal pages are never deleted directly, only as part of deleting
1850  * the whole subtree all the way down to leaf level.
1851  *
1852  * Also check for deleted pages here. Caller never passes us a fully
1853  * deleted page. Only VACUUM can delete pages, so there can't have
1854  * been a concurrent deletion. Assume that we reached any deleted
1855  * page encountered here by following a sibling link, and that the
1856  * index is corrupt.
1857  */
1858  Assert(!P_ISDELETED(opaque));
1859  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1860  {
1861  /*
1862  * Pre-9.4 page deletion only marked internal pages as half-dead,
1863  * but now we only use that flag on leaf pages. The old algorithm
1864  * was never supposed to leave half-dead pages in the tree, it was
1865  * just a transient state, but it was nevertheless possible in
1866  * error scenarios. We don't know how to deal with them here. They
1867  * are harmless as far as searches are considered, but inserts
1868  * into the deleted keyspace could add out-of-order downlinks in
1869  * the upper levels. Log a notice, hopefully the admin will notice
1870  * and reindex.
1871  */
1872  if (P_ISHALFDEAD(opaque))
1873  ereport(LOG,
1874  (errcode(ERRCODE_INDEX_CORRUPTED),
1875  errmsg("index \"%s\" contains a half-dead internal page",
1876  RelationGetRelationName(rel)),
1877  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1878 
1879  if (P_ISDELETED(opaque))
1880  ereport(LOG,
1881  (errcode(ERRCODE_INDEX_CORRUPTED),
1882  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1883  BufferGetBlockNumber(leafbuf),
1884  scanblkno,
1885  RelationGetRelationName(rel))));
1886 
1887  _bt_relbuf(rel, leafbuf);
1888  return;
1889  }
1890 
1891  /*
1892  * We can never delete rightmost pages nor root pages. While at it,
1893  * check that page is empty, since it's possible that the leafbuf page
1894  * was empty a moment ago, but has since had some inserts.
1895  *
1896  * To keep the algorithm simple, we also never delete an incompletely
1897  * split page (they should be rare enough that this doesn't make any
1898  * meaningful difference to disk usage):
1899  *
1900  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1901  * left half of an incomplete split, but ensuring that it's not the
1902  * right half is more complicated. For that, we have to check that
1903  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1904  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1905  * release the lock on scanblkno/leafbuf, check the left sibling, and
1906  * construct a search stack to scanblkno. On subsequent iterations,
1907  * we know we stepped right from a page that passed these tests, so
1908  * it's OK.
1909  */
1910  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1911  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1912  P_INCOMPLETE_SPLIT(opaque))
1913  {
1914  /* Should never fail to delete a half-dead page */
1915  Assert(!P_ISHALFDEAD(opaque));
1916 
1917  _bt_relbuf(rel, leafbuf);
1918  return;
1919  }
1920 
1921  /*
1922  * First, remove downlink pointing to the page (or a parent of the
1923  * page, if we are going to delete a taller subtree), and mark the
1924  * leafbuf page half-dead
1925  */
1926  if (!P_ISHALFDEAD(opaque))
1927  {
1928  /*
1929  * We need an approximate pointer to the page's parent page. We
1930  * use a variant of the standard search mechanism to search for
1931  * the page's high key; this will give us a link to either the
1932  * current parent or someplace to its left (if there are multiple
1933  * equal high keys, which is possible with !heapkeyspace indexes).
1934  *
1935  * Also check if this is the right-half of an incomplete split
1936  * (see comment above).
1937  */
1938  if (!stack)
1939  {
1940  BTScanInsert itup_key;
1941  ItemId itemid;
1942  IndexTuple targetkey;
1943  BlockNumber leftsib,
1944  leafblkno;
1945  Buffer sleafbuf;
1946 
1947  itemid = PageGetItemId(page, P_HIKEY);
1948  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1949 
1950  leftsib = opaque->btpo_prev;
1951  leafblkno = BufferGetBlockNumber(leafbuf);
1952 
1953  /*
1954  * To avoid deadlocks, we'd better drop the leaf page lock
1955  * before going further.
1956  */
1957  _bt_unlockbuf(rel, leafbuf);
1958 
1959  /*
1960  * Check that the left sibling of leafbuf (if any) is not
1961  * marked with INCOMPLETE_SPLIT flag before proceeding
1962  */
1963  Assert(leafblkno == scanblkno);
1964  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1965  {
1966  ReleaseBuffer(leafbuf);
1967  return;
1968  }
1969 
1970  /* we need an insertion scan key for the search, so build one */
1971  itup_key = _bt_mkscankey(rel, targetkey);
1972  /* find the leftmost leaf page with matching pivot/high key */
1973  itup_key->pivotsearch = true;
1974  stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1975  /* won't need a second lock or pin on leafbuf */
1976  _bt_relbuf(rel, sleafbuf);
1977 
1978  /*
1979  * Re-lock the leaf page, and start over to use our stack
1980  * within _bt_mark_page_halfdead. We must do it that way
1981  * because it's possible that leafbuf can no longer be
1982  * deleted. We need to recheck.
1983  *
1984  * Note: We can't simply hold on to the sleafbuf lock instead,
1985  * because it's barely possible that sleafbuf is not the same
1986  * page as leafbuf. This happens when leafbuf split after our
1987  * original lock was dropped, but before _bt_search finished
1988  * its descent. We rely on the assumption that we'll find
1989  * leafbuf isn't safe to delete anymore in this scenario.
1990  * (Page deletion can cope with the stack being to the left of
1991  * leafbuf, but not to the right of leafbuf.)
1992  */
1993  _bt_lockbuf(rel, leafbuf, BT_WRITE);
1994  continue;
1995  }
1996 
1997  /*
1998  * See if it's safe to delete the leaf page, and determine how
1999  * many parent/internal pages above the leaf level will be
2000  * deleted. If it's safe then _bt_mark_page_halfdead will also
2001  * perform the first phase of deletion, which includes marking the
2002  * leafbuf page half-dead.
2003  */
2004  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2005  if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
2006  {
2007  _bt_relbuf(rel, leafbuf);
2008  return;
2009  }
2010  }
2011 
2012  /*
2013  * Then unlink it from its siblings. Each call to
2014  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2015  * making it shallower. Iterate until the leafbuf page is deleted.
2016  */
2017  rightsib_empty = false;
2018  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2019  while (P_ISHALFDEAD(opaque))
2020  {
2021  /* Check for interrupts in _bt_unlink_halfdead_page */
2022  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2023  &rightsib_empty, vstate))
2024  {
2025  /*
2026  * _bt_unlink_halfdead_page should never fail, since we
2027  * established that deletion is generally safe in
2028  * _bt_mark_page_halfdead -- index must be corrupt.
2029  *
2030  * Note that _bt_unlink_halfdead_page already released the
2031  * lock and pin on leafbuf for us.
2032  */
2033  Assert(false);
2034  return;
2035  }
2036  }
2037 
2038  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2039 
2040  rightsib = opaque->btpo_next;
2041 
2042  _bt_relbuf(rel, leafbuf);
2043 
2044  /*
2045  * Check here, as calling loops will have locks held, preventing
2046  * interrupts from being processed.
2047  */
2048  CHECK_FOR_INTERRUPTS();
2049 
2050  /*
2051  * The page has now been deleted. If its right sibling is completely
2052  * empty, it's possible that the reason we haven't deleted it earlier
2053  * is that it was the rightmost child of the parent. Now that we
2054  * removed the downlink for this page, the right sibling might now be
2055  * the only child of the parent, and could be removed. It would be
2056  * picked up by the next vacuum anyway, but might as well try to
2057  * remove it now, so loop back to process the right sibling.
2058  *
2059  * Note: This relies on the assumption that _bt_getstackbuf() will be
2060  * able to reuse our original descent stack with a different child
2061  * block (provided that the child block is to the right of the
2062  * original leaf page reached by _bt_search()). It will even update
2063  * the descent stack each time we loop around, avoiding repeated work.
2064  */
2065  if (!rightsib_empty)
2066  break;
2067 
2068  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2069  }
2070 }
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2087
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2303
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:224
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:96
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
bool pivotsearch
Definition: nbtree.h:796

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), LOG, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), BTScanInsertData::pivotsearch, RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().
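
A sketch of the contract, loosely modeled on the btvacuumpage() call site (hypothetical simplification): the caller hands over a pinned, write-locked leaf page that appears to be empty, and _bt_pagedel() always consumes both the lock and the pin, whether or not any page is actually deleted.

static void
demo_try_delete(Relation rel, BlockNumber blkno, BTVacState *vstate)
{
	Buffer		leafbuf = _bt_getbuf(rel, blkno, BT_WRITE);

	/* lock and pin are released inside; nothing to clean up afterwards */
	_bt_pagedel(rel, leafbuf, vstate);
}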

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 1141 of file nbtpage.c.

1142 {
1143  PageInit(page, size, sizeof(BTPageOpaqueData));
1144 }
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42

References PageInit().

Referenced by _bt_blnewpage(), _bt_getbuf(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().
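
A minimal sketch of initializing a private page image, loosely modeled on _bt_blnewpage() from the Referenced-by list (the demo_ helper is illustrative):

static Page
demo_new_page(Size pagesize)
{
	Page		page = (Page) palloc(pagesize);

	/* standard page header plus room for BTPageOpaqueData special space */
	_bt_pageinit(page, pagesize);

	return page;
}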

◆ _bt_pendingfsm_add()

static void _bt_pendingfsm_add ( BTVacState vstate,
BlockNumber  target,
FullTransactionId  safexid 
)
static

Definition at line 3021 of file nbtpage.c.

3024 {
3025  Assert(vstate->npendingpages <= vstate->bufsize);
3026  Assert(vstate->bufsize <= vstate->maxbufsize);
3027 
3028 #ifdef USE_ASSERT_CHECKING
3029 
3030  /*
3031  * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3032  * array will always be in safexid order (since that is the order that we
3033  * save them in here)
3034  */
3035  if (vstate->npendingpages > 0)
3036  {
3037  FullTransactionId lastsafexid =
3038  vstate->pendingpages[vstate->npendingpages - 1].safexid;
3039 
3040  Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3041  }
3042 #endif
3043 
3044  /*
3045  * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3046  * information about this page.
3047  *
3048  * Note that this also covers the case where we opted to not use the
3049  * optimization in _bt_pendingfsm_init().
3050  */
3051  if (vstate->npendingpages == vstate->maxbufsize)
3052  return;
3053 
3054  /* Consider enlarging buffer */
3055  if (vstate->npendingpages == vstate->bufsize)
3056  {
3057  int newbufsize = vstate->bufsize * 2;
3058 
3059  /* Respect work_mem */
3060  if (newbufsize > vstate->maxbufsize)
3061  newbufsize = vstate->maxbufsize;
3062 
3063  vstate->bufsize = newbufsize;
3064  vstate->pendingpages =
3065  repalloc(vstate->pendingpages,
3066  sizeof(BTPendingFSM) * vstate->bufsize);
3067  }
3068 
3069  /* Save metadata for newly deleted page */
3070  vstate->pendingpages[vstate->npendingpages].target = target;
3071  vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3072  vstate->npendingpages++;
3073 }
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1456
FullTransactionId safexid
Definition: nbtree.h:328
BlockNumber target
Definition: nbtree.h:327
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
#define FullTransactionIdFollowsOrEquals(a, b)
Definition: transam.h:54

References Assert(), BTVacState::bufsize, FullTransactionIdFollowsOrEquals, BTVacState::maxbufsize, BTVacState::npendingpages, BTVacState::pendingpages, repalloc(), BTPendingFSM::safexid, and BTPendingFSM::target.

Referenced by _bt_unlink_halfdead_page().
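
The buffer grows geometrically but saturates at the work_mem-derived cap, after which further entries are silently dropped -- safe, because placing pages in the FSM here is only an optimization. A standalone sketch of that growth policy (names are illustrative):

static int
demo_grow_capacity(int bufsize, int maxbufsize)
{
	/* double on each overflow, e.g. 256 -> 512 -> 1024, up to the cap */
	int			newbufsize = bufsize * 2;

	if (newbufsize > maxbufsize)
		newbufsize = maxbufsize;

	return newbufsize;
}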

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState vstate 
)

Definition at line 2956 of file nbtpage.c.

2957 {
2958  IndexBulkDeleteResult *stats = vstate->stats;
2959 
2960  Assert(stats->pages_newly_deleted >= vstate->npendingpages);
2961 
2962  if (vstate->npendingpages == 0)
2963  {
2964  /* Just free memory when nothing to do */
2965  if (vstate->pendingpages)
2966  pfree(vstate->pendingpages);
2967 
2968  return;
2969  }
2970 
2971 #ifdef DEBUG_BTREE_PENDING_FSM
2972 
2973  /*
2974  * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
2975  * placing pending pages in the FSM. Note that the optimization will
2976  * never be effective without some other backend concurrently consuming an
2977  * XID.
2978  */
2979  pg_usleep(5000000L);
2980 #endif
2981 
2982  /*
2983  * Recompute VACUUM XID boundaries.
2984  *
2985  * We don't actually care about the oldest non-removable XID. Computing
2986  * the oldest such XID has a useful side-effect that we rely on: it
2987  * forcibly updates the XID horizon state for this backend. This step is
2988  * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
2989  * that it is now safe to recycle newly deleted pages without this step.
2990  */
2991  (void) GetOldestNonRemovableTransactionId(NULL);
2992 
2993  for (int i = 0; i < vstate->npendingpages; i++)
2994  {
2995  BlockNumber target = vstate->pendingpages[i].target;
2996  FullTransactionId safexid = vstate->pendingpages[i].safexid;
2997 
2998  /*
2999  * Do the equivalent of checking BTPageIsRecyclable(), but without
3000  * accessing the page again a second time.
3001  *
3002  * Give up on finding the first non-recyclable page -- all later pages
3003  * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3004  * to the array in safexid order.
3005  */
3006  if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
3007  break;
3008 
3009  RecordFreeIndexPage(rel, target);
3010  stats->pages_free++;
3011  }
3012 
3013  pfree(vstate->pendingpages);
3014 }
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:2034
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4300
void pg_usleep(long microsec)
Definition: signal.c:53
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BlockNumber pages_newly_deleted
Definition: genam.h:80
BlockNumber pages_free
Definition: genam.h:82

References Assert(), GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), i, BTVacState::npendingpages, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, BTVacState::pendingpages, pfree(), pg_usleep(), RecordFreeIndexPage(), BTPendingFSM::safexid, BTVacState::stats, and BTPendingFSM::target.

Referenced by btvacuumscan().

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState vstate,
bool  cleanuponly 
)

Definition at line 2915 of file nbtpage.c.

2916 {
2917  int64 maxbufsize;
2918 
2919  /*
2920  * Don't bother with optimization in cleanup-only case -- we don't expect
2921  * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2922  * can only take place because this optimization didn't work out during
2923  * the last VACUUM.
2924  */
2925  if (cleanuponly)
2926  return;
2927 
2928  /*
2929  * Cap maximum size of array so that we always respect work_mem. Avoid
2930  * int overflow here.
2931  */
2932  vstate->bufsize = 256;
2933  maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
2934  maxbufsize = Min(maxbufsize, INT_MAX);
2935  maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2936  /* Stay sane with small work_mem */
2937  maxbufsize = Max(maxbufsize, vstate->bufsize);
2938  vstate->maxbufsize = maxbufsize;
2939 
2940  /* Allocate buffer, indicate that there are currently 0 pending pages */
2941  vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2942  vstate->npendingpages = 0;
2943 }
#define Min(x, y)
Definition: c.h:988
#define Max(x, y)
Definition: c.h:982
int work_mem
Definition: globals.c:125
#define MaxAllocSize
Definition: memutils.h:40

References BTVacState::bufsize, Max, MaxAllocSize, BTVacState::maxbufsize, Min, BTVacState::npendingpages, palloc(), BTVacState::pendingpages, and work_mem.

Referenced by btvacuumscan().
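
For concreteness, a hedged worked example: assuming sizeof(BTPendingFSM) is 16 bytes (a BlockNumber plus a FullTransactionId, with padding), a work_mem of 4MB caps the array at (4096 * 1024) / 16 = 262144 entries. The sketch below mirrors the clamping sequence; the demo_ name and the explicit work_mem_kb parameter are illustrative.

static int64
demo_maxbufsize(int64 work_mem_kb, Size entry_size)
{
	int64		maxbufsize = (work_mem_kb * 1024) / entry_size;

	maxbufsize = Min(maxbufsize, INT_MAX);	/* avoid int overflow */
	maxbufsize = Min(maxbufsize, (int64) (MaxAllocSize / entry_size));	/* palloc limit */

	return Max(maxbufsize, 256);	/* never below the initial bufsize */
}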

◆ _bt_relandgetbuf()

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)

Definition at line 1015 of file nbtpage.c.

1016 {
1017  Buffer buf;
1018 
1019  Assert(blkno != P_NEW);
1020  if (BufferIsValid(obuf))
1021  _bt_unlockbuf(rel, obuf);
1022  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1023  _bt_lockbuf(rel, buf, access);
1024 
1025  _bt_checkpage(rel, buf);
1026  return buf;
1027 }
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:1694
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:233

References _bt_checkpage(), _bt_lockbuf(), _bt_unlockbuf(), Assert(), buf, BufferIsValid(), P_NEW, and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_moveright(), _bt_search(), _bt_stepright(), and _bt_walk_left().
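
A minimal sketch of the step-right idiom this function enables, modeled on _bt_stepright() from the Referenced-by list (the demo_ wrapper is illustrative):

static Buffer
demo_step_right(Relation rel, Buffer buf)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	BlockNumber rightsib = opaque->btpo_next;

	/* unlock buf, trade its pin for the right sibling's, lock that page */
	return _bt_relandgetbuf(rel, buf, rightsib, BT_READ);
}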

◆ _bt_relbuf()

void _bt_relbuf ( Relation  rel,
Buffer  buf 
)

Releases the lock and pin held on buf, by calling _bt_unlockbuf() followed by ReleaseBuffer().

References _bt_unlockbuf(), buf, and ReleaseBuffer().

◆ _bt_rightsib_halfdeadflag()

static bool _bt_rightsib_halfdeadflag ( Relation  rel,
BlockNumber  leafrightsib 
)
static

Definition at line 1766 of file nbtpage.c.

1767 {
1768  Buffer buf;
1769  Page page;
1770  BTPageOpaque opaque;
1771  bool result;
1772 
1773  Assert(leafrightsib != P_NONE);
1774 
1775  buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1776  page = BufferGetPage(buf);
1777  opaque = BTPageGetOpaque(page);
1778 
1779  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1780  result = P_ISHALFDEAD(opaque);
1781  _bt_relbuf(rel, buf);
1782 
1783  return result;
1784 }

References _bt_getbuf(), _bt_relbuf(), Assert(), BT_READ, BTPageGetOpaque, buf, BufferGetPage(), P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, and P_NONE.

Referenced by _bt_mark_page_halfdead().

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)

Definition at line 234 of file nbtpage.c.

235 {
236  Buffer metabuf;
237  Page metapg;
238  BTMetaPageData *metad;
239 
240  /*
241  * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
242  * field started out as a TransactionId field called btm_oldest_btpo_xact.
243  * Both "versions" are just uint32 fields. It was convenient to repurpose
244  * the field when we began to use 64-bit XIDs in deleted pages.
245  *
246  * It's possible that a pg_upgrade'd database will contain an XID value in
247  * what is now recognized as the metapage's btm_last_cleanup_num_delpages
248  * field. _bt_vacuum_needs_cleanup() may even believe that this value
249  * indicates that there are lots of pages that it needs to recycle, when
250  * in reality there are only one or two. The worst that can happen is
251  * that there will be a call to btvacuumscan a little earlier, which will
252  * set btm_last_cleanup_num_delpages to a sane value when we're called.
253  *
254  * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
255  * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
256  * to be consistent.
257  */
258  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
259  metapg = BufferGetPage(metabuf);
260  metad = BTPageGetMeta(metapg);
261 
262  /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
263  if (metad->btm_version >= BTREE_NOVAC_VERSION &&
264  metad->btm_last_cleanup_num_delpages == num_delpages)
265  {
266  /* Usually means index continues to have num_delpages of 0 */
267  _bt_relbuf(rel, metabuf);
268  return;
269  }
270 
271  /* trade in our read lock for a write lock */
272  _bt_unlockbuf(rel, metabuf);
273  _bt_lockbuf(rel, metabuf, BT_WRITE);
274 
275  START_CRIT_SECTION();
276 
277  /* upgrade meta-page if needed */
278  if (metad->btm_version < BTREE_NOVAC_VERSION)
279  _bt_upgrademetapage(metapg);
280 
281  /* update cleanup-related information */
282  metad->btm_last_cleanup_num_delpages = num_delpages;
283  metad->btm_last_cleanup_num_heap_tuples = -1.0;
284  MarkBufferDirty(metabuf);
285 
286  /* write wal record if needed */
287  if (RelationNeedsWAL(rel))
288  {
289  xl_btree_metadata md;
290  XLogRecPtr recptr;
291 
292  XLogBeginInsert();
293  XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
294 
295  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
296  md.version = metad->btm_version;
297  md.root = metad->btm_root;
298  md.level = metad->btm_level;
299  md.fastroot = metad->btm_fastroot;
300  md.fastlevel = metad->btm_fastlevel;
301  md.last_cleanup_num_delpages = num_delpages;
302  md.allequalimage = metad->btm_allequalimage;
303 
304  XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
305 
306  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
307 
308  PageSetLSN(metapg, recptr);
309  }
310 
311  END_CRIT_SECTION();
312 
313  _bt_relbuf(rel, metabuf);
314 }
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().
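
A hedged reconstruction of the btvacuumcleanup() call site (not verbatim source): num_delpages is the number of pages deleted by this or a previous VACUUM that could not yet be recycled into the FSM.

static void
demo_cleanup_info(Relation rel, IndexBulkDeleteResult *stats)
{
	/* deleted pages minus the ones already placed in the FSM */
	BlockNumber num_delpages = stats->pages_deleted - stats->pages_free;

	_bt_set_cleanup_info(rel, num_delpages);
}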

◆ _bt_unlink_halfdead_page()

static bool _bt_unlink_halfdead_page ( Relation  rel,
Buffer  leafbuf,
BlockNumber  scanblkno,
bool rightsib_empty,
BTVacState vstate 
)
static

Definition at line 2303 of file nbtpage.c.

2305 {
2306  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2307  IndexBulkDeleteResult *stats = vstate->stats;
2308  BlockNumber leafleftsib;
2309  BlockNumber leafrightsib;
2310  BlockNumber target;
2311  BlockNumber leftsib;
2312  BlockNumber rightsib;
2313  Buffer lbuf = InvalidBuffer;
2314  Buffer buf;
2315  Buffer rbuf;
2316  Buffer metabuf = InvalidBuffer;
2317  Page metapg = NULL;
2318  BTMetaPageData *metad = NULL;
2319  ItemId itemid;
2320  Page page;
2321  BTPageOpaque opaque;
2322  FullTransactionId safexid;
2323  bool rightsib_is_rightmost;
2324  uint32 targetlevel;
2325  IndexTuple leafhikey;
2326  BlockNumber leaftopparent;
2327 
2328  page = BufferGetPage(leafbuf);
2329  opaque = BTPageGetOpaque(page);
2330 
2331  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2332 
2333  /*
2334  * Remember some information about the leaf page.
2335  */
2336  itemid = PageGetItemId(page, P_HIKEY);
2337  leafhikey = (IndexTuple) PageGetItem(page, itemid);
2338  target = BTreeTupleGetTopParent(leafhikey);
2339  leafleftsib = opaque->btpo_prev;
2340  leafrightsib = opaque->btpo_next;
2341 
2342  _bt_unlockbuf(rel, leafbuf);
2343 
2344  /*
2345  * Check here, as calling loops will have locks held, preventing
2346  * interrupts from being processed.
2347  */
2348  CHECK_FOR_INTERRUPTS();
2349 
2350  /* Unlink the current top parent of the subtree */
2351  if (!BlockNumberIsValid(target))
2352  {
2353  /* Target is leaf page (or leaf page is top parent, if you prefer) */
2354  target = leafblkno;
2355 
2356  buf = leafbuf;
2357  leftsib = leafleftsib;
2358  targetlevel = 0;
2359  }
2360  else
2361  {
2362  /* Target is the internal page taken from leaf's top parent link */
2363  Assert(target != leafblkno);
2364 
2365  /* Fetch the block number of the target's left sibling */
2366  buf = _bt_getbuf(rel, target, BT_READ);
2367  page = BufferGetPage(buf);
2368  opaque = BTPageGetOpaque(page);
2369  leftsib = opaque->btpo_prev;
2370  targetlevel = opaque->btpo_level;
2371  Assert(targetlevel > 0);
2372 
2373  /*
2374  * To avoid deadlocks, we'd better drop the target page lock before
2375  * going further.
2376  */
2377  _bt_unlockbuf(rel, buf);
2378  }
2379 
2380  /*
2381  * We have to lock the pages we need to modify in the standard order:
2382  * moving right, then up. Else we will deadlock against other writers.
2383  *
2384  * So, first lock the leaf page, if it's not the target. Then find and
2385  * write-lock the current left sibling of the target page. The sibling
2386  * that was current a moment ago could have split, so we may have to move
2387  * right.
2388  */
2389  if (target != leafblkno)
2390  _bt_lockbuf(rel, leafbuf, BT_WRITE);
2391  if (leftsib != P_NONE)
2392  {
2393  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2394  page = BufferGetPage(lbuf);
2395  opaque = BTPageGetOpaque(page);
2396  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2397  {
2398  bool leftsibvalid = true;
2399 
2400  /*
2401  * Before we follow the link from the page that was the left
2402  * sibling mere moments ago, validate its right link. This
2403  * reduces the opportunities for loop to fail to ever make any
2404  * progress in the presence of index corruption.
2405  *
2406  * Note: we rely on the assumption that there can only be one
2407  * vacuum process running at a time (against the same index).
2408  */
2409  if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2410  leftsib == opaque->btpo_next)
2411  leftsibvalid = false;
2412 
2413  leftsib = opaque->btpo_next;
2414  _bt_relbuf(rel, lbuf);
2415 
2416  if (!leftsibvalid)
2417  {
2418  if (target != leafblkno)
2419  {
2420  /* we have only a pin on target, but pin+lock on leafbuf */
2421  ReleaseBuffer(buf);
2422  _bt_relbuf(rel, leafbuf);
2423  }
2424  else
2425  {
2426  /* we have only a pin on leafbuf */
2427  ReleaseBuffer(leafbuf);
2428  }
2429 
2430  ereport(LOG,
2431  (errcode(ERRCODE_INDEX_CORRUPTED),
2432  errmsg_internal("valid left sibling for deletion target could not be located: "
2433  "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
2434  leftsib, target, leafblkno, scanblkno,
2435  RelationGetRelationName(rel))));
2436 
2437  return false;
2438  }
2439 
2440  CHECK_FOR_INTERRUPTS();
2441 
2442  /* step right one page */
2443  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2444  page = BufferGetPage(lbuf);
2445  opaque = BTPageGetOpaque(page);
2446  }
2447  }
2448  else
2449  lbuf = InvalidBuffer;
2450 
2451  /* Next write-lock the target page itself */
2452  _bt_lockbuf(rel, buf, BT_WRITE);
2453  page = BufferGetPage(buf);
2454  opaque = BTPageGetOpaque(page);
2455 
2456  /*
2457  * Check page is still empty etc, else abandon deletion. This is just for
2458  * paranoia's sake; a half-dead page cannot resurrect because there can be
2459  * only one vacuum process running at a time.
2460  */
2461  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2462  elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2463  target, RelationGetRelationName(rel));
2464 
2465  if (opaque->btpo_prev != leftsib)
2466  ereport(ERROR,
2467  (errcode(ERRCODE_INDEX_CORRUPTED),
2468  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2469  leftsib, opaque->btpo_prev, target,
2470  RelationGetRelationName(rel))));
2471 
2472  if (target == leafblkno)
2473  {
2474  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2475  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2476  elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2477  target, RelationGetRelationName(rel));
2478 
2479  /* Leaf page is also target page: don't set leaftopparent */
2480  leaftopparent = InvalidBlockNumber;
2481  }
2482  else
2483  {
2484  IndexTuple finaldataitem;
2485 
2486  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2487  P_ISLEAF(opaque))
2488  elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2489  targetlevel, target, RelationGetRelationName(rel));
2490 
2491  /* Target is internal: set leaftopparent for next call here... */
2492  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2493  finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2494  leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2495  /* ...except when it would be a redundant pointer-to-self */
2496  if (leaftopparent == leafblkno)
2497  leaftopparent = InvalidBlockNumber;
2498  }
2499 
2500  /* No leaftopparent for level 0 (leaf page) or level 1 target */
2501  Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2502 
2503  /*
2504  * And next write-lock the (current) right sibling.
2505  */
2506  rightsib = opaque->btpo_next;
2507  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2508  page = BufferGetPage(rbuf);
2509  opaque = BTPageGetOpaque(page);
2510  if (opaque->btpo_prev != target)
2511  ereport(ERROR,
2512  (errcode(ERRCODE_INDEX_CORRUPTED),
2513  errmsg_internal("right sibling's left-link doesn't match: "
2514  "block %u links to %u instead of expected %u in index \"%s\"",
2515  rightsib, opaque->btpo_prev, target,
2516  RelationGetRelationName(rel))));
2517  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2518  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2519 
2520  /*
2521  * If we are deleting the next-to-last page on the target's level, then
2522  * the rightsib is a candidate to become the new fast root. (In theory, it
2523  * might be possible to push the fast root even further down, but the odds
2524  * of doing so are slim, and the locking considerations daunting.)
2525  *
2526  * We can safely acquire a lock on the metapage here --- see comments for
2527  * _bt_newroot().
2528  */
2529  if (leftsib == P_NONE && rightsib_is_rightmost)
2530  {
2531  page = BufferGetPage(rbuf);
2532  opaque = BTPageGetOpaque(page);
2533  if (P_RIGHTMOST(opaque))
2534  {
2535  /* rightsib will be the only one left on the level */
2536  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2537  metapg = BufferGetPage(metabuf);
2538  metad = BTPageGetMeta(metapg);
2539 
2540  /*
2541  * The expected case here is btm_fastlevel == targetlevel+1; if
2542  * the fastlevel is <= targetlevel, something is wrong, and we
2543  * choose to overwrite it to fix it.
2544  */
2545  if (metad->btm_fastlevel > targetlevel + 1)
2546  {
2547  /* no update wanted */
2548  _bt_relbuf(rel, metabuf);
2549  metabuf = InvalidBuffer;
2550  }
2551  }
2552  }
2553 
2554  /*
2555  * Here we begin doing the deletion.
2556  */
2557 
2558  /* No ereport(ERROR) until changes are logged */
2559  START_CRIT_SECTION();
2560 
2561  /*
2562  * Update siblings' side-links. Note the target page's side-links will
2563  * continue to point to the siblings. Asserts here are just rechecking
2564  * things we already verified above.
2565  */
2566  if (BufferIsValid(lbuf))
2567  {
2568  page = BufferGetPage(lbuf);
2569  opaque = BTPageGetOpaque(page);
2570  Assert(opaque->btpo_next == target);
2571  opaque->btpo_next = rightsib;
2572  }
2573  page = BufferGetPage(rbuf);
2574  opaque = BTPageGetOpaque(page);
2575  Assert(opaque->btpo_prev == target);
2576  opaque->btpo_prev = leftsib;
2577 
2578  /*
2579  * If we deleted a parent of the targeted leaf page, instead of the leaf
2580  * itself, update the leaf to point to the next remaining child in the
2581  * subtree.
2582  *
2583  * Note: We rely on the fact that a buffer pin on the leaf page has been
2584  * held since leafhikey was initialized. This is safe, though only
2585  * because the page was already half-dead at that point. The leaf page
2586  * cannot have been modified by any other backend during the period when
2587  * no lock was held.
2588  */
2589  if (target != leafblkno)
2590  BTreeTupleSetTopParent(leafhikey, leaftopparent);
2591 
2592  /*
2593  * Mark the page itself deleted. It can be recycled when all current
2594  * transactions are gone. Storing GetTopTransactionId() would work, but
2595  * we're in VACUUM and would not otherwise have an XID. Having already
2596  * updated links to the target, ReadNextFullTransactionId() suffices as an
2597  * upper bound. Any scan having retained a now-stale link is advertising
2598  * in its PGPROC an xmin less than or equal to the value we read here. It
2599  * will continue to do so, holding back the xmin horizon, for the duration
2600  * of that scan.
2601  */
2602  page = BufferGetPage(buf);
2603  opaque = BTPageGetOpaque(page);
2604  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2605 
2606  /*
2607  * Store upper bound XID that's used to determine when deleted page is no
2608  * longer needed as a tombstone
2609  */
2610  safexid = ReadNextFullTransactionId();
2611  BTPageSetDeleted(page, safexid);
2612  opaque->btpo_cycleid = 0;
2613 
2614  /* And update the metapage, if needed */
2615  if (BufferIsValid(metabuf))
2616  {
2617  /* upgrade metapage if needed */
2618  if (metad->btm_version < BTREE_NOVAC_VERSION)
2619  _bt_upgrademetapage(metapg);
2620  metad->btm_fastroot = rightsib;
2621  metad->btm_fastlevel = targetlevel;
2622  MarkBufferDirty(metabuf);
2623  }
2624 
2625  /* Must mark buffers dirty before XLogInsert */
2626  MarkBufferDirty(rbuf);
2627  MarkBufferDirty(buf);
2628  if (BufferIsValid(lbuf))
2629  MarkBufferDirty(lbuf);
2630  if (target != leafblkno)
2631  MarkBufferDirty(leafbuf);
2632 
2633  /* XLOG stuff */
2634  if (RelationNeedsWAL(rel))
2635  {
2636  xl_btree_unlink_page xlrec;
2637  xl_btree_metadata xlmeta;
2638  uint8 xlinfo;
2639  XLogRecPtr recptr;
2640 
2641  XLogBeginInsert();
2642 
2643  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2644  if (BufferIsValid(lbuf))
2645  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2646  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2647  if (target != leafblkno)
2648  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2649 
2650  /* information stored on the target/to-be-unlinked block */
2651  xlrec.leftsib = leftsib;
2652  xlrec.rightsib = rightsib;
2653  xlrec.level = targetlevel;
2654  xlrec.safexid = safexid;
2655 
2656  /* information needed to recreate the leaf block (if not the target) */
2657  xlrec.leafleftsib = leafleftsib;
2658  xlrec.leafrightsib = leafrightsib;
2659  xlrec.leaftopparent = leaftopparent;
2660 
2661  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2662 
2663  if (BufferIsValid(metabuf))
2664  {
2665  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2666 
2667  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2668  xlmeta.version = metad->btm_version;
2669  xlmeta.root = metad->btm_root;
2670  xlmeta.level = metad->btm_level;
2671  xlmeta.fastroot = metad->btm_fastroot;
2672  xlmeta.fastlevel = metad->btm_fastlevel;
2673  xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2674  xlmeta.allequalimage = metad->btm_allequalimage;
2675 
2676  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2677  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2678  }
2679  else
2680  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2681 
2682  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2683 
2684  if (BufferIsValid(metabuf))
2685  {
2686  PageSetLSN(metapg, recptr);
2687  }
2688  page = BufferGetPage(rbuf);
2689  PageSetLSN(page, recptr);
2690  page = BufferGetPage(buf);
2691  PageSetLSN(page, recptr);
2692  if (BufferIsValid(lbuf))
2693  {
2694  page = BufferGetPage(lbuf);
2695  PageSetLSN(page, recptr);
2696  }
2697  if (target != leafblkno)
2698  {
2699  page = BufferGetPage(leafbuf);
2700  PageSetLSN(page, recptr);
2701  }
2702  }
2703 
2704  END_CRIT_SECTION();
2705 
2706  /* release metapage */
2707  if (BufferIsValid(metabuf))
2708  _bt_relbuf(rel, metabuf);
2709 
2710  /* release siblings */
2711  if (BufferIsValid(lbuf))
2712  _bt_relbuf(rel, lbuf);
2713  _bt_relbuf(rel, rbuf);
2714 
2715  /* If the target is not leafbuf, we're done with it now -- release it */
2716  if (target != leafblkno)
2717  _bt_relbuf(rel, buf);
2718 
2719  /*
2720  * Maintain pages_newly_deleted, which is simply the number of pages
2721  * deleted by the ongoing VACUUM operation.
2722  *
2723  * Maintain pages_deleted in a way that takes into account how
2724  * btvacuumpage() will count deleted pages that have yet to become
2725  * scanblkno -- only count page when it's not going to get that treatment
2726  * later on.
2727  */
2728  stats->pages_newly_deleted++;
2729  if (target <= scanblkno)
2730  stats->pages_deleted++;
2731 
2732  /*
2733  * Remember information about the target page (now a newly deleted page)
2734  * in dedicated vstate space for later. The page will be considered as a
2735  * candidate to place in the FSM at the end of the current btvacuumscan()
2736  * call.
2737  */
2738  _bt_pendingfsm_add(vstate, target, safexid);
2739 
2740  return true;
2741 }
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
unsigned char uint8
Definition: c.h:488
static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
Definition: nbtpage.c:3021
static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)
Definition: nbtree.h:621
static void BTPageSetDeleted(Page page, FullTransactionId safexid)
Definition: nbtree.h:239
#define XLOG_BTREE_UNLINK_PAGE
Definition: nbtxlog.h:35
#define XLOG_BTREE_UNLINK_PAGE_META
Definition: nbtxlog.h:36
#define SizeOfBtreeUnlinkPage
Definition: nbtxlog.h:320
BlockNumber pages_deleted
Definition: genam.h:81
FullTransactionId ReadNextFullTransactionId(void)
Definition: varsup.c:261

References _bt_getbuf(), _bt_lockbuf(), _bt_pendingfsm_add(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BlockNumberIsValid(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageSetDeleted(), BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetDownLink(), BTreeTupleGetTopParent(), BTreeTupleSetTopParent(), buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CHECK_FOR_INTERRUPTS, elog(), END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBlockNumber, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_unlink_page::leafleftsib, xl_btree_unlink_page::leafrightsib, xl_btree_unlink_page::leaftopparent, xl_btree_unlink_page::leftsib, xl_btree_metadata::level, xl_btree_unlink_page::level, LOG, MarkBufferDirty(), P_FIRSTDATAKEY, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_newly_deleted, PageSetLSN(), ReadNextFullTransactionId(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, ReleaseBuffer(), xl_btree_unlink_page::rightsib, xl_btree_metadata::root, xl_btree_unlink_page::safexid, SizeOfBtreeUnlinkPage, START_CRIT_SECTION, BTVacState::stats, xl_btree_metadata::version, XLOG_BTREE_UNLINK_PAGE, XLOG_BTREE_UNLINK_PAGE_META, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_pagedel().

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1082 of file nbtpage.c.

1083 {
1084  /*
1085   * Buffer is pinned and locked, which means that it is expected to be
1086   * defined and addressable. Check that proactively.
1087   */
1088  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1089 
1090  /* LockBuffer() asserts that pin is held by this backend */
1091  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1092 
1093  if (!RelationUsesLocalBuffers(rel))
1094   VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1095 }

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_first(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_relbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
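As the Valgrind instrumentation above implies, _bt_unlockbuf() drops only the lock; the pin remains held. A hedged sketch of the usual pairing, assuming rel and blkno are in scope and the buffer came from _bt_getbuf(), which returns it pinned and locked:

Buffer  buf = _bt_getbuf(rel, blkno, BT_READ);  /* pinned + read-locked */
Page    page = BufferGetPage(buf);

/* ... inspect the page while the lock is held ... */

_bt_unlockbuf(rel, buf);    /* lock released, pin still held */
ReleaseBuffer(buf);         /* finally drop the pin as well */

When the lock and pin are meant to be dropped together, callers use _bt_relbuf(rel, buf) instead, which performs the same two steps.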

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)

Definition at line 1121 of file nbtpage.c.

1122 {
1123  /*
1124   * Buffer is pinned and locked, which means that it is expected to be
1125   * defined and addressable. Check that proactively.
1126   */
1127  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1128 
1129  /* LockBuffer() asserts that pin is held by this backend */
1130  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1131  LockBufferForCleanup(buf);
1132 }

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().
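btvacuumpage() uses this to trade an ordinary read lock for a super-exclusive cleanup lock before removing leaf tuples. A hedged sketch of that pattern (buf is assumed pinned and read-locked); note that the read lock is briefly released during the upgrade, so the page must be rechecked afterward:

buf = _bt_getbuf(rel, scanblkno, BT_READ);

/* ... decide that this leaf page needs tuples removed ... */

_bt_upgradelockbufcleanup(rel, buf);   /* read lock out, cleanup lock in */

/* the page may have changed while briefly unlocked; recheck it
 * before actually deleting anything */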

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)

Definition at line 109 of file nbtpage.c.

110 {
111  BTMetaPageData *metad;
112  BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
113 
114  metad = BTPageGetMeta(page);
115  metaopaque = BTPageGetOpaque(page);
116 
117  /* It must be really a meta page of upgradable version */
118  Assert(metaopaque->btpo_flags & BTP_META);
119  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
120  Assert(metad->btm_version >= BTREE_MIN_VERSION);
121 
122  /* Set version number and fill extra fields added into version 3 */
123  metad->btm_version = BTREE_NOVAC_VERSION;
124  metad->btm_last_cleanup_num_delpages = 0;
125  metad->btm_last_cleanup_num_heap_tuples = -1.0;
126  /* Only a REINDEX can set this field */
127  Assert(!metad->btm_allequalimage);
128  metad->btm_allequalimage = false;
129 
130  /* Adjust pd_lower (see _bt_initmetapage() for details) */
131  ((PageHeader) page)->pd_lower =
132   ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
133 }

References Assert(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newroot(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
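Each of these callers applies the upgrade lazily, just before modifying a metapage that still has a pre-version-3 format. A hedged sketch of the shared pattern (WAL logging omitted for brevity):

metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);

START_CRIT_SECTION();

/* upgrade in place if the metapage predates BTREE_NOVAC_VERSION */
if (metad->btm_version < BTREE_NOVAC_VERSION)
    _bt_upgrademetapage(metapg);

/* ... set metapage fields, then ... */
MarkBufferDirty(metabuf);

END_CRIT_SECTION();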

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)

Definition at line 181 of file nbtpage.c.

182 {
183  Buffer metabuf;
184  Page metapg;
185  BTMetaPageData *metad;
186  uint32 btm_version;
187  BlockNumber prev_num_delpages;
188 
189  /*
190  * Copy details from metapage to local variables quickly.
191  *
192  * Note that we deliberately avoid using the cached version of the metapage here.
193  */
194  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
195  metapg = BufferGetPage(metabuf);
196  metad = BTPageGetMeta(metapg);
197  btm_version = metad->btm_version;
198 
199  if (btm_version < BTREE_NOVAC_VERSION)
200  {
201  /*
202  * Metapage needs to be dynamically upgraded to store fields that are
203  * only present when btm_version >= BTREE_NOVAC_VERSION
204  */
205  _bt_relbuf(rel, metabuf);
206  return true;
207  }
208 
209  prev_num_delpages = metad->btm_last_cleanup_num_delpages;
210  _bt_relbuf(rel, metabuf);
211 
212  /*
213  * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
214  * total size of the index. We can reasonably expect (though are not
215  * guaranteed) to be able to recycle this many pages if we decide to do a
216  * btvacuumscan call during the ongoing btvacuumcleanup. For further
217  * details see the nbtree/README section on placing deleted pages in the
218  * FSM.
219  */
220  if (prev_num_delpages > 0 &&
221  prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
222  return true;
223 
224  return false;
225 }
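To make the 5% threshold concrete: an index of 10,000 blocks (about 78MB at the default 8KB block size) has RelationGetNumberOfBlocks(rel) / 20 = 500, so this function returns true only if the previous VACUUM left more than 500 deleted-but-still-unrecycled pages behind, or if the metapage still needs its version upgrade.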

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().