PostgreSQL Source Code  git master
nbtpage.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"

Functions

static BTMetaPageData * _bt_getmeta (Relation rel, Buffer metabuf)
 
static void _bt_log_reuse_page (Relation rel, BlockNumber blkno, FullTransactionId safexid)
 
static void _bt_delitems_delete (Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
static char * _bt_delitems_update (BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
 
static bool _bt_mark_page_halfdead (Relation rel, Buffer leafbuf, BTStack stack)
 
static bool _bt_unlink_halfdead_page (Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
 
static bool _bt_lock_subtree_parent (Relation rel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
 
static void _bt_pendingfsm_add (BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
void _bt_upgrademetapage (Page page)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
static int _bt_delitems_cmp (const void *a, const void *b)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
static bool _bt_leftsib_splitflag (Relation rel, BlockNumber leftsib, BlockNumber target)
 
static bool _bt_rightsib_halfdeadflag (Relation rel, BlockNumber leafrightsib)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 

Function Documentation

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 794 of file nbtpage.c.

795 {
796  Page page = BufferGetPage(buf);
797 
798  /*
799  * ReadBuffer verifies that every newly-read page passes
800  * PageHeaderIsValid, which means it either contains a reasonably sane
801  * page header or is all-zero. We have to defend against the all-zero
802  * case, however.
803  */
804  if (PageIsNew(page))
805  ereport(ERROR,
806  (errcode(ERRCODE_INDEX_CORRUPTED),
807  errmsg("index \"%s\" contains unexpected zero page at block %u",
808  RelationGetRelationName(rel),
809  BufferGetBlockNumber(buf)),
810  errhint("Please REINDEX it.")));
811 
812  /*
813  * Additionally check that the special area looks sane.
814  */
815  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
816  ereport(ERROR,
817  (errcode(ERRCODE_INDEX_CORRUPTED),
818  errmsg("index \"%s\" contains corrupted page at block %u",
819  RelationGetRelationName(rel),
820  BufferGetBlockNumber(buf)),
821  errhint("Please REINDEX it.")));
822 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2755
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
Pointer Page
Definition: bufpage.h:78
#define PageGetSpecialSize(page)
Definition: bufpage.h:299
#define PageIsNew(page)
Definition: bufpage.h:228
#define MAXALIGN(LEN)
Definition: c.h:757
int errhint(const char *fmt,...)
Definition: elog.c:1151
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define ERROR
Definition: elog.h:33
#define ereport(elevel,...)
Definition: elog.h:143
static char * buf
Definition: pg_test_fsync.c:67
#define RelationGetRelationName(relation)
Definition: rel.h:523

References buf, BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
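
The usual calling pattern pins the block, takes the nbtree-level lock, and only then validates the page, exactly as _bt_getbuf() does for existing blocks. A minimal sketch of that pattern (the helper name is hypothetical, not part of nbtpage.c):

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Sketch: pin, lock, then validate a known block, mirroring _bt_getbuf(). */
static Buffer
read_checked_btree_page(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* acquire pin */

    _bt_lockbuf(rel, buf, access);              /* BT_READ or BT_WRITE */
    _bt_checkpage(rel, buf);                    /* ereport(ERROR) if zeroed or corrupt */
    return buf;                                 /* caller releases with _bt_relbuf() */
}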

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1105 of file nbtpage.c.

1106 {
1107  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1108  if (!ConditionalLockBuffer(buf))
1109  return false;
1110 
1111  if (!RelationUsesLocalBuffers(rel))
1112  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1113 
1114  return true;
1115 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4182
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:622

References buf, BufferGetPage, ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), and _bt_search_insert().
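
Callers treat failure to get the lock as "page is busy" and move on rather than waiting, as in _bt_getbuf()'s free-page recycling loop. A hedged sketch of that pattern (the helper name is hypothetical):

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Sketch: opportunistically inspect a page, skipping it if already locked. */
static bool
try_inspect_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    if (!_bt_conditionallockbuf(rel, buf))
    {
        ReleaseBuffer(buf);     /* couldn't lock without waiting; just drop pin */
        return false;
    }

    /* ... examine BufferGetPage(buf) while the lock is held ... */

    _bt_relbuf(rel, buf);       /* release lock and pin together */
    return true;
}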

◆ _bt_delitems_cmp()

static int _bt_delitems_cmp ( const void *  a,
const void *  b 
)
static

Definition at line 1474 of file nbtpage.c.

1475 {
1476  TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1477  TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1478 
1479  if (indexdelete1->id > indexdelete2->id)
1480  return 1;
1481  if (indexdelete1->id < indexdelete2->id)
1482  return -1;
1483 
1484  Assert(false);
1485 
1486  return 0;
1487 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69
Assert(fmt[strlen(fmt) - 1] !='\n')

References a, Assert(), b, and TM_IndexDelete::id.

Referenced by _bt_delitems_delete_check().

◆ _bt_delitems_delete()

static void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
TransactionId  latestRemovedXid,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)
static

Definition at line 1296 of file nbtpage.c.

1299 {
1300  Page page = BufferGetPage(buf);
1301  BTPageOpaque opaque;
1302  bool needswal = RelationNeedsWAL(rel);
1303  char *updatedbuf = NULL;
1304  Size updatedbuflen = 0;
1305  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1306 
1307  /* Shouldn't be called unless there's something to do */
1308  Assert(ndeletable > 0 || nupdatable > 0);
1309 
1310  /* Generate new versions of posting lists without deleted TIDs */
1311  if (nupdatable > 0)
1312  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1313  updatedoffsets, &updatedbuflen,
1314  needswal);
1315 
1316  /* No ereport(ERROR) until changes are logged */
1317  START_CRIT_SECTION();
1318 
1319  /* Handle updates and deletes just like _bt_delitems_vacuum */
1320  for (int i = 0; i < nupdatable; i++)
1321  {
1322  OffsetNumber updatedoffset = updatedoffsets[i];
1323  IndexTuple itup;
1324  Size itemsz;
1325 
1326  itup = updatable[i]->itup;
1327  itemsz = MAXALIGN(IndexTupleSize(itup));
1328  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1329  itemsz))
1330  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1331  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1332  }
1333 
1334  if (ndeletable > 0)
1335  PageIndexMultiDelete(page, deletable, ndeletable);
1336 
1337  /*
1338  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1339  * this point. The VACUUM command alone controls vacuum cycle IDs.
1340  */
1341  opaque = BTPageGetOpaque(page);
1342 
1343  /*
1344  * Clear the BTP_HAS_GARBAGE page flag.
1345  *
1346  * This flag indicates the presence of LP_DEAD items on the page (though
1347  * not reliably). Note that we only rely on it with pg_upgrade'd
1348  * !heapkeyspace indexes.
1349  */
1350  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1351 
1351 
1352  MarkBufferDirty(buf);
1353 
1354  /* XLOG stuff */
1355  if (needswal)
1356  {
1357  XLogRecPtr recptr;
1358  xl_btree_delete xlrec_delete;
1359 
1360  xlrec_delete.latestRemovedXid = latestRemovedXid;
1361  xlrec_delete.ndeleted = ndeletable;
1362  xlrec_delete.nupdated = nupdatable;
1363 
1364  XLogBeginInsert();
1365  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1366  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1367 
1368  if (ndeletable > 0)
1369  XLogRegisterBufData(0, (char *) deletable,
1370  ndeletable * sizeof(OffsetNumber));
1371 
1372  if (nupdatable > 0)
1373  {
1374  XLogRegisterBufData(0, (char *) updatedoffsets,
1375  nupdatable * sizeof(OffsetNumber));
1376  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1377  }
1378 
1379  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1380 
1381  PageSetLSN(page, recptr);
1382  }
1383 
1384  END_CRIT_SECTION();
1385 
1386  /* can't leak memory here */
1387  if (updatedbuf != NULL)
1388  pfree(updatedbuf);
1389  /* free tuples allocated within _bt_delitems_update() */
1390  for (int i = 0; i < nupdatable; i++)
1391  pfree(updatable[i]->itup);
1392 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1405
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
size_t Size
Definition: c.h:540
#define PANIC
Definition: elog.h:36
#define elog(elevel,...)
Definition: elog.h:218
int i
Definition: isn.c:73
Pointer Item
Definition: item.h:17
#define IndexTupleSize(itup)
Definition: itup.h:70
#define MaxIndexTuplesPerPage
Definition: itup.h:144
void pfree(void *pointer)
Definition: mcxt.c:1175
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1415
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define SizeOfBtreeDelete
Definition: nbtxlog.h:241
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:34
uint16 OffsetNumber
Definition: off.h:24
#define RelationNeedsWAL(relation)
Definition: rel.h:613
uint16 btpo_flags
Definition: nbtree.h:67
IndexTuple itup
Definition: nbtree.h:906
TransactionId latestRemovedXid
Definition: nbtxlog.h:232
uint16 ndeleted
Definition: nbtxlog.h:233
uint16 nupdated
Definition: nbtxlog.h:234
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:443
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:351
#define REGBUF_STANDARD
Definition: xloginsert.h:34

References _bt_delitems_update(), Assert(), BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, xl_btree_delete::latestRemovedXid, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_delete::ndeleted, xl_btree_delete::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN, PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeDelete, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delitems_delete_check().

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1528 of file nbtpage.c.

1530 {
1531  Page page = BufferGetPage(buf);
1532  TransactionId latestRemovedXid;
1533  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1534  int ndeletable = 0,
1535  nupdatable = 0;
1536  OffsetNumber deletable[MaxIndexTuplesPerPage];
1537  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1538 
1539  /* Use tableam interface to determine which tuples to delete first */
1540  latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
1541 
1542  /* Should not WAL-log latestRemovedXid unless it's required */
1543  if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
1544  latestRemovedXid = InvalidTransactionId;
1545 
1546  /*
1547  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1548  * needs to do to physically delete index tuples from the page.
1549  *
1550  * Must sort deltids array to restore leaf-page-wise order (original order
1551  * before call to tableam). This is the order that the loop expects.
1552  *
1553  * Note that deltids array might be a lot smaller now. It might even have
1554  * no entries at all (with bottom-up deletion caller), in which case there
1555  * is nothing left to do.
1556  */
1557  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1558  _bt_delitems_cmp);
1559  if (delstate->ndeltids == 0)
1560  {
1561  Assert(delstate->bottomup);
1562  return;
1563  }
1564 
1565  /* We definitely have to delete at least one index tuple (or one TID) */
1566  for (int i = 0; i < delstate->ndeltids; i++)
1567  {
1568  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1569  OffsetNumber idxoffnum = dstatus->idxoffnum;
1570  ItemId itemid = PageGetItemId(page, idxoffnum);
1571  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1572  int nestedi,
1573  nitem;
1574  BTVacuumPosting vacposting;
1575 
1576  Assert(OffsetNumberIsValid(idxoffnum));
1577 
1578  if (idxoffnum == postingidxoffnum)
1579  {
1580  /*
1581  * This deltid entry is a TID from a posting list tuple that has
1582  * already been completely processed
1583  */
1584  Assert(BTreeTupleIsPosting(itup));
1585  Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1586  &delstate->deltids[i].tid) < 0);
1587  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1588  &delstate->deltids[i].tid) >= 0);
1589  continue;
1590  }
1591 
1592  if (!BTreeTupleIsPosting(itup))
1593  {
1594  /* Plain non-pivot tuple */
1595  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1596  if (dstatus->knowndeletable)
1597  deletable[ndeletable++] = idxoffnum;
1598  continue;
1599  }
1600 
1601  /*
1602  * itup is a posting list tuple whose lowest deltids entry (which may
1603  * or may not be for the first TID from itup) is considered here now.
1604  * We should process all of the deltids entries for the posting list
1605  * together now, though (not just the lowest). Remember to skip over
1606  * later itup-related entries during later iterations of outermost
1607  * loop.
1608  */
1609  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1610  nestedi = i; /* Initialize for first itup deltids entry */
1611  vacposting = NULL; /* Describes final action for itup */
1612  nitem = BTreeTupleGetNPosting(itup);
1613  for (int p = 0; p < nitem; p++)
1614  {
1615  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1616  int ptidcmp = -1;
1617 
1618  /*
1619  * This nested loop reuses work across ptid TIDs taken from itup.
1620  * We take advantage of the fact that both itup's TIDs and deltids
1621  * entries (within a single itup/posting list grouping) must both
1622  * be in ascending TID order.
1623  */
1624  for (; nestedi < delstate->ndeltids; nestedi++)
1625  {
1626  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1627  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1628 
1629  /* Stop once we get past all itup related deltids entries */
1630  Assert(tdstatus->idxoffnum >= idxoffnum);
1631  if (tdstatus->idxoffnum != idxoffnum)
1632  break;
1633 
1634  /* Skip past non-deletable itup related entries up front */
1635  if (!tdstatus->knowndeletable)
1636  continue;
1637 
1638  /* Entry is first partial ptid match (or an exact match)? */
1639  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1640  if (ptidcmp >= 0)
1641  {
1642  /* Greater than or equal (partial or exact) match... */
1643  break;
1644  }
1645  }
1646 
1647  /* ...exact ptid match to a deletable deltids entry? */
1648  if (ptidcmp != 0)
1649  continue;
1650 
1651  /* Exact match for deletable deltids entry -- ptid gets deleted */
1652  if (vacposting == NULL)
1653  {
1654  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1655  nitem * sizeof(uint16));
1656  vacposting->itup = itup;
1657  vacposting->updatedoffset = idxoffnum;
1658  vacposting->ndeletedtids = 0;
1659  }
1660  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1661  }
1662 
1663  /* Final decision on itup, a posting list tuple */
1664 
1665  if (vacposting == NULL)
1666  {
1667  /* No TIDs to delete from itup -- do nothing */
1668  }
1669  else if (vacposting->ndeletedtids == nitem)
1670  {
1671  /* Straight delete of itup (to delete all TIDs) */
1672  deletable[ndeletable++] = idxoffnum;
1673  /* Turns out we won't need granular information */
1674  pfree(vacposting);
1675  }
1676  else
1677  {
1678  /* Delete some (but not all) TIDs from itup */
1679  Assert(vacposting->ndeletedtids > 0 &&
1680  vacposting->ndeletedtids < nitem);
1681  updatable[nupdatable++] = vacposting;
1682  }
1683  }
1684 
1685  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1686  _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
1687  updatable, nupdatable);
1688 
1689  /* be tidy */
1690  for (int i = 0; i < nupdatable; i++)
1691  pfree(updatable[i]);
1692 }
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
unsigned short uint16
Definition: c.h:440
#define offsetof(type, field)
Definition: c.h:727
uint32 TransactionId
Definition: c.h:587
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:29
IndexTupleData * IndexTuple
Definition: itup.h:53
void * palloc(Size size)
Definition: mcxt.c:1068
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1296
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1474
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:513
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:539
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:659
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:487
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:633
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:495
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:911
uint16 ndeletedtids
Definition: nbtree.h:910
OffsetNumber updatedoffset
Definition: nbtree.h:907
ItemPointerData t_tid
Definition: itup.h:37
TM_IndexStatus * status
Definition: tableam.h:231
TM_IndexDelete * deltids
Definition: tableam.h:230
ItemPointerData tid
Definition: tableam.h:189
bool knowndeletable
Definition: tableam.h:196
OffsetNumber idxoffnum
Definition: tableam.h:195
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1327
#define InvalidTransactionId
Definition: transam.h:31
#define XLogStandbyInfoActive()
Definition: xlog.h:118

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert(), TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage, BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, offsetof, PageGetItem, PageGetItemId, palloc(), pfree(), qsort, RelationNeedsWAL, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
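
Callers prepare a TM_IndexDeleteOp describing candidate TIDs and let the table AM decide which are actually deletable. A rough sketch of the calling convention, loosely after _bt_simpledel_pass(); the function name is hypothetical, deltids/status population is abbreviated, and any TM_IndexDeleteOp fields beyond those shown must also be initialized for the server version at hand:

#include "postgres.h"
#include "access/nbtree.h"
#include "access/tableam.h"

/* Sketch: drive a simple deletion pass over one leaf page. */
static void
simple_delete_sketch(Relation rel, Buffer buf, Relation heapRel)
{
    TM_IndexDeleteOp delstate;

    delstate.bottomup = false;          /* simple, not bottom-up, deletion */
    delstate.bottomupfreespace = 0;     /* only meaningful for bottom-up */
    delstate.ndeltids = 0;
    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    /* ... add one deltids[n]/status[n] entry per candidate index tuple ... */

    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);

    pfree(delstate.deltids);
    pfree(delstate.status);
}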

◆ _bt_delitems_update()

static char * _bt_delitems_update ( BTVacuumPosting updatable,
int  nupdatable,
OffsetNumber updatedoffsets,
Size updatedbuflen,
bool  needswal 
)
static

Definition at line 1415 of file nbtpage.c.

1418 {
1419  char *updatedbuf = NULL;
1420  Size buflen = 0;
1421 
1422  /* Shouldn't be called unless there's something to do */
1423  Assert(nupdatable > 0);
1424 
1425  for (int i = 0; i < nupdatable; i++)
1426  {
1427  BTVacuumPosting vacposting = updatable[i];
1428  Size itemsz;
1429 
1430  /* Replace work area IndexTuple with updated version */
1431  _bt_update_posting(vacposting);
1432 
1433  /* Keep track of size of xl_btree_update for updatedbuf in passing */
1434  itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1435  buflen += itemsz;
1436 
1437  /* Build updatedoffsets buffer in passing */
1438  updatedoffsets[i] = vacposting->updatedoffset;
1439  }
1440 
1441  /* XLOG stuff */
1442  if (needswal)
1443  {
1444  Size offset = 0;
1445 
1446  /* Allocate, set final size for caller */
1447  updatedbuf = palloc(buflen);
1448  *updatedbuflen = buflen;
1449  for (int i = 0; i < nupdatable; i++)
1450  {
1451  BTVacuumPosting vacposting = updatable[i];
1452  Size itemsz;
1453  xl_btree_update update;
1454 
1455  update.ndeletedtids = vacposting->ndeletedtids;
1456  memcpy(updatedbuf + offset, &update.ndeletedtids,
1457  SizeOfBtreeUpdate);
1458  offset += SizeOfBtreeUpdate;
1459 
1460  itemsz = update.ndeletedtids * sizeof(uint16);
1461  memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1462  offset += itemsz;
1463  }
1464  }
1465 
1466  return updatedbuf;
1467 }
void _bt_update_posting(BTVacuumPosting vacposting)
Definition: nbtdedup.c:920
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:256
uint16 ndeletedtids
Definition: nbtxlog.h:251

References _bt_update_posting(), Assert(), BTVacuumPostingData::deletetids, i, BTVacuumPostingData::ndeletedtids, xl_btree_update::ndeletedtids, palloc(), SizeOfBtreeUpdate, and BTVacuumPostingData::updatedoffset.

Referenced by _bt_delitems_delete(), and _bt_delitems_vacuum().
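
The serialized buffer interleaves a fixed-size header with a variable-length offset array for each updated posting list tuple. A sketch of the resulting layout, using the names from nbtxlog.h:

/*
 * updatedbuf layout, one group per entry in updatable[]:
 *
 *   xl_btree_update header (ndeletedtids)   -- SizeOfBtreeUpdate bytes
 *   uint16 deletetids[ndeletedtids]         -- indexes into the posting list
 *   ... next xl_btree_update group ...
 */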

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1166 of file nbtpage.c.

1169 {
1170  Page page = BufferGetPage(buf);
1171  BTPageOpaque opaque;
1172  bool needswal = RelationNeedsWAL(rel);
1173  char *updatedbuf = NULL;
1174  Size updatedbuflen = 0;
1175  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1176 
1177  /* Shouldn't be called unless there's something to do */
1178  Assert(ndeletable > 0 || nupdatable > 0);
1179 
1180  /* Generate new version of posting lists without deleted TIDs */
1181  if (nupdatable > 0)
1182  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1183  updatedoffsets, &updatedbuflen,
1184  needswal);
1185 
1186  /* No ereport(ERROR) until changes are logged */
1187  START_CRIT_SECTION();
1188 
1189  /*
1190  * Handle posting tuple updates.
1191  *
1192  * Deliberately do this before handling simple deletes. If we did it the
1193  * other way around (i.e. WAL record order -- simple deletes before
1194  * updates) then we'd have to make compensating changes to the 'updatable'
1195  * array of offset numbers.
1196  *
1197  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1198  * happens to already be set. It's important that we not interfere with
1199  * any future simple index tuple deletion operations.
1200  */
1201  for (int i = 0; i < nupdatable; i++)
1202  {
1203  OffsetNumber updatedoffset = updatedoffsets[i];
1204  IndexTuple itup;
1205  Size itemsz;
1206 
1207  itup = updatable[i]->itup;
1208  itemsz = MAXALIGN(IndexTupleSize(itup));
1209  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1210  itemsz))
1211  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1212  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1213  }
1214 
1215  /* Now handle simple deletes of entire tuples */
1216  if (ndeletable > 0)
1217  PageIndexMultiDelete(page, deletable, ndeletable);
1218 
1219  /*
1220  * We can clear the vacuum cycle ID since this page has certainly been
1221  * processed by the current vacuum scan.
1222  */
1223  opaque = BTPageGetOpaque(page);
1224  opaque->btpo_cycleid = 0;
1225 
1226  /*
1227  * Clear the BTP_HAS_GARBAGE page flag.
1228  *
1229  * This flag indicates the presence of LP_DEAD items on the page (though
1230  * not reliably). Note that we only rely on it with pg_upgrade'd
1231  * !heapkeyspace indexes. That's why clearing it here won't usually
1232  * interfere with simple index tuple deletion.
1233  */
1234  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1235 
1236  MarkBufferDirty(buf);
1237 
1238  /* XLOG stuff */
1239  if (needswal)
1240  {
1241  XLogRecPtr recptr;
1242  xl_btree_vacuum xlrec_vacuum;
1243 
1244  xlrec_vacuum.ndeleted = ndeletable;
1245  xlrec_vacuum.nupdated = nupdatable;
1246 
1247  XLogBeginInsert();
1248  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1249  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1250 
1251  if (ndeletable > 0)
1252  XLogRegisterBufData(0, (char *) deletable,
1253  ndeletable * sizeof(OffsetNumber));
1254 
1255  if (nupdatable > 0)
1256  {
1257  XLogRegisterBufData(0, (char *) updatedoffsets,
1258  nupdatable * sizeof(OffsetNumber));
1259  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1260  }
1261 
1262  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1263 
1264  PageSetLSN(page, recptr);
1265  }
1266 
1267  END_CRIT_SECTION();
1268 
1269  /* can't leak memory here */
1270  if (updatedbuf != NULL)
1271  pfree(updatedbuf);
1272  /* free tuples allocated within _bt_delitems_update() */
1273  for (int i = 0; i < nupdatable; i++)
1274  pfree(updatable[i]->itup);
1275 }
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:228
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:220
uint16 nupdated
Definition: nbtxlog.h:221

References _bt_delitems_update(), Assert(), BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN, PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
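
A minimal sketch of a call, assuming the caller (like btvacuumpage()) has already collected the dead items in ascending offset order, as PageIndexMultiDelete() requires; the wrapper name is hypothetical:

#include "postgres.h"
#include "access/nbtree.h"

/* Sketch: physically delete two known-dead items during a VACUUM scan. */
static void
vacuum_two_items(Relation rel, Buffer buf,
                 OffsetNumber off1, OffsetNumber off2)
{
    OffsetNumber deletable[2];

    deletable[0] = off1;        /* offsets must be in ascending order */
    deletable[1] = off2;

    /* no posting list tuples need rewriting, so updatable is empty */
    _bt_delitems_vacuum(rel, buf, deletable, 2, NULL, 0);
}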

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 871 of file nbtpage.c.

872 {
873  Buffer buf;
874 
875  if (blkno != P_NEW)
876  {
877  /* Read an existing block of the relation */
878  buf = ReadBuffer(rel, blkno);
879  _bt_lockbuf(rel, buf, access);
880  _bt_checkpage(rel, buf);
881  }
882  else
883  {
884  bool needLock;
885  Page page;
886 
887  Assert(access == BT_WRITE);
888 
889  /*
890  * First see if the FSM knows of any free pages.
891  *
892  * We can't trust the FSM's report unreservedly; we have to check that
893  * the page is still free. (For example, an already-free page could
894  * have been re-used between the time the last VACUUM scanned it and
895  * the time the VACUUM made its FSM updates.)
896  *
897  * In fact, it's worse than that: we can't even assume that it's safe
898  * to take a lock on the reported page. If somebody else has a lock
899  * on it, or even worse our own caller does, we could deadlock. (The
900  * own-caller scenario is actually not improbable. Consider an index
901  * on a serial or timestamp column. Nearly all splits will be at the
902  * rightmost page, so it's entirely likely that _bt_split will call us
903  * while holding a lock on the page most recently acquired from FSM. A
904  * VACUUM running concurrently with the previous split could well have
905  * placed that page back in FSM.)
906  *
907  * To get around that, we ask for only a conditional lock on the
908  * reported page. If we fail, then someone else is using the page,
909  * and we may reasonably assume it's not free. (If we happen to be
910  * wrong, the worst consequence is the page will be lost to use till
911  * the next VACUUM, which is no big problem.)
912  */
913  for (;;)
914  {
915  blkno = GetFreeIndexPage(rel);
916  if (blkno == InvalidBlockNumber)
917  break;
918  buf = ReadBuffer(rel, blkno);
919  if (_bt_conditionallockbuf(rel, buf))
920  {
921  page = BufferGetPage(buf);
922 
923  /*
924  * It's possible to find an all-zeroes page in an index. For
925  * example, a backend might successfully extend the relation
926  * one page and then crash before it is able to make a WAL
927  * entry for adding the page. If we find a zeroed page then
928  * reclaim it immediately.
929  */
930  if (PageIsNew(page))
931  {
932  /* Okay to use page. Initialize and return it. */
933  _bt_pageinit(page, BufferGetPageSize(buf));
934  return buf;
935  }
936 
937  if (BTPageIsRecyclable(page))
938  {
939  /*
940  * If we are generating WAL for Hot Standby then create a
941  * WAL record that will allow us to conflict with queries
942  * running on standby, in case they have snapshots older
943  * than safexid value
944  */
945  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
946  _bt_log_reuse_page(rel, blkno,
947  BTPageGetDeleteXid(page));
948 
949  /* Okay to use page. Re-initialize and return it. */
950  _bt_pageinit(page, BufferGetPageSize(buf));
951  return buf;
952  }
953  elog(DEBUG2, "FSM returned nonrecyclable page");
954  _bt_relbuf(rel, buf);
955  }
956  else
957  {
958  elog(DEBUG2, "FSM returned nonlockable page");
959  /* couldn't get lock, so just drop pin */
960  ReleaseBuffer(buf);
961  }
962  }
963 
964  /*
965  * Extend the relation by one page.
966  *
967  * We have to use a lock to ensure no one else is extending the rel at
968  * the same time, else we will both try to initialize the same new
969  * page. We can skip locking for new or temp relations, however,
970  * since no one else could be accessing them.
971  */
972  needLock = !RELATION_IS_LOCAL(rel);
973 
974  if (needLock)
975  LockRelationForExtension(rel, ExclusiveLock);
976 
977  buf = ReadBuffer(rel, P_NEW);
978 
979  /* Acquire buffer lock on new page */
980  _bt_lockbuf(rel, buf, BT_WRITE);
981 
982  /*
983  * Release the file-extension lock; it's now OK for someone else to
984  * extend the relation some more. Note that we cannot release this
985  * lock before we have buffer lock on the new page, or we risk a race
986  * condition against btvacuumscan --- see comments therein.
987  */
988  if (needLock)
989  UnlockRelationForExtension(rel, ExclusiveLock);
990 
991  /* Initialize the new page before returning it */
992  page = BufferGetPage(buf);
993  Assert(PageIsNew(page));
994  _bt_pageinit(page, BufferGetPageSize(buf));
995  }
996 
997  /* ref count and lock type are correct */
998  return buf;
999 }
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3915
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:702
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:156
#define P_NEW
Definition: bufmgr.h:91
#define DEBUG2
Definition: elog.h:23
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:431
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:481
#define ExclusiveLock
Definition: lockdefs.h:42
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1035
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1141
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:794
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1105
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1051
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
Definition: nbtpage.c:828
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:261
static bool BTPageIsRecyclable(Page page)
Definition: nbtree.h:292
#define BT_WRITE
Definition: nbtree.h:715
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:633

References _bt_checkpage(), _bt_conditionallockbuf(), _bt_lockbuf(), _bt_log_reuse_page(), _bt_pageinit(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage, BufferGetPageSize, DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockRelationForExtension(), P_NEW, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_metaversion(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().
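
A sketch of the two calling modes; the wrapper name is hypothetical:

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Sketch: read an existing block, then allocate a brand-new page. */
static void
getbuf_modes(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Buffer      newbuf;

    /* existing block: returned pinned, locked, and checked by _bt_checkpage() */
    buf = _bt_getbuf(rel, blkno, BT_READ);
    _bt_relbuf(rel, buf);

    /* P_NEW: recycled or newly extended page, write-locked and initialized */
    newbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    /* ... set up and WAL-log the new page before releasing it ... */
    _bt_relbuf(rel, newbuf);
}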

◆ _bt_getmeta()

static BTMetaPageData * _bt_getmeta ( Relation  rel,
Buffer  metabuf 
)
static

Definition at line 144 of file nbtpage.c.

145 {
146  Page metapg;
147  BTPageOpaque metaopaque;
148  BTMetaPageData *metad;
149 
150  metapg = BufferGetPage(metabuf);
151  metaopaque = BTPageGetOpaque(metapg);
152  metad = BTPageGetMeta(metapg);
153 
154  /* sanity-check the metapage */
155  if (!P_ISMETA(metaopaque) ||
156  metad->btm_magic != BTREE_MAGIC)
157  ereport(ERROR,
158  (errcode(ERRCODE_INDEX_CORRUPTED),
159  errmsg("index \"%s\" is not a btree",
160  RelationGetRelationName(rel))));
161 
162  if (metad->btm_version < BTREE_MIN_VERSION ||
163  metad->btm_version > BTREE_VERSION)
164  ereport(ERROR,
165  (errcode(ERRCODE_INDEX_CORRUPTED),
166  errmsg("version mismatch in index \"%s\": file version %d, "
167  "current version %d, minimal supported version %d",
168  RelationGetRelationName(rel),
169  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
170 
171  return metad;
172 }
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define P_ISMETA(opaque)
Definition: nbtree.h:224
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTREE_VERSION
Definition: nbtree.h:150
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105

References BTMetaPageData::btm_magic, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTREE_MAGIC, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage, ereport, errcode(), errmsg(), ERROR, P_ISMETA, and RelationGetRelationName.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_metaversion().
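
Since _bt_getmeta() is static, the pattern below is written as if it lived inside nbtpage.c; the returned pointer is only valid while the metapage buffer stays pinned and locked, as _bt_getrootheight() illustrates. A minimal sketch:

#include "postgres.h"
#include "access/nbtree.h"

/* Sketch: copy one metapage field out while the share lock is held. */
static uint32
read_btree_level(Relation rel)
{
    Buffer      metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    BTMetaPageData *metad = _bt_getmeta(rel, metabuf);
    uint32      level = metad->btm_level;   /* copy out while locked */

    _bt_relbuf(rel, metabuf);               /* metad dangles after this */
    return level;
}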

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
int  access 
)

Definition at line 343 of file nbtpage.c.

344 {
345  Buffer metabuf;
346  Buffer rootbuf;
347  Page rootpage;
348  BTPageOpaque rootopaque;
349  BlockNumber rootblkno;
350  uint32 rootlevel;
351  BTMetaPageData *metad;
352 
353  /*
354  * Try to use previously-cached metapage data to find the root. This
355  * normally saves one buffer access per index search, which is a very
356  * helpful savings in bufmgr traffic and hence contention.
357  */
358  if (rel->rd_amcache != NULL)
359  {
360  metad = (BTMetaPageData *) rel->rd_amcache;
361  /* We shouldn't have cached it if any of these fail */
362  Assert(metad->btm_magic == BTREE_MAGIC);
363  Assert(metad->btm_version >= BTREE_MIN_VERSION);
364  Assert(metad->btm_version <= BTREE_VERSION);
365  Assert(!metad->btm_allequalimage ||
366  metad->btm_version > BTREE_NOVAC_VERSION);
367  Assert(metad->btm_root != P_NONE);
368 
369  rootblkno = metad->btm_fastroot;
370  Assert(rootblkno != P_NONE);
371  rootlevel = metad->btm_fastlevel;
372 
373  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
374  rootpage = BufferGetPage(rootbuf);
375  rootopaque = BTPageGetOpaque(rootpage);
376 
377  /*
378  * Since the cache might be stale, we check the page more carefully
379  * here than normal. We *must* check that it's not deleted. If it's
380  * not alone on its level, then we reject too --- this may be overly
381  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
382  * because that's not set in a "fast root".
383  */
384  if (!P_IGNORE(rootopaque) &&
385  rootopaque->btpo_level == rootlevel &&
386  P_LEFTMOST(rootopaque) &&
387  P_RIGHTMOST(rootopaque))
388  {
389  /* OK, accept cached page as the root */
390  return rootbuf;
391  }
392  _bt_relbuf(rel, rootbuf);
393  /* Cache is stale, throw it away */
394  if (rel->rd_amcache)
395  pfree(rel->rd_amcache);
396  rel->rd_amcache = NULL;
397  }
398 
399  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
400  metad = _bt_getmeta(rel, metabuf);
401 
402  /* if no root page initialized yet, do it */
403  if (metad->btm_root == P_NONE)
404  {
405  Page metapg;
406 
407  /* If access = BT_READ, caller doesn't want us to create root yet */
408  if (access == BT_READ)
409  {
410  _bt_relbuf(rel, metabuf);
411  return InvalidBuffer;
412  }
413 
414  /* trade in our read lock for a write lock */
415  _bt_unlockbuf(rel, metabuf);
416  _bt_lockbuf(rel, metabuf, BT_WRITE);
417 
418  /*
419  * Race condition: if someone else initialized the metadata between
420  * the time we released the read lock and acquired the write lock, we
421  * must avoid doing it again.
422  */
423  if (metad->btm_root != P_NONE)
424  {
425  /*
426  * Metadata initialized by someone else. In order to guarantee no
427  * deadlocks, we have to release the metadata page and start all
428  * over again. (Is that really true? But it's hardly worth trying
429  * to optimize this case.)
430  */
431  _bt_relbuf(rel, metabuf);
432  return _bt_getroot(rel, access);
433  }
434 
435  /*
436  * Get, initialize, write, and leave a lock of the appropriate type on
437  * the new root page. Since this is the first page in the tree, it's
438  * a leaf as well as the root.
439  */
440  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
441  rootblkno = BufferGetBlockNumber(rootbuf);
442  rootpage = BufferGetPage(rootbuf);
443  rootopaque = BTPageGetOpaque(rootpage);
444  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
445  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
446  rootopaque->btpo_level = 0;
447  rootopaque->btpo_cycleid = 0;
448  /* Get raw page pointer for metapage */
449  metapg = BufferGetPage(metabuf);
450 
451  /* NO ELOG(ERROR) till meta is updated */
452  START_CRIT_SECTION();
453 
454  /* upgrade metapage if needed */
455  if (metad->btm_version < BTREE_NOVAC_VERSION)
456  _bt_upgrademetapage(metapg);
457 
458  metad->btm_root = rootblkno;
459  metad->btm_level = 0;
460  metad->btm_fastroot = rootblkno;
461  metad->btm_fastlevel = 0;
462  metad->btm_last_cleanup_num_delpages = 0;
463  metad->btm_last_cleanup_num_heap_tuples = -1.0;
464 
465  MarkBufferDirty(rootbuf);
466  MarkBufferDirty(metabuf);
467 
468  /* XLOG stuff */
469  if (RelationNeedsWAL(rel))
470  {
471  xl_btree_newroot xlrec;
472  XLogRecPtr recptr;
473  xl_btree_metadata md;
474 
475  XLogBeginInsert();
476  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
477  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
478 
479  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
480  md.version = metad->btm_version;
481  md.root = rootblkno;
482  md.level = 0;
483  md.fastroot = rootblkno;
484  md.fastlevel = 0;
485  md.last_cleanup_num_delpages = 0;
486  md.allequalimage = metad->btm_allequalimage;
487 
488  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
489 
490  xlrec.rootblk = rootblkno;
491  xlrec.level = 0;
492 
493  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
494 
495  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
496 
497  PageSetLSN(rootpage, recptr);
498  PageSetLSN(metapg, recptr);
499  }
500 
501  END_CRIT_SECTION();
502 
503  /*
504  * swap root write lock for read lock. There is no danger of anyone
505  * else accessing the new root page while it's unlocked, since no one
506  * else knows where it is yet.
507  */
508  _bt_unlockbuf(rel, rootbuf);
509  _bt_lockbuf(rel, rootbuf, BT_READ);
510 
511  /* okay, metadata is correct, release lock on it without caching */
512  _bt_relbuf(rel, metabuf);
513  }
514  else
515  {
516  rootblkno = metad->btm_fastroot;
517  Assert(rootblkno != P_NONE);
518  rootlevel = metad->btm_fastlevel;
519 
520  /*
521  * Cache the metapage data for next time
522  */
523  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
524  sizeof(BTMetaPageData));
525  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
526 
527  /*
528  * We are done with the metapage; arrange to release it via first
529  * _bt_relandgetbuf call
530  */
531  rootbuf = metabuf;
532 
533  for (;;)
534  {
535  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
536  rootpage = BufferGetPage(rootbuf);
537  rootopaque = BTPageGetOpaque(rootpage);
538 
539  if (!P_IGNORE(rootopaque))
540  break;
541 
542  /* it's dead, Jim. step right one page */
543  if (P_RIGHTMOST(rootopaque))
544  elog(ERROR, "no live root page found in index \"%s\"",
545  RelationGetRelationName(rel));
546  rootblkno = rootopaque->btpo_next;
547  }
548 
549  if (rootopaque->btpo_level != rootlevel)
550  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
551  rootblkno, RelationGetRelationName(rel),
552  rootopaque->btpo_level, rootlevel);
553  }
554 
555  /*
556  * By here, we have a pin and read lock on the root page, and no lock set
557  * on the metadata page. Return the root page's buffer.
558  */
559  return rootbuf;
560 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBuffer
Definition: buf.h:25
unsigned int uint32
Definition: c.h:441
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:863
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1015
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:109
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:343
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:144
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:871
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1082
#define BTP_LEAF
Definition: nbtree.h:76
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define BTP_ROOT
Definition: nbtree.h:77
#define P_NONE
Definition: nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_READ
Definition: nbtree.h:714
#define P_IGNORE(opaque)
Definition: nbtree.h:226
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:335
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
BlockNumber btm_root
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint32 btpo_level
Definition: nbtree.h:66
void * rd_amcache
Definition: rel.h:225
MemoryContext rd_indexcxt
Definition: rel.h:200
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:332
BlockNumber rootblk
Definition: nbtxlog.h:331
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33

References _bt_getbuf(), _bt_getmeta(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageSetLSN, pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), and _bt_search().
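
A sketch of the caller contract, following _bt_search() and _bt_get_endpoint(): with BT_READ the function may return InvalidBuffer for an empty index, while BT_WRITE creates the root on demand. The wrapper name is hypothetical:

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Sketch: start a descent from the fast root, handling the empty case. */
static Buffer
descend_from_root(Relation rel)
{
    Buffer      rootbuf = _bt_getroot(rel, BT_READ);

    if (!BufferIsValid(rootbuf))
        return InvalidBuffer;   /* index has no root page yet */

    /* rootbuf is pinned and read-locked; the metapage lock is already gone */
    return rootbuf;
}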

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 672 of file nbtpage.c.

673 {
674  BTMetaPageData *metad;
675 
676  if (rel->rd_amcache == NULL)
677  {
678  Buffer metabuf;
679 
680  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
681  metad = _bt_getmeta(rel, metabuf);
682 
683  /*
684  * If there's no root page yet, _bt_getroot() doesn't expect a cache
685  * to be made, so just stop here and report the index height is zero.
686  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
687  */
688  if (metad->btm_root == P_NONE)
689  {
690  _bt_relbuf(rel, metabuf);
691  return 0;
692  }
693 
694  /*
695  * Cache the metapage data for next time
696  */
697  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
698  sizeof(BTMetaPageData));
699  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
700  _bt_relbuf(rel, metabuf);
701  }
702 
703  /* Get cached page */
704  metad = (BTMetaPageData *) rel->rd_amcache;
705  /* We shouldn't have cached it if any of these fail */
706  Assert(metad->btm_magic == BTREE_MAGIC);
707  Assert(metad->btm_version >= BTREE_MIN_VERSION);
708  Assert(metad->btm_version <= BTREE_VERSION);
709  Assert(!metad->btm_allequalimage ||
710  metad->btm_version > BTREE_NOVAC_VERSION);
711  Assert(metad->btm_fastroot != P_NONE);
712 
713  return metad->btm_fastlevel;
714 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and get_relation_info().
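
A minimal sketch of a lookup, as the planner's get_relation_info() uses it when costing index descents (the wrapper name is hypothetical):

#include "postgres.h"
#include "access/nbtree.h"

/* Sketch: cached height lookup; only the first call may read the metapage. */
static int
btree_height(Relation indexrel)
{
    /* returns the fast-root level; 0 for an empty or single-page index */
    return _bt_getrootheight(indexrel);
}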

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 577 of file nbtpage.c.

578 {
579  Buffer metabuf;
580  Page metapg;
581  BTPageOpaque metaopaque;
582  Buffer rootbuf;
583  Page rootpage;
584  BTPageOpaque rootopaque;
585  BlockNumber rootblkno;
586  uint32 rootlevel;
587  BTMetaPageData *metad;
588 
589  /*
590  * We don't try to use cached metapage data here, since (a) this path is
591  * not performance-critical, and (b) if we are here it suggests our cache
592  * is out-of-date anyway. In light of point (b), it's probably safest to
593  * actively flush any cached metapage info.
594  */
595  if (rel->rd_amcache)
596  pfree(rel->rd_amcache);
597  rel->rd_amcache = NULL;
598 
599  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
600  metapg = BufferGetPage(metabuf);
601  metaopaque = BTPageGetOpaque(metapg);
602  metad = BTPageGetMeta(metapg);
603 
604  if (!P_ISMETA(metaopaque) ||
605  metad->btm_magic != BTREE_MAGIC)
606  ereport(ERROR,
607  (errcode(ERRCODE_INDEX_CORRUPTED),
608  errmsg("index \"%s\" is not a btree",
609  RelationGetRelationName(rel))));
610 
611  if (metad->btm_version < BTREE_MIN_VERSION ||
612  metad->btm_version > BTREE_VERSION)
613  ereport(ERROR,
614  (errcode(ERRCODE_INDEX_CORRUPTED),
615  errmsg("version mismatch in index \"%s\": file version %d, "
616  "current version %d, minimal supported version %d",
617  RelationGetRelationName(rel),
618  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
619 
620  /* if no root page initialized yet, fail */
621  if (metad->btm_root == P_NONE)
622  {
623  _bt_relbuf(rel, metabuf);
624  return InvalidBuffer;
625  }
626 
627  rootblkno = metad->btm_root;
628  rootlevel = metad->btm_level;
629 
630  /*
631  * We are done with the metapage; arrange to release it via first
632  * _bt_relandgetbuf call
633  */
634  rootbuf = metabuf;
635 
636  for (;;)
637  {
638  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
639  rootpage = BufferGetPage(rootbuf);
640  rootopaque = BTPageGetOpaque(rootpage);
641 
642  if (!P_IGNORE(rootopaque))
643  break;
644 
645  /* it's dead, Jim. step right one page */
646  if (P_RIGHTMOST(rootopaque))
647  elog(ERROR, "no live root page found in index \"%s\"",
648  RelationGetRelationName(rel));
649  rootblkno = rootopaque->btpo_next;
650  }
651 
652  if (rootopaque->btpo_level != rootlevel)
653  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
654  rootblkno, RelationGetRelationName(rel),
655  rootopaque->btpo_level, rootlevel);
656 
657  return rootbuf;
658 }

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage, elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 69 of file nbtpage.c.

71 {
72  BTMetaPageData *metad;
73  BTPageOpaque metaopaque;
74 
75  _bt_pageinit(page, BLCKSZ);
76 
77  metad = BTPageGetMeta(page);
78  metad->btm_magic = BTREE_MAGIC;
79  metad->btm_version = BTREE_VERSION;
80  metad->btm_root = rootbknum;
81  metad->btm_level = level;
82  metad->btm_fastroot = rootbknum;
83  metad->btm_fastlevel = level;
84  metad->btm_last_cleanup_num_delpages = 0;
85  metad->btm_last_cleanup_num_heap_tuples = -1.0;
86  metad->btm_allequalimage = allequalimage;
87 
88  metaopaque = BTPageGetOpaque(page);
89  metaopaque->btpo_flags = BTP_META;
90 
91  /*
92  * Set pd_lower just past the end of the metadata. This is essential,
93  * because without doing so, metadata will be lost if xlog.c compresses
94  * the page.
95  */
96  ((PageHeader) page)->pd_lower =
97  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
98 }
PageHeaderData * PageHeader
Definition: bufpage.h:166
#define BTP_META
Definition: nbtree.h:79

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
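
Both callers build the metapage image in private memory before writing it out. A sketch following btbuildempty(); passing true for allequalimage is an assumption here (the real caller derives it via _bt_allequalimage()):

#include "postgres.h"
#include "access/nbtree.h"

/* Sketch: set up an empty index's metapage image in local memory. */
static Page
build_empty_metapage(void)
{
    Page        metapage = (Page) palloc(BLCKSZ);

    /* no root yet (P_NONE), tree level 0; allequalimage assumed true */
    _bt_initmetapage(metapage, P_NONE, 0, true);
    return metapage;    /* caller writes and WAL-logs this image */
}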

◆ _bt_leftsib_splitflag()

static bool _bt_leftsib_splitflag ( Relation  rel,
BlockNumber  leftsib,
BlockNumber  target 
)
static

Definition at line 1708 of file nbtpage.c.

1709 {
1710  Buffer buf;
1711  Page page;
1712  BTPageOpaque opaque;
1713  bool result;
1714 
1715  /* Easy case: No left sibling */
1716  if (leftsib == P_NONE)
1717  return false;
1718 
1719  buf = _bt_getbuf(rel, leftsib, BT_READ);
1720  page = BufferGetPage(buf);
1721  opaque = BTPageGetOpaque(page);
1722 
1723  /*
1724  * If the left sibling was concurrently split, so that its next-pointer
1725  * doesn't point to the current page anymore, the split that created
1726  * target must be completed. Caller can reasonably expect that there will
1727  * be a downlink to the target page that it can relocate using its stack.
1728  * (We don't allow splitting an incompletely split page again until the
1729  * previous split has been completed.)
1730  */
1731  result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1732  _bt_relbuf(rel, buf);
1733 
1734  return result;
1735 }
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetPage, P_INCOMPLETE_SPLIT, and P_NONE.

Referenced by _bt_lock_subtree_parent(), and _bt_pagedel().

◆ _bt_lock_subtree_parent()

static bool _bt_lock_subtree_parent ( Relation  rel,
BlockNumber  child,
BTStack  stack,
Buffer subtreeparent,
OffsetNumber poffset,
BlockNumber topparent,
BlockNumber topparentrightsib 
)
static

Definition at line 2775 of file nbtpage.c.

2778 {
2779  BlockNumber parent,
2780  leftsibparent;
2781  OffsetNumber parentoffset,
2782  maxoff;
2783  Buffer pbuf;
2784  Page page;
2785  BTPageOpaque opaque;
2786 
2787  /*
2788  * Locate the pivot tuple whose downlink points to "child". Write lock
2789  * the parent page itself.
2790  */
2791  pbuf = _bt_getstackbuf(rel, stack, child);
2792  if (pbuf == InvalidBuffer)
2793  {
2794  /*
2795  * Failed to "re-find" a pivot tuple whose downlink matched our child
2796  * block number on the parent level -- the index must be corrupt.
2797  * Don't even try to delete the leafbuf subtree. Just report the
2798  * issue and press on with vacuuming the index.
2799  *
2800  * Note: _bt_getstackbuf() recovers from concurrent page splits that
2801  * take place on the parent level. Its approach is a near-exhaustive
2802  * linear search. This also gives it a surprisingly good chance of
2803  * recovering in the event of a buggy or inconsistent opclass. But we
2804  * don't rely on that here.
2805  */
2806  ereport(LOG,
2807  (errcode(ERRCODE_INDEX_CORRUPTED),
2808  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2809  RelationGetRelationName(rel), child)));
2810  return false;
2811  }
2812 
2813  parent = stack->bts_blkno;
2814  parentoffset = stack->bts_offset;
2815 
2816  page = BufferGetPage(pbuf);
2817  opaque = BTPageGetOpaque(page);
2818  maxoff = PageGetMaxOffsetNumber(page);
2819  leftsibparent = opaque->btpo_prev;
2820 
2821  /*
2822  * _bt_getstackbuf() completes page splits on returned parent buffer when
2823  * required.
2824  *
2825  * In general it's a bad idea for VACUUM to use up more disk space, which
2826  * is why page deletion does not finish incomplete page splits most of the
2827  * time. We allow this limited exception because the risk is much lower,
2828  * and the potential downside of not proceeding is much higher: A single
2829  * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2830  * prevent us from deleting hundreds of empty leaf pages from one level
2831  * down.
2832  */
2833  Assert(!P_INCOMPLETE_SPLIT(opaque));
2834 
2835  if (parentoffset < maxoff)
2836  {
2837  /*
2838  * Child is not the rightmost child in parent, so it's safe to delete
2839  * the subtree whose root/topparent is child page
2840  */
2841  *subtreeparent = pbuf;
2842  *poffset = parentoffset;
2843  return true;
2844  }
2845 
2846  /*
2847  * Child is the rightmost child of parent.
2848  *
2849  * Since it's the rightmost child of parent, deleting the child (or
2850  * deleting the subtree whose root/topparent is the child page) is only
2851  * safe when it's also possible to delete the parent.
2852  */
2853  Assert(parentoffset == maxoff);
2854  if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2855  {
2856  /*
2857  * Child isn't parent's only child, or parent is rightmost on its
2858  * entire level. Definitely cannot delete any pages.
2859  */
2860  _bt_relbuf(rel, pbuf);
2861  return false;
2862  }
2863 
2864  /*
2865  * Now make sure that the parent deletion is itself safe by examining the
2866  * child's grandparent page. Recurse, passing the parent page as the
2867  * child page (child's grandparent is the parent on the next level up). If
2868  * parent deletion is unsafe, then child deletion must also be unsafe (in
2869  * which case caller cannot delete any pages at all).
2870  */
2871  *topparent = parent;
2872  *topparentrightsib = opaque->btpo_next;
2873 
2874  /*
2875  * Release lock on parent before recursing.
2876  *
2877  * It's OK to release page locks on parent before recursive call locks
2878  * grandparent. An internal page can only acquire an entry if the child
2879  * is split, but that cannot happen as long as we still hold a lock on the
2880  * leafbuf page.
2881  */
2882  _bt_relbuf(rel, pbuf);
2883 
2884  /*
2885  * Before recursing, check that the left sibling of parent (if any) is not
2886  * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2887  * parent lock).
2888  *
2889  * Note: We deliberately avoid completing incomplete splits here.
2890  */
2891  if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2892  return false;
2893 
2894  /* Recurse to examine child page's grandparent page */
2895  return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2896  subtreeparent, poffset,
2897  topparent, topparentrightsib);
2898 }
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
int errmsg_internal(const char *fmt,...)
Definition: elog.c:991
#define LOG
Definition: elog.h:25
Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2307
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1708
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
Definition: nbtpage.c:2775
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:371
BlockNumber bts_blkno
Definition: nbtree.h:729
struct BTStackData * bts_parent
Definition: nbtree.h:731
OffsetNumber bts_offset
Definition: nbtree.h:730

References _bt_getstackbuf(), _bt_leftsib_splitflag(), _bt_relbuf(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetPage, ereport, errcode(), errmsg_internal(), InvalidBuffer, LOG, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetMaxOffsetNumber, and RelationGetRelationName.

Referenced by _bt_mark_page_halfdead().
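
A minimal caller sketch, mirroring the _bt_mark_page_halfdead() call site documented below (all names come from that function): topparent and topparentrightsib start out describing the leaf page itself, and the recursion moves them up the tree when an entire subtree must be deleted.

    BlockNumber  topparent = leafblkno;        /* subtree root starts as the leaf */
    BlockNumber  topparentrightsib = leafrightsib;
    Buffer       subtreeparent;
    OffsetNumber poffset;

    if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
                                 &subtreeparent, &poffset,
                                 &topparent, &topparentrightsib))
        return false;                          /* deletion is unsafe; give up */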

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1051 of file nbtpage.c.

1052 {
1053  /* LockBuffer() asserts that pin is held by this backend */
1054  LockBuffer(buf, access);
1055 
1056  /*
1057  * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1058  * an nbtree error (e.g. a unique violation error). That won't cause
1059  * Valgrind false positives.
1060  *
1061  * The nbtree client requests are superimposed on top of the bufmgr.c
1062  * buffer pin client requests. In the event of an nbtree error the buffer
1063  * will certainly get marked as defined when the backend once again
1064  * acquires its first pin on the buffer. (Of course, if the backend never
1065  * touches the buffer again then it doesn't matter that it remains
1066  * non-accessible to Valgrind.)
1067  *
1068  * Note: When an IndexTuple C pointer gets computed using an ItemId read
1069  * from a page while a lock was held, the C pointer becomes unsafe to
1070  * dereference forever as soon as the lock is released. Valgrind can only
1071  * detect cases where the pointer gets dereferenced with no _current_
1072  * lock/pin held, though.
1073  */
1074  if (!RelationUsesLocalBuffers(rel))
1075  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1076 }
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4156

References buf, BufferGetPage, LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readnextpage(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().
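
Combined with _bt_getbuf() and _bt_relbuf() (both documented on this page), the usual access pattern is, as a sketch:

    Buffer  buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin plus share lock */
    Page    page = BufferGetPage(buf);

    /* ... examine the page while the lock is held ... */

    _bt_relbuf(rel, buf);                           /* drop lock and pin together */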

◆ _bt_log_reuse_page()

static void _bt_log_reuse_page ( Relation  rel,
BlockNumber  blkno,
FullTransactionId  safexid 
)
static

Definition at line 828 of file nbtpage.c.

829 {
830  xl_btree_reuse_page xlrec_reuse;
831 
832  /*
833  * Note that we don't register the buffer with the record, because this
834  * operation doesn't modify the page. This record only exists to provide a
835  * conflict point for Hot Standby.
836  */
837 
838  /* XLOG stuff */
839  xlrec_reuse.node = rel->rd_node;
840  xlrec_reuse.block = blkno;
841  xlrec_reuse.latestRemovedFullXid = safexid;
842 
843  XLogBeginInsert();
844  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
845 
846  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
847 }
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:190
RelFileNode rd_node
Definition: rel.h:56
BlockNumber block
Definition: nbtxlog.h:186
RelFileNode node
Definition: nbtxlog.h:185
FullTransactionId latestRemovedFullXid
Definition: nbtxlog.h:187

References xl_btree_reuse_page::block, xl_btree_reuse_page::latestRemovedFullXid, xl_btree_reuse_page::node, RelationData::rd_node, SizeOfBtreeReusePage, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), and XLogRegisterData().

Referenced by _bt_getbuf().

◆ _bt_mark_page_halfdead()

static bool _bt_mark_page_halfdead ( Relation  rel,
Buffer  leafbuf,
BTStack  stack 
)
static

Definition at line 2086 of file nbtpage.c.

2087 {
2088  BlockNumber leafblkno;
2089  BlockNumber leafrightsib;
2090  BlockNumber topparent;
2091  BlockNumber topparentrightsib;
2092  ItemId itemid;
2093  Page page;
2094  BTPageOpaque opaque;
2095  Buffer subtreeparent;
2096  OffsetNumber poffset;
2097  OffsetNumber nextoffset;
2098  IndexTuple itup;
2099  IndexTupleData trunctuple;
2100 
2101  page = BufferGetPage(leafbuf);
2102  opaque = BTPageGetOpaque(page);
2103 
2104  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2105  P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2106  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2107 
2108  /*
2109  * Save info about the leaf page.
2110  */
2111  leafblkno = BufferGetBlockNumber(leafbuf);
2112  leafrightsib = opaque->btpo_next;
2113 
2114  /*
2115  * Before attempting to lock the parent page, check that the right sibling
2116  * is not in half-dead state. A half-dead right sibling would have no
2117  * downlink in the parent, which would be highly confusing later when we
2118  * delete the downlink. It would fail the "right sibling of target page
2119  * is also the next child in parent page" cross-check below.
2120  */
2121  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2122  {
2123  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2124  leafblkno, leafrightsib);
2125  return false;
2126  }
2127 
2128  /*
2129  * We cannot delete a page that is the rightmost child of its immediate
2130  * parent, unless it is the only child --- in which case the parent has to
2131  * be deleted too, and the same condition applies recursively to it. We
2132  * have to check this condition all the way up before trying to delete,
2133  * and lock the parent of the root of the to-be-deleted subtree (the
2134  * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2135  * for us. We remove the downlink to the "top parent" page (subtree root
2136  * page) from the subtree parent page below.
2137  *
2138  * Initialize topparent to be leafbuf page now. The final to-be-deleted
2139  * subtree is often a degenerate one page subtree consisting only of the
2140  * leafbuf page. When that happens, the leafbuf page is the final subtree
2141  * root page/top parent page.
2142  */
2143  topparent = leafblkno;
2144  topparentrightsib = leafrightsib;
2145  if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
2146  &subtreeparent, &poffset,
2147  &topparent, &topparentrightsib))
2148  return false;
2149 
2150  /*
2151  * Check that the parent-page index items we're about to delete/overwrite
2152  * in subtree parent page contain what we expect. This can fail if the
2153  * index has become corrupt for some reason. We want to throw any error
2154  * before entering the critical section --- otherwise it'd be a PANIC.
2155  */
2156  page = BufferGetPage(subtreeparent);
2157  opaque = BTPageGetOpaque(page);
2158 
2159 #ifdef USE_ASSERT_CHECKING
2160 
2161  /*
2162  * This is just an assertion because _bt_lock_subtree_parent should have
2163  * guaranteed tuple has the expected contents
2164  */
2165  itemid = PageGetItemId(page, poffset);
2166  itup = (IndexTuple) PageGetItem(page, itemid);
2167  Assert(BTreeTupleGetDownLink(itup) == topparent);
2168 #endif
2169 
2170  nextoffset = OffsetNumberNext(poffset);
2171  itemid = PageGetItemId(page, nextoffset);
2172  itup = (IndexTuple) PageGetItem(page, itemid);
2173  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2174  ereport(ERROR,
2175  (errcode(ERRCODE_INDEX_CORRUPTED),
2176  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2177  topparentrightsib, topparent,
2178  BTreeTupleGetDownLink(itup),
2179  BufferGetBlockNumber(subtreeparent),
2180  RelationGetRelationName(rel))));
2181 
2182  /*
2183  * Any insert which would have gone on the leaf block will now go to its
2184  * right sibling. In other words, the key space moves right.
2185  */
2186  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2187 
2188  /* No ereport(ERROR) until changes are logged */
2189  START_CRIT_SECTION();
2190 
2191  /*
2192  * Update parent of subtree. We want to delete the downlink to the top
2193  * parent page/root of the subtree, and the *following* key. Easiest way
2194  * is to copy the right sibling's downlink over the downlink that points
2195  * to top parent page, and then delete the right sibling's original pivot
2196  * tuple.
2197  *
2198  * Lanin and Shasha make the key space move left when deleting a page,
2199  * whereas the key space moves right here. That's why we cannot simply
2200  * delete the pivot tuple with the downlink to the top parent page. See
2201  * nbtree/README.
2202  */
2203  page = BufferGetPage(subtreeparent);
2204  opaque = BTPageGetOpaque(page);
2205 
2206  itemid = PageGetItemId(page, poffset);
2207  itup = (IndexTuple) PageGetItem(page, itemid);
2208  BTreeTupleSetDownLink(itup, topparentrightsib);
2209 
2210  nextoffset = OffsetNumberNext(poffset);
2211  PageIndexTupleDelete(page, nextoffset);
2212 
2213  /*
2214  * Mark the leaf page as half-dead, and stamp it with a link to the top
2215  * parent page. When the leaf page is also the top parent page, the link
2216  * is set to InvalidBlockNumber.
2217  */
2218  page = BufferGetPage(leafbuf);
2219  opaque = BTPageGetOpaque(page);
2220  opaque->btpo_flags |= BTP_HALF_DEAD;
2221 
2222  Assert(PageGetMaxOffsetNumber(page) == 1);
2223  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2224  trunctuple.t_info = sizeof(IndexTupleData);
2225  if (topparent != leafblkno)
2226  BTreeTupleSetTopParent(&trunctuple, topparent);
2227  else
2228  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2229 
2230  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2231  IndexTupleSize(&trunctuple)))
2232  elog(ERROR, "could not overwrite high key in half-dead page");
2233 
2234  /* Must mark buffers dirty before XLogInsert */
2235  MarkBufferDirty(subtreeparent);
2236  MarkBufferDirty(leafbuf);
2237 
2238  /* XLOG stuff */
2239  if (RelationNeedsWAL(rel))
2240  {
2241  xl_btree_mark_page_halfdead xlrec;
2242  XLogRecPtr recptr;
2243 
2244  xlrec.poffset = poffset;
2245  xlrec.leafblk = leafblkno;
2246  if (topparent != leafblkno)
2247  xlrec.topparent = topparent;
2248  else
2249  xlrec.topparent = InvalidBlockNumber;
2250 
2251  XLogBeginInsert();
2252  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2253  XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2254 
2255  page = BufferGetPage(leafbuf);
2256  opaque = BTPageGetOpaque(page);
2257  xlrec.leftblk = opaque->btpo_prev;
2258  xlrec.rightblk = opaque->btpo_next;
2259 
2260  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
2261 
2262  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2263 
2264  page = BufferGetPage(subtreeparent);
2265  PageSetLSN(page, recptr);
2266  page = BufferGetPage(leafbuf);
2267  PageSetLSN(page, recptr);
2268  }
2269 
2270  END_CRIT_SECTION();
2271 
2272  _bt_relbuf(rel, subtreeparent);
2273  return true;
2274 }
void PageIndexTupleDelete(Page page, OffsetNumber offnum)
Definition: bufpage.c:1052
#define MemSet(start, val, len)
Definition: c.h:1008
#define DEBUG1
Definition: elog.h:24
struct IndexTupleData IndexTupleData
static bool _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
Definition: nbtpage.c:1765
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTP_HALF_DEAD
Definition: nbtree.h:80
#define P_HIKEY
Definition: nbtree.h:369
static void BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
Definition: nbtree.h:621
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:557
#define P_ISROOT(opaque)
Definition: nbtree.h:222
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:551
#define SizeOfBtreeMarkPageHalfDead
Definition: nbtxlog.h:279
#define XLOG_BTREE_MARK_PAGE_HALFDEAD
Definition: nbtxlog.h:38
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3254
unsigned short t_info
Definition: itup.h:49

References _bt_lock_subtree_parent(), _bt_relbuf(), _bt_rightsib_halfdeadflag(), Assert(), BTP_HALF_DEAD, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), BTreeTupleSetDownLink(), BTreeTupleSetTopParent(), BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, IndexTupleSize, InvalidBlockNumber, xl_btree_mark_page_halfdead::leafblk, xl_btree_mark_page_halfdead::leftblk, MarkBufferDirty(), MemSet, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageIndexTupleDelete(), PageIndexTupleOverwrite(), PageSetLSN, xl_btree_mark_page_halfdead::poffset, PredicateLockPageCombine(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_mark_page_halfdead::rightblk, SizeOfBtreeMarkPageHalfDead, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_mark_page_halfdead::topparent, XLOG_BTREE_MARK_PAGE_HALFDEAD, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_pagedel().
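
Condensed from the _bt_pagedel() loop below, the two phases of deletion fit together roughly as follows (relocking and error handling elided):

    if (!P_ISHALFDEAD(opaque) &&
        !_bt_mark_page_halfdead(rel, leafbuf, stack))
    {
        _bt_relbuf(rel, leafbuf);       /* phase one refused; nothing deleted */
        return;
    }

    /* phase two: unlink the half-dead subtree, one level per call */
    while (P_ISHALFDEAD(opaque))
        _bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
                                 &rightsib_empty, vstate);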

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool *  heapkeyspace,
bool *  allequalimage 
)

Definition at line 736 of file nbtpage.c.

737 {
738  BTMetaPageData *metad;
739 
740  if (rel->rd_amcache == NULL)
741  {
742  Buffer metabuf;
743 
744  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
745  metad = _bt_getmeta(rel, metabuf);
746 
747  /*
748  * If there's no root page yet, _bt_getroot() doesn't expect a cache
749  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
750  * be changed to allow this case.)
751  */
752  if (metad->btm_root == P_NONE)
753  {
754  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
755  *allequalimage = metad->btm_allequalimage;
756 
757  _bt_relbuf(rel, metabuf);
758  return;
759  }
760 
761  /*
762  * Cache the metapage data for next time
763  *
764  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
765  * can change the nbtree version for an index without invalidating any
766  * local cache. This is okay because it can only happen when moving
767  * from version 2 to version 3, both of which are !heapkeyspace
768  * versions.
769  */
770  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
771  sizeof(BTMetaPageData));
772  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
773  _bt_relbuf(rel, metabuf);
774  }
775 
776  /* Get cached page */
777  metad = (BTMetaPageData *) rel->rd_amcache;
778  /* We shouldn't have cached it if any of these fail */
779  Assert(metad->btm_magic == BTREE_MAGIC);
780  Assert(metad->btm_version >= BTREE_MIN_VERSION);
781  Assert(metad->btm_version <= BTREE_VERSION);
782  Assert(!metad->btm_allequalimage ||
783  metad->btm_version > BTREE_NOVAC_VERSION);
784  Assert(metad->btm_fastroot != P_NONE);
785 
786  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
787  *allequalimage = metad->btm_allequalimage;
788 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_internal().
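
A sketch of how callers such as _bt_mkscankey() consume the two output flags:

    bool    heapkeyspace;
    bool    allequalimage;

    _bt_metaversion(rel, &heapkeyspace, &allequalimage);
    /* heapkeyspace: version 4+ index; heap TID acts as a tiebreaker key column */
    /* allequalimage: every column's opclass is safe for deduplication */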

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState *  vstate 
)

Definition at line 1815 of file nbtpage.c.

1816 {
1817  BlockNumber rightsib;
1818  bool rightsib_empty;
1819  Page page;
1820  BTPageOpaque opaque;
1821 
1822  /*
1823  * Save original leafbuf block number from caller. Only deleted blocks
1824  * that are <= scanblkno are added to bulk delete stat's pages_deleted
1825  * count.
1826  */
1827  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1828 
1829  /*
1830  * "stack" is a search stack leading (approximately) to the target page.
1831  * It is initially NULL, but when iterating, we keep it to avoid
1832  * duplicated search effort.
1833  *
1834  * Also, when "stack" is not NULL, we have already checked that the
1835  * current page is not the right half of an incomplete split, i.e. the
1836  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1837  * when the current target page is to the right of caller's initial page
1838  * (the scanblkno page).
1839  */
1840  BTStack stack = NULL;
1841 
1842  for (;;)
1843  {
1844  page = BufferGetPage(leafbuf);
1845  opaque = BTPageGetOpaque(page);
1846 
1847  /*
1848  * Internal pages are never deleted directly, only as part of deleting
1849  * the whole subtree all the way down to leaf level.
1850  *
1851  * Also check for deleted pages here. Caller never passes us a fully
1852  * deleted page. Only VACUUM can delete pages, so there can't have
1853  * been a concurrent deletion. Assume that we reached any deleted
1854  * page encountered here by following a sibling link, and that the
1855  * index is corrupt.
1856  */
1857  Assert(!P_ISDELETED(opaque));
1858  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1859  {
1860  /*
1861  * Pre-9.4 page deletion only marked internal pages as half-dead,
1862  * but now we only use that flag on leaf pages. The old algorithm
1863  * was never supposed to leave half-dead pages in the tree, it was
1864  * just a transient state, but it was nevertheless possible in
1865  * error scenarios. We don't know how to deal with them here. They
1866  * are harmless as far as searches are considered, but inserts
1867  * into the deleted keyspace could add out-of-order downlinks in
1868  * the upper levels. Log a notice, hopefully the admin will notice
1869  * and reindex.
1870  */
1871  if (P_ISHALFDEAD(opaque))
1872  ereport(LOG,
1873  (errcode(ERRCODE_INDEX_CORRUPTED),
1874  errmsg("index \"%s\" contains a half-dead internal page",
1875  RelationGetRelationName(rel)),
1876  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1877 
1878  if (P_ISDELETED(opaque))
1879  ereport(LOG,
1880  (errcode(ERRCODE_INDEX_CORRUPTED),
1881  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1882  BufferGetBlockNumber(leafbuf),
1883  scanblkno,
1884  RelationGetRelationName(rel))));
1885 
1886  _bt_relbuf(rel, leafbuf);
1887  return;
1888  }
1889 
1890  /*
1891  * We can never delete rightmost pages nor root pages. While at it,
1892  * check that page is empty, since it's possible that the leafbuf page
1893  * was empty a moment ago, but has since had some inserts.
1894  *
1895  * To keep the algorithm simple, we also never delete an incompletely
1896  * split page (they should be rare enough that this doesn't make any
1897  * meaningful difference to disk usage):
1898  *
1899  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1900  * left half of an incomplete split, but ensuring that it's not the
1901  * right half is more complicated. For that, we have to check that
1902  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1903  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1904  * release the lock on scanblkno/leafbuf, check the left sibling, and
1905  * construct a search stack to scanblkno. On subsequent iterations,
1906  * we know we stepped right from a page that passed these tests, so
1907  * it's OK.
1908  */
1909  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1910  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1911  P_INCOMPLETE_SPLIT(opaque))
1912  {
1913  /* Should never fail to delete a half-dead page */
1914  Assert(!P_ISHALFDEAD(opaque));
1915 
1916  _bt_relbuf(rel, leafbuf);
1917  return;
1918  }
1919 
1920  /*
1921  * First, remove downlink pointing to the page (or a parent of the
1922  * page, if we are going to delete a taller subtree), and mark the
1923  * leafbuf page half-dead
1924  */
1925  if (!P_ISHALFDEAD(opaque))
1926  {
1927  /*
1928  * We need an approximate pointer to the page's parent page. We
1929  * use a variant of the standard search mechanism to search for
1930  * the page's high key; this will give us a link to either the
1931  * current parent or someplace to its left (if there are multiple
1932  * equal high keys, which is possible with !heapkeyspace indexes).
1933  *
1934  * Also check if this is the right-half of an incomplete split
1935  * (see comment above).
1936  */
1937  if (!stack)
1938  {
1939  BTScanInsert itup_key;
1940  ItemId itemid;
1941  IndexTuple targetkey;
1942  BlockNumber leftsib,
1943  leafblkno;
1944  Buffer sleafbuf;
1945 
1946  itemid = PageGetItemId(page, P_HIKEY);
1947  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1948 
1949  leftsib = opaque->btpo_prev;
1950  leafblkno = BufferGetBlockNumber(leafbuf);
1951 
1952  /*
1953  * To avoid deadlocks, we'd better drop the leaf page lock
1954  * before going further.
1955  */
1956  _bt_unlockbuf(rel, leafbuf);
1957 
1958  /*
1959  * Check that the left sibling of leafbuf (if any) is not
1960  * marked with INCOMPLETE_SPLIT flag before proceeding
1961  */
1962  Assert(leafblkno == scanblkno);
1963  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1964  {
1965  ReleaseBuffer(leafbuf);
1966  return;
1967  }
1968 
1969  /* we need an insertion scan key for the search, so build one */
1970  itup_key = _bt_mkscankey(rel, targetkey);
1971  /* find the leftmost leaf page with matching pivot/high key */
1972  itup_key->pivotsearch = true;
1973  stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1974  /* won't need a second lock or pin on leafbuf */
1975  _bt_relbuf(rel, sleafbuf);
1976 
1977  /*
1978  * Re-lock the leaf page, and start over to use our stack
1979  * within _bt_mark_page_halfdead. We must do it that way
1980  * because it's possible that leafbuf can no longer be
1981  * deleted. We need to recheck.
1982  *
1983  * Note: We can't simply hold on to the sleafbuf lock instead,
1984  * because it's barely possible that sleafbuf is not the same
1985  * page as leafbuf. This happens when leafbuf split after our
1986  * original lock was dropped, but before _bt_search finished
1987  * its descent. We rely on the assumption that we'll find
1988  * leafbuf isn't safe to delete anymore in this scenario.
1989  * (Page deletion can cope with the stack being to the left of
1990  * leafbuf, but not to the right of leafbuf.)
1991  */
1992  _bt_lockbuf(rel, leafbuf, BT_WRITE);
1993  continue;
1994  }
1995 
1996  /*
1997  * See if it's safe to delete the leaf page, and determine how
1998  * many parent/internal pages above the leaf level will be
1999  * deleted. If it's safe then _bt_mark_page_halfdead will also
2000  * perform the first phase of deletion, which includes marking the
2001  * leafbuf page half-dead.
2002  */
2003  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2004  if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
2005  {
2006  _bt_relbuf(rel, leafbuf);
2007  return;
2008  }
2009  }
2010 
2011  /*
2012  * Then unlink it from its siblings. Each call to
2013  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2014  * making it shallower. Iterate until the leafbuf page is deleted.
2015  */
2016  rightsib_empty = false;
2017  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2018  while (P_ISHALFDEAD(opaque))
2019  {
2020  /* Check for interrupts in _bt_unlink_halfdead_page */
2021  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2022  &rightsib_empty, vstate))
2023  {
2024  /*
2025  * _bt_unlink_halfdead_page should never fail, since we
2026  * established that deletion is generally safe in
2027  * _bt_mark_page_halfdead -- index must be corrupt.
2028  *
2029  * Note that _bt_unlink_halfdead_page already released the
2030  * lock and pin on leafbuf for us.
2031  */
2032  Assert(false);
2033  return;
2034  }
2035  }
2036 
2037  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2038 
2039  rightsib = opaque->btpo_next;
2040 
2041  _bt_relbuf(rel, leafbuf);
2042 
2043  /*
2044  * Check here, as calling loops will have locks held, preventing
2045  * interrupts from being processed.
2046  */
2047  CHECK_FOR_INTERRUPTS();
2048 
2049  /*
2050  * The page has now been deleted. If its right sibling is completely
2051  * empty, it's possible that the reason we haven't deleted it earlier
2052  * is that it was the rightmost child of the parent. Now that we
2053  * removed the downlink for this page, the right sibling might now be
2054  * the only child of the parent, and could be removed. It would be
2055  * picked up by the next vacuum anyway, but might as well try to
2056  * remove it now, so loop back to process the right sibling.
2057  *
2058  * Note: This relies on the assumption that _bt_getstackbuf() will be
2059  * able to reuse our original descent stack with a different child
2060  * block (provided that the child block is to the right of the
2061  * original leaf page reached by _bt_search()). It will even update
2062  * the descent stack each time we loop around, avoiding repeated work.
2063  */
2064  if (!rightsib_empty)
2065  break;
2066 
2067  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2068  }
2069 }
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:528
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2086
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2302
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:96
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
bool pivotsearch
Definition: nbtree.h:790

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), LOG, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, BTScanInsertData::pivotsearch, RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().
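
btvacuumpage() reaches this function with a pinned, write-locked, empty leaf page; the buffer is consumed either way. A sketch of the hand-off (local names assumed):

    /* page found empty during the VACUUM scan */
    _bt_pagedel(rel, buf, vstate);      /* always releases the lock and pin on buf */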

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 1141 of file nbtpage.c.

1142 {
1143  PageInit(page, size, sizeof(BTPageOpaqueData));
1144 }
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42

References PageInit().

Referenced by _bt_blnewpage(), _bt_getbuf(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().
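
For example, _bt_getbuf() initializes a page it has taken from the free space map along these lines (sketch):

    Page    page = BufferGetPage(buf);

    _bt_pageinit(page, BufferGetPageSize(buf));     /* zeroed page with btree special area */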

◆ _bt_pendingfsm_add()

static void _bt_pendingfsm_add ( BTVacState *  vstate,
BlockNumber  target,
FullTransactionId  safexid 
)
static

Definition at line 3020 of file nbtpage.c.

3023 {
3024  Assert(vstate->npendingpages <= vstate->bufsize);
3025  Assert(vstate->bufsize <= vstate->maxbufsize);
3026 
3027 #ifdef USE_ASSERT_CHECKING
3028 
3029  /*
3030  * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3031  * array will always be in safexid order (since that is the order that we
3032  * save them in here)
3033  */
3034  if (vstate->npendingpages > 0)
3035  {
3036  FullTransactionId lastsafexid =
3037  vstate->pendingpages[vstate->npendingpages - 1].safexid;
3038 
3039  Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3040  }
3041 #endif
3042 
3043  /*
3044  * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3045  * information about this page.
3046  *
3047  * Note that this also covers the case where we opted to not use the
3048  * optimization in _bt_pendingfsm_init().
3049  */
3050  if (vstate->npendingpages == vstate->maxbufsize)
3051  return;
3052 
3053  /* Consider enlarging buffer */
3054  if (vstate->npendingpages == vstate->bufsize)
3055  {
3056  int newbufsize = vstate->bufsize * 2;
3057 
3058  /* Respect work_mem */
3059  if (newbufsize > vstate->maxbufsize)
3060  newbufsize = vstate->maxbufsize;
3061 
3062  vstate->bufsize = newbufsize;
3063  vstate->pendingpages =
3064  repalloc(vstate->pendingpages,
3065  sizeof(BTPendingFSM) * vstate->bufsize);
3066  }
3067 
3068  /* Save metadata for newly deleted page */
3069  vstate->pendingpages[vstate->npendingpages].target = target;
3070  vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3071  vstate->npendingpages++;
3072 }
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1188
FullTransactionId safexid
Definition: nbtree.h:329
BlockNumber target
Definition: nbtree.h:328
BTPendingFSM * pendingpages
Definition: nbtree.h:346
int npendingpages
Definition: nbtree.h:347
int bufsize
Definition: nbtree.h:344
int maxbufsize
Definition: nbtree.h:345
#define FullTransactionIdFollowsOrEquals(a, b)
Definition: transam.h:54

References Assert(), BTVacState::bufsize, FullTransactionIdFollowsOrEquals, BTVacState::maxbufsize, BTVacState::npendingpages, BTVacState::pendingpages, repalloc(), BTPendingFSM::safexid, and BTPendingFSM::target.

Referenced by _bt_unlink_halfdead_page().

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState *  vstate 
)

Definition at line 2955 of file nbtpage.c.

2956 {
2957  IndexBulkDeleteResult *stats = vstate->stats;
2958 
2959  Assert(stats->pages_newly_deleted >= vstate->npendingpages);
2960 
2961  if (vstate->npendingpages == 0)
2962  {
2963  /* Just free memory when nothing to do */
2964  if (vstate->pendingpages)
2965  pfree(vstate->pendingpages);
2966 
2967  return;
2968  }
2969 
2970 #ifdef DEBUG_BTREE_PENDING_FSM
2971 
2972  /*
2973  * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
2974  * placing pending pages in the FSM. Note that the optimization will
2975  * never be effective without some other backend concurrently consuming an
2976  * XID.
2977  */
2978  pg_usleep(5000000L);
2979 #endif
2980 
2981  /*
2982  * Recompute VACUUM XID boundaries.
2983  *
2984  * We don't actually care about the oldest non-removable XID. Computing
2985  * the oldest such XID has a useful side-effect that we rely on: it
2986  * forcibly updates the XID horizon state for this backend. This step is
2987  * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
2988  * that it is now safe to recycle newly deleted pages without this step.
2989  */
2990  GetOldestNonRemovableTransactionId(NULL);
2991 
2992  for (int i = 0; i < vstate->npendingpages; i++)
2993  {
2994  BlockNumber target = vstate->pendingpages[i].target;
2995  FullTransactionId safexid = vstate->pendingpages[i].safexid;
2996 
2997  /*
2998  * Do the equivalent of checking BTPageIsRecyclable(), but without
2999  * accessing the page again a second time.
3000  *
3001  * Give up on finding the first non-recyclable page -- all later pages
3002  * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3003  * to the array in safexid order.
3004  */
3005  if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
3006  break;
3007 
3008  RecordFreeIndexPage(rel, target);
3009  stats->pages_free++;
3010  }
3011 
3012  pfree(vstate->pendingpages);
3013 }
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:2021
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4275
void pg_usleep(long microsec)
Definition: signal.c:53
IndexBulkDeleteResult * stats
Definition: nbtree.h:335
BlockNumber pages_newly_deleted
Definition: genam.h:80
BlockNumber pages_free
Definition: genam.h:82

References Assert(), GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), i, BTVacState::npendingpages, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, BTVacState::pendingpages, pfree(), pg_usleep(), RecordFreeIndexPage(), BTPendingFSM::safexid, BTVacState::stats, and BTPendingFSM::target.

Referenced by btvacuumscan().
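
The loop above is the equivalent of running BTPageIsRecyclable() against each pending page without reading the page a second time. A page-level rendering of the same test, as a sketch (assuming the deleted-page accessor BTPageGetDeleteXid() from nbtree.h):

    /* Hypothetical helper: page-level form of the safexid check above */
    static bool
    page_is_recyclable(Page page)
    {
        Assert(P_ISDELETED(BTPageGetOpaque(page)));
        return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
    }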

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState *  vstate,
bool  cleanuponly 
)

Definition at line 2914 of file nbtpage.c.

2915 {
2916  int64 maxbufsize;
2917 
2918  /*
2919  * Don't bother with optimization in cleanup-only case -- we don't expect
2920  * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2921  * can only take place because this optimization didn't work out during
2922  * the last VACUUM.
2923  */
2924  if (cleanuponly)
2925  return;
2926 
2927  /*
2928  * Cap maximum size of array so that we always respect work_mem. Avoid
2929  * int overflow here.
2930  */
2931  vstate->bufsize = 256;
2932  maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
2933  maxbufsize = Min(maxbufsize, INT_MAX);
2934  maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2935  /* Stay sane with small work_mem */
2936  maxbufsize = Max(maxbufsize, vstate->bufsize);
2937  vstate->maxbufsize = maxbufsize;
2938 
2939  /* Allocate buffer, indicate that there are currently 0 pending pages */
2940  vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2941  vstate->npendingpages = 0;
2942 }
#define Min(x, y)
Definition: c.h:986
#define Max(x, y)
Definition: c.h:980
int work_mem
Definition: globals.c:125
#define MaxAllocSize
Definition: memutils.h:40

References BTVacState::bufsize, Max, MaxAllocSize, BTVacState::maxbufsize, Min, BTVacState::npendingpages, palloc(), BTVacState::pendingpages, and work_mem.

Referenced by btvacuumscan().
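
Per the btvacuumscan() references here and under _bt_pendingfsm_finalize(), the facility brackets the physical scan; roughly:

    _bt_pendingfsm_init(rel, vstate, cleanuponly);
    /* ... btvacuumpage() loop; _bt_unlink_halfdead_page() records each
     * newly deleted page via _bt_pendingfsm_add() ... */
    _bt_pendingfsm_finalize(rel, vstate);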

◆ _bt_relandgetbuf()

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)

Definition at line 1015 of file nbtpage.c.

1016 {
1017  Buffer buf;
1018 
1019  Assert(blkno != P_NEW);
1020  if (BufferIsValid(obuf))
1021  _bt_unlockbuf(rel, obuf);
1022  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1023  _bt_lockbuf(rel, buf, access);
1024 
1025  _bt_checkpage(rel, buf);
1026  return buf;
1027 }
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:1636
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123

References _bt_checkpage(), _bt_lockbuf(), _bt_unlockbuf(), Assert(), buf, BufferIsValid, P_NEW, and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_moveright(), _bt_search(), _bt_stepright(), and _bt_walk_left().
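
A typical use is stepping right along a level while never holding more than one lock at a time, as a sketch:

    BlockNumber rightsib = opaque->btpo_next;

    /* releases the lock and pin on the old buffer, returns rightsib locked */
    buf = _bt_relandgetbuf(rel, buf, rightsib, BT_READ);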

◆ _bt_relbuf()

void _bt_relbuf ( Relation  rel,
Buffer  buf 
)

Releases both the lock and the pin on buf:

 {
  _bt_unlockbuf(rel, buf);
  ReleaseBuffer(buf);
 }

References _bt_unlockbuf() and ReleaseBuffer().

◆ _bt_rightsib_halfdeadflag()

static bool _bt_rightsib_halfdeadflag ( Relation  rel,
BlockNumber  leafrightsib 
)
static

Definition at line 1765 of file nbtpage.c.

1766 {
1767  Buffer buf;
1768  Page page;
1769  BTPageOpaque opaque;
1770  bool result;
1771 
1772  Assert(leafrightsib != P_NONE);
1773 
1774  buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1775  page = BufferGetPage(buf);
1776  opaque = BTPageGetOpaque(page);
1777 
1778  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1779  result = P_ISHALFDEAD(opaque);
1780  _bt_relbuf(rel, buf);
1781 
1782  return result;
1783 }

References _bt_getbuf(), _bt_relbuf(), Assert(), BT_READ, BTPageGetOpaque, buf, BufferGetPage, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, and P_NONE.

Referenced by _bt_mark_page_halfdead().

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)

Definition at line 234 of file nbtpage.c.

235 {
236  Buffer metabuf;
237  Page metapg;
238  BTMetaPageData *metad;
239 
240  /*
241  * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
242  * field started out as a TransactionId field called btm_oldest_btpo_xact.
243  * Both "versions" are just uint32 fields. It was convenient to repurpose
244  * the field when we began to use 64-bit XIDs in deleted pages.
245  *
246  * It's possible that a pg_upgrade'd database will contain an XID value in
247  * what is now recognized as the metapage's btm_last_cleanup_num_delpages
248  * field. _bt_vacuum_needs_cleanup() may even believe that this value
249  * indicates that there are lots of pages that it needs to recycle, when
250  * in reality there are only one or two. The worst that can happen is
251  * that there will be a call to btvacuumscan a little earlier, which will
252  * set btm_last_cleanup_num_delpages to a sane value when we're called.
253  *
254  * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
255  * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
256  * to be consistent.
257  */
258  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
259  metapg = BufferGetPage(metabuf);
260  metad = BTPageGetMeta(metapg);
261 
262  /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
263  if (metad->btm_version >= BTREE_NOVAC_VERSION &&
264  metad->btm_last_cleanup_num_delpages == num_delpages)
265  {
266  /* Usually means index continues to have num_delpages of 0 */
267  _bt_relbuf(rel, metabuf);
268  return;
269  }
270 
271  /* trade in our read lock for a write lock */
272  _bt_unlockbuf(rel, metabuf);
273  _bt_lockbuf(rel, metabuf, BT_WRITE);
274 
275  START_CRIT_SECTION();
276 
277  /* upgrade meta-page if needed */
278  if (metad->btm_version < BTREE_NOVAC_VERSION)
279  _bt_upgrademetapage(metapg);
280 
281  /* update cleanup-related information */
282  metad->btm_last_cleanup_num_delpages = num_delpages;
283  metad->btm_last_cleanup_num_heap_tuples = -1.0;
284  MarkBufferDirty(metabuf);
285 
286  /* write wal record if needed */
287  if (RelationNeedsWAL(rel))
288  {
289  xl_btree_metadata md;
290  XLogRecPtr recptr;
291 
292  XLogBeginInsert();
293  XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
294 
295  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
296  md.version = metad->btm_version;
297  md.root = metad->btm_root;
298  md.level = metad->btm_level;
299  md.fastroot = metad->btm_fastroot;
300  md.fastlevel = metad->btm_fastlevel;
301  md.last_cleanup_num_delpages = num_delpages;
302  md.allequalimage = metad->btm_allequalimage;
303 
304  XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
305 
306  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
307 
308  PageSetLSN(metapg, recptr);
309  }
310 
311  END_CRIT_SECTION();
312 
313  _bt_relbuf(rel, metabuf);
314 }
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().
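
btvacuumcleanup() derives num_delpages from the bulk-delete statistics before calling here; approximately:

    /* Sketch of the btvacuumcleanup() hand-off */
    BlockNumber num_delpages = stats->pages_deleted - stats->pages_free;

    _bt_set_cleanup_info(rel, num_delpages);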

◆ _bt_unlink_halfdead_page()

static bool _bt_unlink_halfdead_page ( Relation  rel,
Buffer  leafbuf,
BlockNumber  scanblkno,
bool *  rightsib_empty,
BTVacState *  vstate 
)
static

Definition at line 2302 of file nbtpage.c.

2304 {
2305  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2306  IndexBulkDeleteResult *stats = vstate->stats;
2307  BlockNumber leafleftsib;
2308  BlockNumber leafrightsib;
2309  BlockNumber target;
2310  BlockNumber leftsib;
2311  BlockNumber rightsib;
2312  Buffer lbuf = InvalidBuffer;
2313  Buffer buf;
2314  Buffer rbuf;
2315  Buffer metabuf = InvalidBuffer;
2316  Page metapg = NULL;
2317  BTMetaPageData *metad = NULL;
2318  ItemId itemid;
2319  Page page;
2320  BTPageOpaque opaque;
2321  FullTransactionId safexid;
2322  bool rightsib_is_rightmost;
2323  uint32 targetlevel;
2324  IndexTuple leafhikey;
2325  BlockNumber leaftopparent;
2326 
2327  page = BufferGetPage(leafbuf);
2328  opaque = BTPageGetOpaque(page);
2329 
2330  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2331 
2332  /*
2333  * Remember some information about the leaf page.
2334  */
2335  itemid = PageGetItemId(page, P_HIKEY);
2336  leafhikey = (IndexTuple) PageGetItem(page, itemid);
2337  target = BTreeTupleGetTopParent(leafhikey);
2338  leafleftsib = opaque->btpo_prev;
2339  leafrightsib = opaque->btpo_next;
2340 
2341  _bt_unlockbuf(rel, leafbuf);
2342 
2343  /*
2344  * Check here, as calling loops will have locks held, preventing
2345  * interrupts from being processed.
2346  */
2347  CHECK_FOR_INTERRUPTS();
2348 
2349  /* Unlink the current top parent of the subtree */
2350  if (!BlockNumberIsValid(target))
2351  {
2352  /* Target is leaf page (or leaf page is top parent, if you prefer) */
2353  target = leafblkno;
2354 
2355  buf = leafbuf;
2356  leftsib = leafleftsib;
2357  targetlevel = 0;
2358  }
2359  else
2360  {
2361  /* Target is the internal page taken from leaf's top parent link */
2362  Assert(target != leafblkno);
2363 
2364  /* Fetch the block number of the target's left sibling */
2365  buf = _bt_getbuf(rel, target, BT_READ);
2366  page = BufferGetPage(buf);
2367  opaque = BTPageGetOpaque(page);
2368  leftsib = opaque->btpo_prev;
2369  targetlevel = opaque->btpo_level;
2370  Assert(targetlevel > 0);
2371 
2372  /*
2373  * To avoid deadlocks, we'd better drop the target page lock before
2374  * going further.
2375  */
2376  _bt_unlockbuf(rel, buf);
2377  }
2378 
2379  /*
2380  * We have to lock the pages we need to modify in the standard order:
2381  * moving right, then up. Else we will deadlock against other writers.
2382  *
2383  * So, first lock the leaf page, if it's not the target. Then find and
2384  * write-lock the current left sibling of the target page. The sibling
2385  * that was current a moment ago could have split, so we may have to move
2386  * right.
2387  */
2388  if (target != leafblkno)
2389  _bt_lockbuf(rel, leafbuf, BT_WRITE);
2390  if (leftsib != P_NONE)
2391  {
2392  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2393  page = BufferGetPage(lbuf);
2394  opaque = BTPageGetOpaque(page);
2395  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2396  {
2397  bool leftsibvalid = true;
2398 
2399  /*
2400  * Before we follow the link from the page that was the left
2401  * sibling mere moments ago, validate its right link. This
2402  * reduces the opportunities for loop to fail to ever make any
2403  * progress in the presence of index corruption.
2404  *
2405  * Note: we rely on the assumption that there can only be one
2406  * vacuum process running at a time (against the same index).
2407  */
2408  if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2409  leftsib == opaque->btpo_next)
2410  leftsibvalid = false;
2411 
2412  leftsib = opaque->btpo_next;
2413  _bt_relbuf(rel, lbuf);
2414 
2415  if (!leftsibvalid)
2416  {
2417  if (target != leafblkno)
2418  {
2419  /* we have only a pin on target, but pin+lock on leafbuf */
2420  ReleaseBuffer(buf);
2421  _bt_relbuf(rel, leafbuf);
2422  }
2423  else
2424  {
2425  /* we have only a pin on leafbuf */
2426  ReleaseBuffer(leafbuf);
2427  }
2428 
2429  ereport(LOG,
2430  (errcode(ERRCODE_INDEX_CORRUPTED),
2431  errmsg_internal("valid left sibling for deletion target could not be located: "
2432  "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
2433  leftsib, target, leafblkno, scanblkno,
2434  RelationGetRelationName(rel))));
2435 
2436  return false;
2437  }
2438 
2439  CHECK_FOR_INTERRUPTS();
2440 
2441  /* step right one page */
2442  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2443  page = BufferGetPage(lbuf);
2444  opaque = BTPageGetOpaque(page);
2445  }
2446  }
2447  else
2448  lbuf = InvalidBuffer;
2449 
2450  /* Next write-lock the target page itself */
2451  _bt_lockbuf(rel, buf, BT_WRITE);
2452  page = BufferGetPage(buf);
2453  opaque = BTPageGetOpaque(page);
2454 
2455  /*
2456  * Check page is still empty etc, else abandon deletion. This is just for
2457  * paranoia's sake; a half-dead page cannot resurrect because there can be
2458  * only one vacuum process running at a time.
2459  */
2460  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2461  elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2462  target, RelationGetRelationName(rel));
2463 
2464  if (opaque->btpo_prev != leftsib)
2465  ereport(ERROR,
2466  (errcode(ERRCODE_INDEX_CORRUPTED),
2467  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2468  leftsib, opaque->btpo_prev, target,
2469  RelationGetRelationName(rel))));
2470 
2471  if (target == leafblkno)
2472  {
2473  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2474  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2475  elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2476  target, RelationGetRelationName(rel));
2477 
2478  /* Leaf page is also target page: don't set leaftopparent */
2479  leaftopparent = InvalidBlockNumber;
2480  }
2481  else
2482  {
2483  IndexTuple finaldataitem;
2484 
2485  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2486  P_ISLEAF(opaque))
2487  elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2488  targetlevel, target, RelationGetRelationName(rel));
2489 
2490  /* Target is internal: set leaftopparent for next call here... */
2491  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2492  finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2493  leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2494  /* ...except when it would be a redundant pointer-to-self */
2495  if (leaftopparent == leafblkno)
2496  leaftopparent = InvalidBlockNumber;
2497  }
2498 
2499  /* No leaftopparent for level 0 (leaf page) or level 1 target */
2500  Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2501 
2502  /*
2503  * And next write-lock the (current) right sibling.
2504  */
2505  rightsib = opaque->btpo_next;
2506  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2507  page = BufferGetPage(rbuf);
2508  opaque = BTPageGetOpaque(page);
2509  if (opaque->btpo_prev != target)
2510  ereport(ERROR,
2511  (errcode(ERRCODE_INDEX_CORRUPTED),
2512  errmsg_internal("right sibling's left-link doesn't match: "
2513  "block %u links to %u instead of expected %u in index \"%s\"",
2514  rightsib, opaque->btpo_prev, target,
2515  RelationGetRelationName(rel))));
2516  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2517  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2518 
2519  /*
2520  * If we are deleting the next-to-last page on the target's level, then
2521  * the rightsib is a candidate to become the new fast root. (In theory, it
2522  * might be possible to push the fast root even further down, but the odds
2523  * of doing so are slim, and the locking considerations daunting.)
2524  *
2525  * We can safely acquire a lock on the metapage here --- see comments for
2526  * _bt_newroot().
2527  */
2528  if (leftsib == P_NONE && rightsib_is_rightmost)
2529  {
2530  page = BufferGetPage(rbuf);
2531  opaque = BTPageGetOpaque(page);
2532  if (P_RIGHTMOST(opaque))
2533  {
2534  /* rightsib will be the only one left on the level */
2535  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2536  metapg = BufferGetPage(metabuf);
2537  metad = BTPageGetMeta(metapg);
2538 
2539  /*
2540  * The expected case here is btm_fastlevel == targetlevel+1; if
2541  * the fastlevel is <= targetlevel, something is wrong, and we
2542  * choose to overwrite it to fix it.
2543  */
2544  if (metad->btm_fastlevel > targetlevel + 1)
2545  {
2546  /* no update wanted */
2547  _bt_relbuf(rel, metabuf);
2548  metabuf = InvalidBuffer;
2549  }
2550  }
2551  }
2552 
2553  /*
2554  * Here we begin doing the deletion.
2555  */
2556 
2557  /* No ereport(ERROR) until changes are logged */
2558  START_CRIT_SECTION();
2559 
2560  /*
2561  * Update siblings' side-links. Note the target page's side-links will
2562  * continue to point to the siblings. Asserts here are just rechecking
2563  * things we already verified above.
2564  */
2565  if (BufferIsValid(lbuf))
2566  {
2567  page = BufferGetPage(lbuf);
2568  opaque = BTPageGetOpaque(page);
2569  Assert(opaque->btpo_next == target);
2570  opaque->btpo_next = rightsib;
2571  }
2572  page = BufferGetPage(rbuf);
2573  opaque = BTPageGetOpaque(page);
2574  Assert(opaque->btpo_prev == target);
2575  opaque->btpo_prev = leftsib;
2576 
2577  /*
2578  * If we deleted a parent of the targeted leaf page, instead of the leaf
2579  * itself, update the leaf to point to the next remaining child in the
2580  * subtree.
2581  *
2582  * Note: We rely on the fact that a buffer pin on the leaf page has been
2583  * held since leafhikey was initialized. This is safe, though only
2584  * because the page was already half-dead at that point. The leaf page
2585  * cannot have been modified by any other backend during the period when
2586  * no lock was held.
2587  */
2588  if (target != leafblkno)
2589  BTreeTupleSetTopParent(leafhikey, leaftopparent);
2590 
2591  /*
2592  * Mark the page itself deleted. It can be recycled when all current
2593  * transactions are gone. Storing GetTopTransactionId() would work, but
2594  * we're in VACUUM and would not otherwise have an XID. Having already
2595  * updated links to the target, ReadNextFullTransactionId() suffices as an
2596  * upper bound. Any scan having retained a now-stale link is advertising
2597  * in its PGPROC an xmin less than or equal to the value we read here. It
2598  * will continue to do so, holding back the xmin horizon, for the duration
2599  * of that scan.
2600  */
2601  page = BufferGetPage(buf);
2602  opaque = BTPageGetOpaque(page);
2603  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2604 
2605  /*
2606  * Store upper bound XID that's used to determine when deleted page is no
2607  * longer needed as a tombstone
2608  */
2609  safexid = ReadNextFullTransactionId();
2610  BTPageSetDeleted(page, safexid);
2611  opaque->btpo_cycleid = 0;
2612 
2613  /* And update the metapage, if needed */
2614  if (BufferIsValid(metabuf))
2615  {
2616  /* upgrade metapage if needed */
2617  if (metad->btm_version < BTREE_NOVAC_VERSION)
2618  _bt_upgrademetapage(metapg);
2619  metad->btm_fastroot = rightsib;
2620  metad->btm_fastlevel = targetlevel;
2621  MarkBufferDirty(metabuf);
2622  }
2623 
2624  /* Must mark buffers dirty before XLogInsert */
2625  MarkBufferDirty(rbuf);
2626  MarkBufferDirty(buf);
2627  if (BufferIsValid(lbuf))
2628  MarkBufferDirty(lbuf);
2629  if (target != leafblkno)
2630  MarkBufferDirty(leafbuf);
2631 
2632  /* XLOG stuff */
2633  if (RelationNeedsWAL(rel))
2634  {
2635  xl_btree_unlink_page xlrec;
2636  xl_btree_metadata xlmeta;
2637  uint8 xlinfo;
2638  XLogRecPtr recptr;
2639 
2640  XLogBeginInsert();
2641 
2642  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2643  if (BufferIsValid(lbuf))
2644  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2645  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2646  if (target != leafblkno)
2647  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2648 
2649  /* information stored on the target/to-be-unlinked block */
2650  xlrec.leftsib = leftsib;
2651  xlrec.rightsib = rightsib;
2652  xlrec.level = targetlevel;
2653  xlrec.safexid = safexid;
2654 
2655  /* information needed to recreate the leaf block (if not the target) */
2656  xlrec.leafleftsib = leafleftsib;
2657  xlrec.leafrightsib = leafrightsib;
2658  xlrec.leaftopparent = leaftopparent;
2659 
2660  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2661 
2662  if (BufferIsValid(metabuf))
2663  {
2664  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2665 
2666  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2667  xlmeta.version = metad->btm_version;
2668  xlmeta.root = metad->btm_root;
2669  xlmeta.level = metad->btm_level;
2670  xlmeta.fastroot = metad->btm_fastroot;
2671  xlmeta.fastlevel = metad->btm_fastlevel;
2672  xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2673  xlmeta.allequalimage = metad->btm_allequalimage;
2674 
2675  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2676  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2677  }
2678  else
2679  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2680 
2681  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2682 
2683  if (BufferIsValid(metabuf))
2684  {
2685  PageSetLSN(metapg, recptr);
2686  }
2687  page = BufferGetPage(rbuf);
2688  PageSetLSN(page, recptr);
2689  page = BufferGetPage(buf);
2690  PageSetLSN(page, recptr);
2691  if (BufferIsValid(lbuf))
2692  {
2693  page = BufferGetPage(lbuf);
2694  PageSetLSN(page, recptr);
2695  }
2696  if (target != leafblkno)
2697  {
2698  page = BufferGetPage(leafbuf);
2699  PageSetLSN(page, recptr);
2700  }
2701  }
2702 
2703  END_CRIT_SECTION();
2704 
2705  /* release metapage */
2706  if (BufferIsValid(metabuf))
2707  _bt_relbuf(rel, metabuf);
2708 
2709  /* release siblings */
2710  if (BufferIsValid(lbuf))
2711  _bt_relbuf(rel, lbuf);
2712  _bt_relbuf(rel, rbuf);
2713 
2714  /* If the target is not leafbuf, we're done with it now -- release it */
2715  if (target != leafblkno)
2716  _bt_relbuf(rel, buf);
2717 
2718  /*
2719  * Maintain pages_newly_deleted, which is simply the number of pages
2720  * deleted by the ongoing VACUUM operation.
2721  *
2722  * Maintain pages_deleted in a way that takes into account how
2723  * btvacuumpage() will count deleted pages that have yet to become
2724  * scanblkno -- only count page when it's not going to get that treatment
2725  * later on.
2726  */
2727  stats->pages_newly_deleted++;
2728  if (target <= scanblkno)
2729  stats->pages_deleted++;
2730 
2731  /*
2732  * Remember information about the target page (now a newly deleted page)
2733  * in dedicated vstate space for later. The page will be considered as a
2734  * candidate to place in the FSM at the end of the current btvacuumscan()
2735  * call.
2736  */
2737  _bt_pendingfsm_add(vstate, target, safexid);
2738 
2739  return true;
2740 }
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
unsigned char uint8
Definition: c.h:439
static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
Definition: nbtpage.c:3020
static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)
Definition: nbtree.h:615
static void BTPageSetDeleted(Page page, FullTransactionId safexid)
Definition: nbtree.h:240
#define XLOG_BTREE_UNLINK_PAGE
Definition: nbtxlog.h:35
#define XLOG_BTREE_UNLINK_PAGE_META
Definition: nbtxlog.h:36
#define SizeOfBtreeUnlinkPage
Definition: nbtxlog.h:316
BlockNumber pages_deleted
Definition: genam.h:81
FullTransactionId ReadNextFullTransactionId(void)
Definition: varsup.c:261

References _bt_getbuf(), _bt_lockbuf(), _bt_pendingfsm_add(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BlockNumberIsValid, BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageSetDeleted(), BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetDownLink(), BTreeTupleGetTopParent(), BTreeTupleSetTopParent(), buf, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, CHECK_FOR_INTERRUPTS, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBlockNumber, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_unlink_page::leafleftsib, xl_btree_unlink_page::leafrightsib, xl_btree_unlink_page::leaftopparent, xl_btree_unlink_page::leftsib, xl_btree_metadata::level, xl_btree_unlink_page::level, LOG, MarkBufferDirty(), P_FIRSTDATAKEY, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_newly_deleted, PageSetLSN, ReadNextFullTransactionId(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, ReleaseBuffer(), xl_btree_unlink_page::rightsib, xl_btree_metadata::root, xl_btree_unlink_page::safexid, SizeOfBtreeUnlinkPage, START_CRIT_SECTION, BTVacState::stats, xl_btree_metadata::version, XLOG_BTREE_UNLINK_PAGE, XLOG_BTREE_UNLINK_PAGE_META, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_pagedel().

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1082 of file nbtpage.c.

1083 {
1084  /*
1085  * Buffer is pinned and locked, which means that it is expected to be
1086  * defined and addressable. Check that proactively.
1087  */
1088  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1089 
1090  /* LockBuffer() asserts that pin is held by this backend */
1091  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1092 
1093  if (!RelationUsesLocalBuffers(rel))
1094  VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1095 }

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage, LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_first(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_relbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
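
The following minimal sketch (the helper name is hypothetical, not part of nbtree) shows the main reason to call _bt_unlockbuf() directly instead of _bt_relbuf(): keeping the buffer pin alive after the content lock is released, as _bt_drop_lock_and_maybe_pin() does. Once the lock is dropped, the page contents must not be examined again until the buffer is relocked; for shared buffers the Valgrind client requests above enforce exactly that.

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Hypothetical helper, for illustration only */
static void
peek_then_keep_pin(Relation rel, BlockNumber blkno)
{
    Buffer      buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin + read lock */
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);

    elog(DEBUG2, "block %u is at level %u", blkno, opaque->btpo_level);

    _bt_unlockbuf(rel, buf);    /* drop the content lock; the pin remains */

    /* ... lock-free work here; do not touch "page" until relocked ... */

    ReleaseBuffer(buf);         /* finally drop the pin as well */
}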

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)

Definition at line 1121 of file nbtpage.c.

1122 {
1123  /*
1124  * Buffer is pinned and locked, which means that it is expected to be
1125  * defined and addressable. Check that proactively.
1126  */
1127  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1128 
1129  /* LockBuffer() asserts that pin is held by this backend */
1130  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1131  LockBufferForCleanup(buf);
1132 }

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage, LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().
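
A condensed sketch of the btvacuumpage() pattern this function exists for (simplified; the real caller's backtracking and page-state checks are omitted):

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Simplified illustration of the lock trade-up done during VACUUM */
static void
vacuum_leaf_page_sketch(Relation rel, BTVacState *vstate, BlockNumber scanblkno)
{
    Buffer      buf;

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, scanblkno, RBM_NORMAL,
                             vstate->info->strategy);
    _bt_lockbuf(rel, buf, BT_READ);

    /* ... suppose the leaf page turns out to have deletable tuples ... */

    /*
     * Trade the read lock for a full cleanup lock.  The content lock is
     * dropped and LockBufferForCleanup() is taken, which can sleep until
     * no other backend holds a pin; our own pin is held throughout.
     */
    _bt_upgradelockbufcleanup(rel, buf);

    /* ... _bt_delitems_vacuum() would run under the cleanup lock ... */

    _bt_relbuf(rel, buf);       /* release lock and pin */
}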

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)

Definition at line 109 of file nbtpage.c.

110 {
111  BTMetaPageData *metad;
112  BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
113 
114  metad = BTPageGetMeta(page);
115  metaopaque = BTPageGetOpaque(page);
116 
117  /* It must be really a meta page of upgradable version */
118  Assert(metaopaque->btpo_flags & BTP_META);
119  Assert(metad->btm_version >= BTREE_MIN_VERSION);
120  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
121 
122  /* Set version number and fill extra fields added into version 3 */
123  metad->btm_version = BTREE_NOVAC_VERSION;
124  metad->btm_last_cleanup_num_delpages = 0;
125  metad->btm_last_cleanup_num_heap_tuples = -1.0;
126  /* Only a REINDEX can set this field */
127  Assert(!metad->btm_allequalimage);
128  metad->btm_allequalimage = false;
129 
130  /* Adjust pd_lower (see _bt_initmetapage() for details) */
131  ((PageHeader) page)->pd_lower =
132  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
133 }

References Assert(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newroot(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
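
The callers listed above all follow the same pattern: test btm_version and perform the upgrade inside the critical section of whatever metapage update they are already making. A condensed sketch along the lines of _bt_set_cleanup_info() (WAL logging elided, so not usable as-is on a WAL-logged relation):

#include "postgres.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"

/* Condensed caller pattern; illustration only */
static void
set_cleanup_info_sketch(Relation rel, BlockNumber num_delpages)
{
    Buffer          metabuf;
    Page            metapg;
    BTMetaPageData *metad;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    START_CRIT_SECTION();

    /* Dynamically upgrade a pre-version-3 metapage before using new fields */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    metad->btm_last_cleanup_num_delpages = num_delpages;
    MarkBufferDirty(metabuf);

    /* the real caller emits an XLOG_BTREE_META_CLEANUP record here */

    END_CRIT_SECTION();

    _bt_relbuf(rel, metabuf);
}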

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)

Definition at line 181 of file nbtpage.c.

182 {
183  Buffer metabuf;
184  Page metapg;
185  BTMetaPageData *metad;
186  uint32 btm_version;
187  BlockNumber prev_num_delpages;
188 
189  /*
190  * Copy details from metapage to local variables quickly.
191  *
192  * Note that we deliberately avoid using cached version of metapage here.
193  */
194  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
195  metapg = BufferGetPage(metabuf);
196  metad = BTPageGetMeta(metapg);
197  btm_version = metad->btm_version;
198 
199  if (btm_version < BTREE_NOVAC_VERSION)
200  {
201  /*
202  * Metapage needs to be dynamically upgraded to store fields that are
203  * only present when btm_version >= BTREE_NOVAC_VERSION
204  */
205  _bt_relbuf(rel, metabuf);
206  return true;
207  }
208 
209  prev_num_delpages = metad->btm_last_cleanup_num_delpages;
210  _bt_relbuf(rel, metabuf);
211 
212  /*
213  * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
214  * total size of the index. We can reasonably expect (though are not
215  * guaranteed) to be able to recycle this many pages if we decide to do a
216  * btvacuumscan call during the ongoing btvacuumcleanup. For further
217  * details see the nbtree/README section on placing deleted pages in the
218  * FSM.
219  */
220  if (prev_num_delpages > 0 &&
221  prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
222  return true;
223 
224  return false;
225 }

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage, and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().
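
For example, an index of 10,000 blocks has a cutoff of 10,000 / 20 = 500 pages: a prev_num_delpages of 501 or more makes btvacuumcleanup() perform a full btvacuumscan() call, while 500 or fewer allows the scan to be skipped.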