PostgreSQL Source Code git master
nbtpage.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "common/int.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"
Go to the source code of this file.

Functions

static BTMetaPageData * _bt_getmeta (Relation rel, Buffer metabuf)
 
static void _bt_delitems_delete (Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
static char * _bt_delitems_update (BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
 
static bool _bt_mark_page_halfdead (Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
 
static bool _bt_unlink_halfdead_page (Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
 
static bool _bt_lock_subtree_parent (Relation rel, Relation heaprel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
 
static void _bt_pendingfsm_add (BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
void _bt_upgrademetapage (Page page)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
static int _bt_delitems_cmp (const void *a, const void *b)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
static bool _bt_leftsib_splitflag (Relation rel, BlockNumber leftsib, BlockNumber target)
 
static bool _bt_rightsib_halfdeadflag (Relation rel, BlockNumber leafrightsib)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 

Function Documentation

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)

Definition at line 869 of file nbtpage.c.

870{
871 Buffer buf;
872 BlockNumber blkno;
873 Page page;
874
875 Assert(heaprel != NULL);
876
877 /*
878 * First see if the FSM knows of any free pages.
879 *
880 * We can't trust the FSM's report unreservedly; we have to check that the
881 * page is still free. (For example, an already-free page could have been
882 * re-used between the time the last VACUUM scanned it and the time the
883 * VACUUM made its FSM updates.)
884 *
885 * In fact, it's worse than that: we can't even assume that it's safe to
886 * take a lock on the reported page. If somebody else has a lock on it,
887 * or even worse our own caller does, we could deadlock. (The own-caller
888 * scenario is actually not improbable. Consider an index on a serial or
889 * timestamp column. Nearly all splits will be at the rightmost page, so
890 * it's entirely likely that _bt_split will call us while holding a lock
891 * on the page most recently acquired from FSM. A VACUUM running
892 * concurrently with the previous split could well have placed that page
893 * back in FSM.)
894 *
895 * To get around that, we ask for only a conditional lock on the reported
896 * page. If we fail, then someone else is using the page, and we may
897 * reasonably assume it's not free. (If we happen to be wrong, the worst
898 * consequence is the page will be lost to use till the next VACUUM, which
899 * is no big problem.)
900 */
901 for (;;)
902 {
903 blkno = GetFreeIndexPage(rel);
904 if (blkno == InvalidBlockNumber)
905 break;
906 buf = ReadBuffer(rel, blkno);
907 if (_bt_conditionallockbuf(rel, buf))
908 {
909 page = BufferGetPage(buf);
910
911 /*
912 * It's possible to find an all-zeroes page in an index. For
913 * example, a backend might successfully extend the relation one
914 * page and then crash before it is able to make a WAL entry for
915 * adding the page. If we find a zeroed page then reclaim it
916 * immediately.
917 */
918 if (PageIsNew(page))
919 {
920 /* Okay to use page. Initialize and return it. */
921 _bt_pageinit(page, BufferGetPageSize(buf));
922 return buf;
923 }
924
925 if (BTPageIsRecyclable(page, heaprel))
926 {
927 /*
928 * If we are generating WAL for Hot Standby then create a WAL
929 * record that will allow us to conflict with queries running
930 * on standby, in case they have snapshots older than safexid
931 * value
932 */
933 if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
934 {
935 xl_btree_reuse_page xlrec_reuse;
936
937 /*
938 * Note that we don't register the buffer with the record,
939 * because this operation doesn't modify the page (that
940 * already happened, back when VACUUM deleted the page).
941 * This record only exists to provide a conflict point for
942 * Hot Standby. See record REDO routine comments.
943 */
944 xlrec_reuse.locator = rel->rd_locator;
945 xlrec_reuse.block = blkno;
946 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
947 xlrec_reuse.isCatalogRel =
948 RelationIsAccessibleInLogicalDecoding(heaprel);
949
950 XLogBeginInsert();
951 XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
952
953 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954 }
955
956 /* Okay to use page. Re-initialize and return it. */
957 _bt_pageinit(page, BufferGetPageSize(buf));
958 return buf;
959 }
960 elog(DEBUG2, "FSM returned nonrecyclable page");
961 _bt_relbuf(rel, buf);
962 }
963 else
964 {
965 elog(DEBUG2, "FSM returned nonlockable page");
966 /* couldn't get lock, so just drop pin */
967 ReleaseBuffer(buf);
968
969 }
970
971 /*
972 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973 * risk a race condition against btvacuumscan --- see comments therein.
974 * This forces us to repeat the valgrind request that _bt_lockbuf()
975 * otherwise would make, as we can't use _bt_lockbuf() without introducing
976 * a race.
977 */
978 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
979 if (!RelationUsesLocalBuffers(rel))
980 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
981
982 /* Initialize the new page before returning it */
983 page = BufferGetPage(buf);
984 Assert(PageIsNew(page));
985 _bt_pageinit(page, BufferGetPageSize(buf));
986
987 return buf;
988}
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:851
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4917
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:751
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:401
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:390
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
#define BMR_REL(p_rel)
Definition: bufmgr.h:107
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
PageData * Page
Definition: bufpage.h:82
#define DEBUG2
Definition: elog.h:29
#define elog(elevel,...)
Definition: elog.h:225
Assert(PointerIsAligned(start, uint64))
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1093
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:260
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:291
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:192
static char * buf
Definition: pg_test_fsync.c:72
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:692
#define RelationNeedsWAL(relation)
Definition: rel.h:636
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:645
@ MAIN_FORKNUM
Definition: relpath.h:58
RelFileLocator rd_locator
Definition: rel.h:57
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
#define XLogStandbyInfoActive()
Definition: xlog.h:123
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:364
void XLogBeginInsert(void)
Definition: xloginsert.c:149

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert(), xl_btree_reuse_page::block, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::isCatalogRel, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().
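The buffer comes back pinned, write-locked, and already passed through _bt_pageinit(), so a caller only fills in its own special-space fields before use. A minimal hedged sketch of that caller side (example_alloc_right_sibling() is hypothetical; the real logic lives in _bt_split()):

/* Sketch: obtain and stamp a fresh right sibling during a split */
static Buffer
example_alloc_right_sibling(Relation rel, Relation heaprel, Buffer origbuf)
{
    Buffer          rbuf = _bt_allocbuf(rel, heaprel);  /* pinned + write-locked */
    BTPageOpaque    oopaque = BTPageGetOpaque(BufferGetPage(origbuf));
    BTPageOpaque    ropaque = BTPageGetOpaque(BufferGetPage(rbuf));

    /* _bt_allocbuf() already ran _bt_pageinit(); just set the opaque data */
    ropaque->btpo_prev = BufferGetBlockNumber(origbuf);
    ropaque->btpo_next = oopaque->btpo_next;
    ropaque->btpo_level = oopaque->btpo_level;
    ropaque->btpo_flags = oopaque->btpo_flags & BTP_LEAF;  /* simplified; the
                                                            * real split keeps
                                                            * more flag bits */
    ropaque->btpo_cycleid = 0;  /* real split carries over the left page's id */

    return rbuf;
}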

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 797 of file nbtpage.c.

798{
799 Page page = BufferGetPage(buf);
800
801 /*
802 * ReadBuffer verifies that every newly-read page passes
803 * PageHeaderIsValid, which means it either contains a reasonably sane
804 * page header or is all-zero. We have to defend against the all-zero
805 * case, however.
806 */
807 if (PageIsNew(page))
808 ereport(ERROR,
809 (errcode(ERRCODE_INDEX_CORRUPTED),
810 errmsg("index \"%s\" contains unexpected zero page at block %u",
811 RelationGetRelationName(rel),
812 BufferGetBlockNumber(buf)),
813 errhint("Please REINDEX it.")));
814
815 /*
816 * Additionally check that the special area looks sane.
817 */
818 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
819 ereport(ERROR,
820 (errcode(ERRCODE_INDEX_CORRUPTED),
821 errmsg("index \"%s\" contains corrupted page at block %u",
822 RelationGetRelationName(rel),
823 BufferGetBlockNumber(buf)),
824 errhint("Please REINDEX it.")));
825}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3795
static uint16 PageGetSpecialSize(const PageData *page)
Definition: bufpage.h:317
#define MAXALIGN(LEN)
Definition: c.h:782
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define RelationGetRelationName(relation)
Definition: rel.h:547

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1093 of file nbtpage.c.

1094{
1095 /* ConditionalLockBuffer() asserts that pin is held by this backend */
1096 if (!ConditionalLockBuffer(buf))
1097 return false;
1098
1099 if (!RelationUsesLocalBuffers(rel))
1100 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1101
1102 return true;
1103}
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5177

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().
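A hedged sketch of the try-lock idiom _bt_allocbuf() uses (example_try_page() is hypothetical). Note that on failure only the lock attempt failed, so the caller must still drop its own pin with ReleaseBuffer():

/* Sketch: conditionally write-lock a pinned page, backing off on contention */
static bool
example_try_page(Relation rel, BlockNumber blkno)
{
    Buffer buf = ReadBuffer(rel, blkno);    /* acquires a pin only, no lock */

    if (!_bt_conditionallockbuf(rel, buf))
    {
        ReleaseBuffer(buf);                 /* someone else holds a lock */
        return false;
    }

    /* ... inspect BufferGetPage(buf) under the exclusive lock ... */

    _bt_relbuf(rel, buf);                   /* unlock + unpin */
    return true;
}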

◆ _bt_delitems_cmp()

static int _bt_delitems_cmp ( const void *  a,
const void *  b 
)
static

Definition at line 1464 of file nbtpage.c.

1465{
1466 TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1467 TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1468
1469 Assert(indexdelete1->id != indexdelete2->id);
1470
1471 return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
1472}
static int pg_cmp_s16(int16 a, int16 b)
Definition: int.h:634
int b
Definition: isn.c:71
int a
Definition: isn.c:70

References a, Assert(), b, TM_IndexDelete::id, and pg_cmp_s16().

Referenced by _bt_delitems_delete_check().

◆ _bt_delitems_delete()

static void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
TransactionId  snapshotConflictHorizon,
bool  isCatalogRel,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)
static

Definition at line 1284 of file nbtpage.c.

1288{
1289 Page page = BufferGetPage(buf);
1290 BTPageOpaque opaque;
1291 bool needswal = RelationNeedsWAL(rel);
1292 char *updatedbuf = NULL;
1293 Size updatedbuflen = 0;
1294 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1295
1296 /* Shouldn't be called unless there's something to do */
1297 Assert(ndeletable > 0 || nupdatable > 0);
1298
1299 /* Generate new versions of posting lists without deleted TIDs */
1300 if (nupdatable > 0)
1301 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1302 updatedoffsets, &updatedbuflen,
1303 needswal);
1304
1305 /* No ereport(ERROR) until changes are logged */
1306 START_CRIT_SECTION();
1307
1308 /* Handle updates and deletes just like _bt_delitems_vacuum */
1309 for (int i = 0; i < nupdatable; i++)
1310 {
1311 OffsetNumber updatedoffset = updatedoffsets[i];
1312 IndexTuple itup;
1313 Size itemsz;
1314
1315 itup = updatable[i]->itup;
1316 itemsz = MAXALIGN(IndexTupleSize(itup));
1317 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1318 itemsz))
1319 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1320 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1321 }
1322
1323 if (ndeletable > 0)
1324 PageIndexMultiDelete(page, deletable, ndeletable);
1325
1326 /*
1327 * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1328 * this point. The VACUUM command alone controls vacuum cycle IDs.
1329 */
1330 opaque = BTPageGetOpaque(page);
1331
1332 /*
1333 * Clear the BTP_HAS_GARBAGE page flag.
1334 *
1335 * This flag indicates the presence of LP_DEAD items on the page (though
1336 * not reliably). Note that we only rely on it with pg_upgrade'd
1337 * !heapkeyspace indexes.
1338 */
1339 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1340
1341 MarkBufferDirty(buf);
1342
1343 /* XLOG stuff */
1344 if (needswal)
1345 {
1346 XLogRecPtr recptr;
1347 xl_btree_delete xlrec_delete;
1348
1349 xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
1350 xlrec_delete.ndeleted = ndeletable;
1351 xlrec_delete.nupdated = nupdatable;
1352 xlrec_delete.isCatalogRel = isCatalogRel;
1353
1354 XLogBeginInsert();
1355 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1356 XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
1357
1358 if (ndeletable > 0)
1359 XLogRegisterBufData(0, deletable,
1360 ndeletable * sizeof(OffsetNumber));
1361
1362 if (nupdatable > 0)
1363 {
1364 XLogRegisterBufData(0, updatedoffsets,
1365 nupdatable * sizeof(OffsetNumber));
1366 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1367 }
1368
1369 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1370
1371 PageSetLSN(page, recptr);
1372 }
1373
1374 END_CRIT_SECTION();
1375
1376 /* can't leak memory here */
1377 if (updatedbuf != NULL)
1378 pfree(updatedbuf);
1379 /* free tuples allocated within _bt_delitems_update() */
1380 for (int i = 0; i < nupdatable; i++)
1381 pfree(updatable[i]->itup);
1382}
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2596
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1150
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1394
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
size_t Size
Definition: c.h:576
#define PANIC
Definition: elog.h:42
int i
Definition: isn.c:74
Pointer Item
Definition: item.h:17
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void pfree(void *pointer)
Definition: mcxt.c:1524
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1405
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define SizeOfBtreeDelete
Definition: nbtxlog.h:253
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:34
uint16 OffsetNumber
Definition: off.h:24
uint16 btpo_flags
Definition: nbtree.h:67
IndexTuple itup
Definition: nbtree.h:911
TransactionId snapshotConflictHorizon
Definition: nbtxlog.h:238
bool isCatalogRel
Definition: nbtxlog.h:241
uint16 ndeleted
Definition: nbtxlog.h:239
uint16 nupdated
Definition: nbtxlog.h:240
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
#define REGBUF_STANDARD
Definition: xloginsert.h:35

References _bt_delitems_update(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize(), xl_btree_delete::isCatalogRel, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_delete::ndeleted, xl_btree_delete::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeDelete, xl_btree_delete::snapshotConflictHorizon, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delitems_delete_check().

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1513 of file nbtpage.c.

1515{
1516 Page page = BufferGetPage(buf);
1517 TransactionId snapshotConflictHorizon;
1518 bool isCatalogRel;
1519 OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1520 int ndeletable = 0,
1521 nupdatable = 0;
1522 OffsetNumber deletable[MaxIndexTuplesPerPage];
1523 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1524
1525 /* Use tableam interface to determine which tuples to delete first */
1526 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1527 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1528
1529 /* Should not WAL-log snapshotConflictHorizon unless it's required */
1530 if (!XLogStandbyInfoActive())
1531 snapshotConflictHorizon = InvalidTransactionId;
1532
1533 /*
1534 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1535 * needs to do to physically delete index tuples from the page.
1536 *
1537 * Must sort deltids array to restore leaf-page-wise order (original order
1538 * before call to tableam). This is the order that the loop expects.
1539 *
1540 * Note that deltids array might be a lot smaller now. It might even have
1541 * no entries at all (with bottom-up deletion caller), in which case there
1542 * is nothing left to do.
1543 */
1544 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1545 _bt_delitems_cmp);
1546 if (delstate->ndeltids == 0)
1547 {
1548 Assert(delstate->bottomup);
1549 return;
1550 }
1551
1552 /* We definitely have to delete at least one index tuple (or one TID) */
1553 for (int i = 0; i < delstate->ndeltids; i++)
1554 {
1555 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1556 OffsetNumber idxoffnum = dstatus->idxoffnum;
1557 ItemId itemid = PageGetItemId(page, idxoffnum);
1558 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1559 int nestedi,
1560 nitem;
1561 BTVacuumPosting vacposting;
1562
1563 Assert(OffsetNumberIsValid(idxoffnum));
1564
1565 if (idxoffnum == postingidxoffnum)
1566 {
1567 /*
1568 * This deltid entry is a TID from a posting list tuple that has
1569 * already been completely processed
1570 */
1571 Assert(BTreeTupleIsPosting(itup));
1572 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1573 &delstate->deltids[i].tid) < 0);
1574 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1575 &delstate->deltids[i].tid) >= 0);
1576 continue;
1577 }
1578
1579 if (!BTreeTupleIsPosting(itup))
1580 {
1581 /* Plain non-pivot tuple */
1582 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1583 if (dstatus->knowndeletable)
1584 deletable[ndeletable++] = idxoffnum;
1585 continue;
1586 }
1587
1588 /*
1589 * itup is a posting list tuple whose lowest deltids entry (which may
1590 * or may not be for the first TID from itup) is considered here now.
1591 * We should process all of the deltids entries for the posting list
1592 * together now, though (not just the lowest). Remember to skip over
1593 * later itup-related entries during later iterations of outermost
1594 * loop.
1595 */
1596 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1597 nestedi = i; /* Initialize for first itup deltids entry */
1598 vacposting = NULL; /* Describes final action for itup */
1599 nitem = BTreeTupleGetNPosting(itup);
1600 for (int p = 0; p < nitem; p++)
1601 {
1602 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1603 int ptidcmp = -1;
1604
1605 /*
1606 * This nested loop reuses work across ptid TIDs taken from itup.
1607 * We take advantage of the fact that both itup's TIDs and deltids
1608 * entries (within a single itup/posting list grouping) must both
1609 * be in ascending TID order.
1610 */
1611 for (; nestedi < delstate->ndeltids; nestedi++)
1612 {
1613 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1614 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1615
1616 /* Stop once we get past all itup related deltids entries */
1617 Assert(tdstatus->idxoffnum >= idxoffnum);
1618 if (tdstatus->idxoffnum != idxoffnum)
1619 break;
1620
1621 /* Skip past non-deletable itup related entries up front */
1622 if (!tdstatus->knowndeletable)
1623 continue;
1624
1625 /* Entry is first partial ptid match (or an exact match)? */
1626 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1627 if (ptidcmp >= 0)
1628 {
1629 /* Greater than or equal (partial or exact) match... */
1630 break;
1631 }
1632 }
1633
1634 /* ...exact ptid match to a deletable deltids entry? */
1635 if (ptidcmp != 0)
1636 continue;
1637
1638 /* Exact match for deletable deltids entry -- ptid gets deleted */
1639 if (vacposting == NULL)
1640 {
1641 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1642 nitem * sizeof(uint16));
1643 vacposting->itup = itup;
1644 vacposting->updatedoffset = idxoffnum;
1645 vacposting->ndeletedtids = 0;
1646 }
1647 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1648 }
1649
1650 /* Final decision on itup, a posting list tuple */
1651
1652 if (vacposting == NULL)
1653 {
1654 /* No TIDs to delete from itup -- do nothing */
1655 }
1656 else if (vacposting->ndeletedtids == nitem)
1657 {
1658 /* Straight delete of itup (to delete all TIDs) */
1659 deletable[ndeletable++] = idxoffnum;
1660 /* Turns out we won't need granular information */
1661 pfree(vacposting);
1662 }
1663 else
1664 {
1665 /* Delete some (but not all) TIDs from itup */
1666 Assert(vacposting->ndeletedtids > 0 &&
1667 vacposting->ndeletedtids < nitem);
1668 updatable[nupdatable++] = vacposting;
1669 }
1670 }
1671
1672 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1673 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1674 deletable, ndeletable, updatable, nupdatable);
1675
1676 /* be tidy */
1677 for (int i = 0; i < nupdatable; i++)
1678 pfree(updatable[i]);
1679}
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
uint16_t uint16
Definition: c.h:501
uint32 TransactionId
Definition: c.h:623
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
IndexTupleData * IndexTuple
Definition: itup.h:53
void * palloc(Size size)
Definition: mcxt.c:1317
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1284
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1464
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:475
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:916
uint16 ndeletedtids
Definition: nbtree.h:915
OffsetNumber updatedoffset
Definition: nbtree.h:912
ItemPointerData t_tid
Definition: itup.h:37
TM_IndexStatus * status
Definition: tableam.h:255
TM_IndexDelete * deltids
Definition: tableam.h:254
ItemPointerData tid
Definition: tableam.h:213
bool knowndeletable
Definition: tableam.h:220
OffsetNumber idxoffnum
Definition: tableam.h:219
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1326
#define InvalidTransactionId
Definition: transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert(), TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
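A hedged sketch of the state a caller assembles before this call (field values are illustrative; see tableam.h for the full TM_IndexDeleteOp contract, and the real callers _bt_simpledel_pass() and _bt_bottomupdel_pass()). Each deltids[i].id indexes the parallel status[] array, and buf must already be write-locked:

/* Sketch: propose a single known-dead index tuple for simple deletion */
static void
example_simple_delete(Relation rel, Buffer buf, Relation heapRel,
                      OffsetNumber offnum, ItemPointer htid)
{
    TM_IndexDeleteOp delstate;
    TM_IndexDelete   deltid;
    TM_IndexStatus   status;

    delstate.irel = rel;                            /* assumed fields;  */
    delstate.iblknum = BufferGetBlockNumber(buf);   /* see tableam.h    */
    delstate.bottomup = false;
    delstate.bottomupfreespace = 0;
    delstate.ndeltids = 1;
    delstate.deltids = &deltid;
    delstate.status = &status;

    deltid.tid = *htid;             /* heap TID from the index tuple */
    deltid.id = 0;                  /* offset into the status[] array */
    status.idxoffnum = offnum;      /* leaf page offset of the index tuple */
    status.knowndeletable = true;   /* its LP_DEAD bit was found set */
    status.promising = false;
    status.freespace = 0;

    /* tableam recheck, sorting, posting-list handling, and WAL-logging
     * all happen inside */
    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
}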

◆ _bt_delitems_update()

static char * _bt_delitems_update ( BTVacuumPosting updatable,
int  nupdatable,
OffsetNumber updatedoffsets,
Size updatedbuflen,
bool  needswal 
)
static

Definition at line 1405 of file nbtpage.c.

1408{
1409 char *updatedbuf = NULL;
1410 Size buflen = 0;
1411
1412 /* Shouldn't be called unless there's something to do */
1413 Assert(nupdatable > 0);
1414
1415 for (int i = 0; i < nupdatable; i++)
1416 {
1417 BTVacuumPosting vacposting = updatable[i];
1418 Size itemsz;
1419
1420 /* Replace work area IndexTuple with updated version */
1421 _bt_update_posting(vacposting);
1422
1423 /* Keep track of size of xl_btree_update for updatedbuf in passing */
1424 itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1425 buflen += itemsz;
1426
1427 /* Build updatedoffsets buffer in passing */
1428 updatedoffsets[i] = vacposting->updatedoffset;
1429 }
1430
1431 /* XLOG stuff */
1432 if (needswal)
1433 {
1434 Size offset = 0;
1435
1436 /* Allocate, set final size for caller */
1437 updatedbuf = palloc(buflen);
1438 *updatedbuflen = buflen;
1439 for (int i = 0; i < nupdatable; i++)
1440 {
1441 BTVacuumPosting vacposting = updatable[i];
1442 Size itemsz;
1443 xl_btree_update update;
1444
1445 update.ndeletedtids = vacposting->ndeletedtids;
1446 memcpy(updatedbuf + offset, &update.ndeletedtids,
1447 SizeOfBtreeUpdate);
1448 offset += SizeOfBtreeUpdate;
1449
1450 itemsz = update.ndeletedtids * sizeof(uint16);
1451 memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1452 offset += itemsz;
1453 }
1454 }
1455
1456 return updatedbuf;
1457}
void _bt_update_posting(BTVacuumPosting vacposting)
Definition: nbtdedup.c:924
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:268
uint16 ndeletedtids
Definition: nbtxlog.h:263

References _bt_update_posting(), Assert(), BTVacuumPostingData::deletetids, i, BTVacuumPostingData::ndeletedtids, xl_btree_update::ndeletedtids, palloc(), SizeOfBtreeUpdate, and BTVacuumPostingData::updatedoffset.

Referenced by _bt_delitems_delete(), and _bt_delitems_vacuum().
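The returned buffer is a packed sequence of variable-size entries: an xl_btree_update header (just ndeletedtids) immediately followed by that many uint16 posting-list offsets. A hedged decode sketch (example_walk_update_payload() is hypothetical; btree_xlog_delete() and btree_xlog_vacuum() do the real parsing at REDO time):

/* Sketch: walk the packed xl_btree_update entries produced above */
static void
example_walk_update_payload(char *updatedbuf, Size updatedbuflen)
{
    Size offset = 0;

    while (offset < updatedbuflen)
    {
        xl_btree_update update;

        /* copy the header out; entries are not guaranteed to be aligned */
        memcpy(&update, updatedbuf + offset, SizeOfBtreeUpdate);
        offset += SizeOfBtreeUpdate;

        /* the next update.ndeletedtids uint16s name the TIDs to delete
         * from the posting list of the matching updatedoffsets[] tuple */
        offset += update.ndeletedtids * sizeof(uint16);
    }
}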

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1154 of file nbtpage.c.

1157{
1158 Page page = BufferGetPage(buf);
1159 BTPageOpaque opaque;
1160 bool needswal = RelationNeedsWAL(rel);
1161 char *updatedbuf = NULL;
1162 Size updatedbuflen = 0;
1163 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164
1165 /* Shouldn't be called unless there's something to do */
1166 Assert(ndeletable > 0 || nupdatable > 0);
1167
1168 /* Generate new version of posting lists without deleted TIDs */
1169 if (nupdatable > 0)
1170 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171 updatedoffsets, &updatedbuflen,
1172 needswal);
1173
1174 /* No ereport(ERROR) until changes are logged */
1175 START_CRIT_SECTION();
1176
1177 /*
1178 * Handle posting tuple updates.
1179 *
1180 * Deliberately do this before handling simple deletes. If we did it the
1181 * other way around (i.e. WAL record order -- simple deletes before
1182 * updates) then we'd have to make compensating changes to the 'updatable'
1183 * array of offset numbers.
1184 *
1185 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186 * happens to already be set. It's important that we not interfere with
1187 * any future simple index tuple deletion operations.
1188 */
1189 for (int i = 0; i < nupdatable; i++)
1190 {
1191 OffsetNumber updatedoffset = updatedoffsets[i];
1192 IndexTuple itup;
1193 Size itemsz;
1194
1195 itup = updatable[i]->itup;
1196 itemsz = MAXALIGN(IndexTupleSize(itup));
1197 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1198 itemsz))
1199 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1200 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1201 }
1202
1203 /* Now handle simple deletes of entire tuples */
1204 if (ndeletable > 0)
1205 PageIndexMultiDelete(page, deletable, ndeletable);
1206
1207 /*
1208 * We can clear the vacuum cycle ID since this page has certainly been
1209 * processed by the current vacuum scan.
1210 */
1211 opaque = BTPageGetOpaque(page);
1212 opaque->btpo_cycleid = 0;
1213
1214 /*
1215 * Clear the BTP_HAS_GARBAGE page flag.
1216 *
1217 * This flag indicates the presence of LP_DEAD items on the page (though
1218 * not reliably). Note that we only rely on it with pg_upgrade'd
1219 * !heapkeyspace indexes. That's why clearing it here won't usually
1220 * interfere with simple index tuple deletion.
1221 */
1222 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223
1224 MarkBufferDirty(buf);
1225
1226 /* XLOG stuff */
1227 if (needswal)
1228 {
1229 XLogRecPtr recptr;
1230 xl_btree_vacuum xlrec_vacuum;
1231
1232 xlrec_vacuum.ndeleted = ndeletable;
1233 xlrec_vacuum.nupdated = nupdatable;
1234
1235 XLogBeginInsert();
1236 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1237 XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1238
1239 if (ndeletable > 0)
1240 XLogRegisterBufData(0, deletable,
1241 ndeletable * sizeof(OffsetNumber));
1242
1243 if (nupdatable > 0)
1244 {
1245 XLogRegisterBufData(0, updatedoffsets,
1246 nupdatable * sizeof(OffsetNumber));
1247 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248 }
1249
1250 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251
1252 PageSetLSN(page, recptr);
1253 }
1254
1255 END_CRIT_SECTION();
1256
1257 /* can't leak memory here */
1258 if (updatedbuf != NULL)
1259 pfree(updatedbuf);
1260 /* free tuples allocated within _bt_delitems_update() */
1261 for (int i = 0; i < nupdatable; i++)
1262 pfree(updatable[i]->itup);
1263}
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223

References _bt_delitems_update(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize(), BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
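A hedged sketch of the calling convention (example_vacuum_leaf() is hypothetical; btvacuumpage() is the real caller, it holds a cleanup lock, and it decides deadness via the heap-scan callback rather than the LP_DEAD bit used here for brevity):

/* Sketch: delete every dead-marked item found on one write-locked leaf page */
static void
example_vacuum_leaf(Relation rel, Buffer buf)
{
    Page            page = BufferGetPage(buf);
    OffsetNumber    deletable[MaxIndexTuplesPerPage];
    int             ndeletable = 0;
    OffsetNumber    offnum,
                    minoff = P_FIRSTDATAKEY(BTPageGetOpaque(page)),
                    maxoff = PageGetMaxOffsetNumber(page);

    for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        ItemId itemid = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemid))       /* stand-in deadness test */
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable, NULL, 0);
}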

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 845 of file nbtpage.c.

846{
847 Buffer buf;
848
849 Assert(BlockNumberIsValid(blkno));
850
851 /* Read an existing block of the relation */
852 buf = ReadBuffer(rel, blkno);
853 _bt_lockbuf(rel, buf, access);
854 _bt_checkpage(rel, buf);
855
856 return buf;
857}
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
short access
Definition: preproc-type.c:36

References _bt_checkpage(), _bt_lockbuf(), Assert(), BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_vacuum_needs_cleanup().
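_bt_getbuf() bundles ReadBuffer(), _bt_lockbuf(), and _bt_checkpage(); it is only for blocks known to exist (new pages come from _bt_allocbuf()). A hedged usage sketch (example_page_level() is hypothetical):

/* Sketch: pin, lock, validate, inspect, and release one existing page */
static uint32
example_page_level(Relation rel, BlockNumber blkno)
{
    Buffer          buf = _bt_getbuf(rel, blkno, BT_READ);
    BTPageOpaque    opaque = BTPageGetOpaque(BufferGetPage(buf));
    uint32          level = opaque->btpo_level;    /* 0 for leaf pages */

    _bt_relbuf(rel, buf);       /* unlock + unpin */
    return level;
}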

◆ _bt_getmeta()

static BTMetaPageData * _bt_getmeta ( Relation  rel,
Buffer  metabuf 
)
static

Definition at line 142 of file nbtpage.c.

143{
144 Page metapg;
145 BTPageOpaque metaopaque;
146 BTMetaPageData *metad;
147
148 metapg = BufferGetPage(metabuf);
149 metaopaque = BTPageGetOpaque(metapg);
150 metad = BTPageGetMeta(metapg);
151
152 /* sanity-check the metapage */
153 if (!P_ISMETA(metaopaque) ||
154 metad->btm_magic != BTREE_MAGIC)
155 ereport(ERROR,
156 (errcode(ERRCODE_INDEX_CORRUPTED),
157 errmsg("index \"%s\" is not a btree",
158 RelationGetRelationName(rel))));
159
160 if (metad->btm_version < BTREE_MIN_VERSION ||
161 metad->btm_version > BTREE_VERSION)
162 ereport(ERROR,
163 (errcode(ERRCODE_INDEX_CORRUPTED),
164 errmsg("version mismatch in index \"%s\": file version %d, "
165 "current version %d, minimal supported version %d",
166 RelationGetRelationName(rel),
167 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
168
169 return metad;
170}
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define P_ISMETA(opaque)
Definition: nbtree.h:223
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTREE_VERSION
Definition: nbtree.h:150
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105

References BTMetaPageData::btm_magic, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTREE_MAGIC, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), ereport, errcode(), errmsg(), ERROR, P_ISMETA, and RelationGetRelationName.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_metaversion().

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)

Definition at line 344 of file nbtpage.c.

345{
346 Buffer metabuf;
347 Buffer rootbuf;
348 Page rootpage;
349 BTPageOpaque rootopaque;
350 BlockNumber rootblkno;
351 uint32 rootlevel;
352 BTMetaPageData *metad;
353
354 Assert(access == BT_READ || heaprel != NULL);
355
356 /*
357 * Try to use previously-cached metapage data to find the root. This
358 * normally saves one buffer access per index search, which is a very
359 * helpful savings in bufmgr traffic and hence contention.
360 */
361 if (rel->rd_amcache != NULL)
362 {
363 metad = (BTMetaPageData *) rel->rd_amcache;
364 /* We shouldn't have cached it if any of these fail */
365 Assert(metad->btm_magic == BTREE_MAGIC);
366 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
367 Assert(metad->btm_version <= BTREE_VERSION);
368 Assert(!metad->btm_allequalimage ||
369 metad->btm_version > BTREE_NOVAC_VERSION);
370 Assert(metad->btm_root != P_NONE);
371
372 rootblkno = metad->btm_fastroot;
373 Assert(rootblkno != P_NONE);
374 rootlevel = metad->btm_fastlevel;
375
376 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377 rootpage = BufferGetPage(rootbuf);
378 rootopaque = BTPageGetOpaque(rootpage);
379
380 /*
381 * Since the cache might be stale, we check the page more carefully
382 * here than normal. We *must* check that it's not deleted. If it's
383 * not alone on its level, then we reject too --- this may be overly
384 * paranoid but better safe than sorry. Note we don't check P_ISROOT,
385 * because that's not set in a "fast root".
386 */
387 if (!P_IGNORE(rootopaque) &&
388 rootopaque->btpo_level == rootlevel &&
389 P_LEFTMOST(rootopaque) &&
390 P_RIGHTMOST(rootopaque))
391 {
392 /* OK, accept cached page as the root */
393 return rootbuf;
394 }
395 _bt_relbuf(rel, rootbuf);
396 /* Cache is stale, throw it away */
397 if (rel->rd_amcache)
398 pfree(rel->rd_amcache);
399 rel->rd_amcache = NULL;
400 }
401
402 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403 metad = _bt_getmeta(rel, metabuf);
404
405 /* if no root page initialized yet, do it */
406 if (metad->btm_root == P_NONE)
407 {
408 Page metapg;
409
410 /* If access = BT_READ, caller doesn't want us to create root yet */
411 if (access == BT_READ)
412 {
413 _bt_relbuf(rel, metabuf);
414 return InvalidBuffer;
415 }
416
417 /* trade in our read lock for a write lock */
418 _bt_unlockbuf(rel, metabuf);
419 _bt_lockbuf(rel, metabuf, BT_WRITE);
420
421 /*
422 * Race condition: if someone else initialized the metadata between
423 * the time we released the read lock and acquired the write lock, we
424 * must avoid doing it again.
425 */
426 if (metad->btm_root != P_NONE)
427 {
428 /*
429 * Metadata initialized by someone else. In order to guarantee no
430 * deadlocks, we have to release the metadata page and start all
431 * over again. (Is that really true? But it's hardly worth trying
432 * to optimize this case.)
433 */
434 _bt_relbuf(rel, metabuf);
435 return _bt_getroot(rel, heaprel, access);
436 }
437
438 /*
439 * Get, initialize, write, and leave a lock of the appropriate type on
440 * the new root page. Since this is the first page in the tree, it's
441 * a leaf as well as the root.
442 */
443 rootbuf = _bt_allocbuf(rel, heaprel);
444 rootblkno = BufferGetBlockNumber(rootbuf);
445 rootpage = BufferGetPage(rootbuf);
446 rootopaque = BTPageGetOpaque(rootpage);
447 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449 rootopaque->btpo_level = 0;
450 rootopaque->btpo_cycleid = 0;
451 /* Get raw page pointer for metapage */
452 metapg = BufferGetPage(metabuf);
453
454 /* NO ELOG(ERROR) till meta is updated */
455 START_CRIT_SECTION();
456
457 /* upgrade metapage if needed */
458 if (metad->btm_version < BTREE_NOVAC_VERSION)
459 _bt_upgrademetapage(metapg);
460
461 metad->btm_root = rootblkno;
462 metad->btm_level = 0;
463 metad->btm_fastroot = rootblkno;
464 metad->btm_fastlevel = 0;
465 metad->btm_last_cleanup_num_delpages = 0;
466 metad->btm_last_cleanup_num_heap_tuples = -1.0;
467
468 MarkBufferDirty(rootbuf);
469 MarkBufferDirty(metabuf);
470
471 /* XLOG stuff */
472 if (RelationNeedsWAL(rel))
473 {
474 xl_btree_newroot xlrec;
475 XLogRecPtr recptr;
476 xl_btree_metadata md;
477
478 XLogBeginInsert();
479 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481
482 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483 md.version = metad->btm_version;
484 md.root = rootblkno;
485 md.level = 0;
486 md.fastroot = rootblkno;
487 md.fastlevel = 0;
488 md.last_cleanup_num_delpages = 0;
489 md.allequalimage = metad->btm_allequalimage;
490
491 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
492
493 xlrec.rootblk = rootblkno;
494 xlrec.level = 0;
495
496 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
497
498 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499
500 PageSetLSN(rootpage, recptr);
501 PageSetLSN(metapg, recptr);
502 }
503
504 END_CRIT_SECTION();
505
506 /*
507 * swap root write lock for read lock. There is no danger of anyone
508 * else accessing the new root page while it's unlocked, since no one
509 * else knows where it is yet.
510 */
511 _bt_unlockbuf(rel, rootbuf);
512 _bt_lockbuf(rel, rootbuf, BT_READ);
513
514 /* okay, metadata is correct, release lock on it without caching */
515 _bt_relbuf(rel, metabuf);
516 }
517 else
518 {
519 rootblkno = metad->btm_fastroot;
520 Assert(rootblkno != P_NONE);
521 rootlevel = metad->btm_fastlevel;
522
523 /*
524 * Cache the metapage data for next time
525 */
526 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527 sizeof(BTMetaPageData));
528 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529
530 /*
531 * We are done with the metapage; arrange to release it via first
532 * _bt_relandgetbuf call
533 */
534 rootbuf = metabuf;
535
536 for (;;)
537 {
538 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539 rootpage = BufferGetPage(rootbuf);
540 rootopaque = BTPageGetOpaque(rootpage);
541
542 if (!P_IGNORE(rootopaque))
543 break;
544
545 /* it's dead, Jim. step right one page */
546 if (P_RIGHTMOST(rootopaque))
547 elog(ERROR, "no live root page found in index \"%s\"",
549 rootblkno = rootopaque->btpo_next;
550 }
551
552 if (rootopaque->btpo_level != rootlevel)
553 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554 rootblkno, RelationGetRelationName(rel),
555 rootopaque->btpo_level, rootlevel);
556 }
557
558 /*
559 * By here, we have a pin and read lock on the root page, and no lock set
560 * on the metadata page. Return the root page's buffer.
561 */
562 return rootbuf;
563}
#define InvalidBuffer
Definition: buf.h:25
uint32_t uint32
Definition: c.h:502
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:142
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
#define BTP_LEAF
Definition: nbtree.h:76
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTP_ROOT
Definition: nbtree.h:77
#define P_NONE
Definition: nbtree.h:212
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_READ
Definition: nbtree.h:724
#define P_IGNORE(opaque)
Definition: nbtree.h:225
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
#define BT_WRITE
Definition: nbtree.h:725
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
BlockNumber btm_root
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint32 btpo_level
Definition: nbtree.h:66
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_getroot(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().
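A hedged sketch of how a descent begins (example_visit_root() is hypothetical; the real entry points are _bt_search() and _bt_get_endpoint()). With BT_READ a NULL heaprel is allowed, and InvalidBuffer means the index has no root yet:

/* Sketch: fetch the (fast) root read-locked, tolerating an empty index */
static void
example_visit_root(Relation rel)
{
    Buffer rootbuf = _bt_getroot(rel, NULL, BT_READ);

    if (!BufferIsValid(rootbuf))
        return;                 /* empty index: no root was created yet */

    /* ... descend from BufferGetPage(rootbuf), re-locking level by level ... */

    _bt_relbuf(rel, rootbuf);
}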

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 675 of file nbtpage.c.

676{
677 BTMetaPageData *metad;
678
679 if (rel->rd_amcache == NULL)
680 {
681 Buffer metabuf;
682
683 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684 metad = _bt_getmeta(rel, metabuf);
685
686 /*
687 * If there's no root page yet, _bt_getroot() doesn't expect a cache
688 * to be made, so just stop here and report the index height is zero.
689 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690 */
691 if (metad->btm_root == P_NONE)
692 {
693 _bt_relbuf(rel, metabuf);
694 return 0;
695 }
696
697 /*
698 * Cache the metapage data for next time
699 */
700 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
701 sizeof(BTMetaPageData));
702 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703 _bt_relbuf(rel, metabuf);
704 }
705
706 /* Get cached page */
707 metad = (BTMetaPageData *) rel->rd_amcache;
708 /* We shouldn't have cached it if any of these fail */
709 Assert(metad->btm_magic == BTREE_MAGIC);
710 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
711 Assert(metad->btm_version <= BTREE_VERSION);
712 Assert(!metad->btm_allequalimage ||
713 metad->btm_version > BTREE_NOVAC_VERSION);
714 Assert(metad->btm_fastroot != P_NONE);
715
716 return metad->btm_fastlevel;
717}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and btgettreeheight().
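Because the answer comes from (and primes) rd_amcache, repeated calls cost no buffer access. A hedged sketch of the kind of gating _bt_insertonpg() does with the result (the threshold here is illustrative; the real constant is BTREE_FASTPATH_MIN_LEVEL in nbtinsert.c):

/* Sketch: use the cached fast-root height to gate a rightmost-page fastpath */
static bool
example_consider_fastpath(Relation rel)
{
    return _bt_getrootheight(rel) >= 2;    /* assumed threshold */
}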

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 580 of file nbtpage.c.

581{
582 Buffer metabuf;
583 Page metapg;
584 BTPageOpaque metaopaque;
585 Buffer rootbuf;
586 Page rootpage;
587 BTPageOpaque rootopaque;
588 BlockNumber rootblkno;
589 uint32 rootlevel;
590 BTMetaPageData *metad;
591
592 /*
593 * We don't try to use cached metapage data here, since (a) this path is
594 * not performance-critical, and (b) if we are here it suggests our cache
595 * is out-of-date anyway. In light of point (b), it's probably safest to
596 * actively flush any cached metapage info.
597 */
598 if (rel->rd_amcache)
599 pfree(rel->rd_amcache);
600 rel->rd_amcache = NULL;
601
602 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603 metapg = BufferGetPage(metabuf);
604 metaopaque = BTPageGetOpaque(metapg);
605 metad = BTPageGetMeta(metapg);
606
607 if (!P_ISMETA(metaopaque) ||
608 metad->btm_magic != BTREE_MAGIC)
609 ereport(ERROR,
610 (errcode(ERRCODE_INDEX_CORRUPTED),
611 errmsg("index \"%s\" is not a btree",
612 RelationGetRelationName(rel))));
613
614 if (metad->btm_version < BTREE_MIN_VERSION ||
615 metad->btm_version > BTREE_VERSION)
616 ereport(ERROR,
617 (errcode(ERRCODE_INDEX_CORRUPTED),
618 errmsg("version mismatch in index \"%s\": file version %d, "
619 "current version %d, minimal supported version %d",
620 RelationGetRelationName(rel),
621 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
622
623 /* if no root page initialized yet, fail */
624 if (metad->btm_root == P_NONE)
625 {
626 _bt_relbuf(rel, metabuf);
627 return InvalidBuffer;
628 }
629
630 rootblkno = metad->btm_root;
631 rootlevel = metad->btm_level;
632
633 /*
634 * We are done with the metapage; arrange to release it via first
635 * _bt_relandgetbuf call
636 */
637 rootbuf = metabuf;
638
639 for (;;)
640 {
641 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642 rootpage = BufferGetPage(rootbuf);
643 rootopaque = BTPageGetOpaque(rootpage);
644
645 if (!P_IGNORE(rootopaque))
646 break;
647
648 /* it's dead, Jim. step right one page */
649 if (P_RIGHTMOST(rootopaque))
650 elog(ERROR, "no live root page found in index \"%s\"",
652 rootblkno = rootopaque->btpo_next;
653 }
654
655 if (rootopaque->btpo_level != rootlevel)
656 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657 rootblkno, RelationGetRelationName(rel),
658 rootopaque->btpo_level, rootlevel);
659
660 return rootbuf;
661}

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 67 of file nbtpage.c.

69{
70 BTMetaPageData *metad;
71 BTPageOpaque metaopaque;
72
73 _bt_pageinit(page, BLCKSZ);
74
75 metad = BTPageGetMeta(page);
76 metad->btm_magic = BTREE_MAGIC;
77 metad->btm_version = BTREE_VERSION;
78 metad->btm_root = rootbknum;
79 metad->btm_level = level;
80 metad->btm_fastroot = rootbknum;
81 metad->btm_fastlevel = level;
82 metad->btm_last_cleanup_num_delpages = 0;
83 metad->btm_last_cleanup_num_heap_tuples = -1.0;
84 metad->btm_allequalimage = allequalimage;
85
86 metaopaque = BTPageGetOpaque(page);
87 metaopaque->btpo_flags = BTP_META;
88
89 /*
90 * Set pd_lower just past the end of the metadata. This is essential,
91 * because without doing so, metadata will be lost if xlog.c compresses
92 * the page.
93 */
94 ((PageHeader) page)->pd_lower =
95 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
96}
PageHeaderData * PageHeader
Definition: bufpage.h:174
#define BTP_META
Definition: nbtree.h:79

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
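A hedged sketch of the btbuildempty()-style sequence: build the metapage image for a brand-new empty index in local memory (the palloc'd page stands in for whatever shared buffer the real caller writes through; _bt_allequalimage() is declared in nbtree.h):

/* Sketch: construct the metapage image for a new, empty btree */
static Page
example_build_empty_metapage(Relation index)
{
    Page metapage = (Page) palloc(BLCKSZ);  /* _bt_initmetapage() zeroes it */

    /* No root exists yet: root and fastroot are P_NONE at level 0 */
    _bt_initmetapage(metapage, P_NONE, 0,
                     _bt_allequalimage(index, false));

    return metapage;
}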

◆ _bt_leftsib_splitflag()

static bool _bt_leftsib_splitflag ( Relation  rel,
BlockNumber  leftsib,
BlockNumber  target 
)
static

Definition at line 1695 of file nbtpage.c.

1696{
1697 Buffer buf;
1698 Page page;
1699 BTPageOpaque opaque;
1700 bool result;
1701
1702 /* Easy case: No left sibling */
1703 if (leftsib == P_NONE)
1704 return false;
1705
1706 buf = _bt_getbuf(rel, leftsib, BT_READ);
1707 page = BufferGetPage(buf);
1708 opaque = BTPageGetOpaque(page);
1709
1710 /*
1711 * If the left sibling was concurrently split, so that its next-pointer
1712 * doesn't point to the current page anymore, the split that created
1713 * target must be completed. Caller can reasonably expect that there will
1714 * be a downlink to the target page that it can relocate using its stack.
1715 * (We don't allow splitting an incompletely split page again until the
1716 * previous split has been completed.)
1717 */
1718 result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1719 _bt_relbuf(rel, buf);
1720
1721 return result;
1722}
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetPage(), P_INCOMPLETE_SPLIT, and P_NONE.

Referenced by _bt_lock_subtree_parent(), and _bt_pagedel().

◆ _bt_lock_subtree_parent()

static bool _bt_lock_subtree_parent ( Relation  rel,
Relation  heaprel,
BlockNumber  child,
BTStack  stack,
Buffer subtreeparent,
OffsetNumber poffset,
BlockNumber topparent,
BlockNumber topparentrightsib 
)
static

Definition at line 2813 of file nbtpage.c.

2817{
2818 BlockNumber parent,
2819 leftsibparent;
2820 OffsetNumber parentoffset,
2821 maxoff;
2822 Buffer pbuf;
2823 Page page;
2824 BTPageOpaque opaque;
2825
2826 /*
2827 * Locate the pivot tuple whose downlink points to "child". Write lock
2828 * the parent page itself.
2829 */
2830 pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
2831 if (pbuf == InvalidBuffer)
2832 {
2833 /*
2834 * Failed to "re-find" a pivot tuple whose downlink matched our child
2835 * block number on the parent level -- the index must be corrupt.
2836 * Don't even try to delete the leafbuf subtree. Just report the
2837 * issue and press on with vacuuming the index.
2838 *
2839 * Note: _bt_getstackbuf() recovers from concurrent page splits that
2840 * take place on the parent level. Its approach is a near-exhaustive
2841 * linear search. This also gives it a surprisingly good chance of
2842 * recovering in the event of a buggy or inconsistent opclass. But we
2843 * don't rely on that here.
2844 */
2845 ereport(LOG,
2846 (errcode(ERRCODE_INDEX_CORRUPTED),
2847 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2848 RelationGetRelationName(rel), child)));
2849 Assert(false);
2850 return false;
2851 }
2852
2853 parent = stack->bts_blkno;
2854 parentoffset = stack->bts_offset;
2855
2856 page = BufferGetPage(pbuf);
2857 opaque = BTPageGetOpaque(page);
2858 maxoff = PageGetMaxOffsetNumber(page);
2859 leftsibparent = opaque->btpo_prev;
2860
2861 /*
2862 * _bt_getstackbuf() completes page splits on returned parent buffer when
2863 * required.
2864 *
2865 * In general it's a bad idea for VACUUM to use up more disk space, which
2866 * is why page deletion does not finish incomplete page splits most of the
2867 * time. We allow this limited exception because the risk is much lower,
2868 * and the potential downside of not proceeding is much higher: A single
2869 * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2870 * prevent us from deleting hundreds of empty leaf pages from one level
2871 * down.
2872 */
2873 Assert(!P_INCOMPLETE_SPLIT(opaque));
2874
2875 if (parentoffset < maxoff)
2876 {
2877 /*
2878 * Child is not the rightmost child in parent, so it's safe to delete
2879 * the subtree whose root/topparent is child page
2880 */
2881 *subtreeparent = pbuf;
2882 *poffset = parentoffset;
2883 return true;
2884 }
2885
2886 /*
2887 * Child is the rightmost child of parent.
2888 *
2889 * Since it's the rightmost child of parent, deleting the child (or
2890 * deleting the subtree whose root/topparent is the child page) is only
2891 * safe when it's also possible to delete the parent.
2892 */
2893 Assert(parentoffset == maxoff);
2894 if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2895 {
2896 /*
2897 * Child isn't parent's only child, or parent is rightmost on its
2898 * entire level. Definitely cannot delete any pages.
2899 */
2900 _bt_relbuf(rel, pbuf);
2901 return false;
2902 }
2903
2904 /*
2905 * Now make sure that the parent deletion is itself safe by examining the
2906 * child's grandparent page. Recurse, passing the parent page as the
2907 * child page (child's grandparent is the parent on the next level up). If
2908 * parent deletion is unsafe, then child deletion must also be unsafe (in
2909 * which case caller cannot delete any pages at all).
2910 */
2911 *topparent = parent;
2912 *topparentrightsib = opaque->btpo_next;
2913
2914 /*
2915 * Release lock on parent before recursing.
2916 *
2917 * It's OK to release page locks on parent before recursive call locks
2918 * grandparent. An internal page can only acquire an entry if the child
2919 * is split, but that cannot happen as long as we still hold a lock on the
2920 * leafbuf page.
2921 */
2922 _bt_relbuf(rel, pbuf);
2923
2924 /*
2925 * Before recursing, check that the left sibling of parent (if any) is not
2926 * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2927 * parent lock).
2928 *
2929 * Note: We deliberately avoid completing incomplete splits here.
2930 */
2931 if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2932 return false;
2933
2934 /* Recurse to examine child page's grandparent page */
2935 return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
2936 subtreeparent, poffset,
2937 topparent, topparentrightsib);
2938}
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
#define LOG
Definition: elog.h:31
Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2319
static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
Definition: nbtpage.c:2813
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1695
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
BlockNumber bts_blkno
Definition: nbtree.h:739
struct BTStackData * bts_parent
Definition: nbtree.h:741
OffsetNumber bts_offset
Definition: nbtree.h:740

References _bt_getstackbuf(), _bt_leftsib_splitflag(), _bt_lock_subtree_parent(), _bt_relbuf(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetPage(), ereport, errcode(), errmsg_internal(), InvalidBuffer, LOG, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_lock_subtree_parent(), and _bt_mark_page_halfdead().

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1039 of file nbtpage.c.

1040{
1041 /* LockBuffer() asserts that pin is held by this backend */
1043
1044 /*
1045 * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1046 * an nbtree error (e.g. a unique violation error). That won't cause
1047 * Valgrind false positives.
1048 *
1049 * The nbtree client requests are superimposed on top of the bufmgr.c
1050 * buffer pin client requests. In the event of an nbtree error the buffer
1051 * will certainly get marked as defined when the backend once again
1052 * acquires its first pin on the buffer. (Of course, if the backend never
1053 * touches the buffer again then it doesn't matter that it remains
1054 * non-accessible to Valgrind.)
1055 *
1056 * Note: When an IndexTuple C pointer gets computed using an ItemId read
1057 * from a page while a lock was held, the C pointer becomes unsafe to
1058 * dereference forever as soon as the lock is released. Valgrind can only
1059 * detect cases where the pointer gets dereferenced with no _current_
1060 * lock/pin held, though.
1061 */
1062 if (!RelationUsesLocalBuffers(rel))
1064}
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5151

References buf, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().

◆ _bt_mark_page_halfdead()

static bool _bt_mark_page_halfdead ( Relation  rel,
Relation  heaprel,
Buffer  leafbuf,
BTStack  stack 
)
static

Definition at line 2088 of file nbtpage.c.

2090{
2091 BlockNumber leafblkno;
2092 BlockNumber leafrightsib;
2093 BlockNumber topparent;
2094 BlockNumber topparentrightsib;
2095 ItemId itemid;
2096 Page page;
2097 BTPageOpaque opaque;
2098 Buffer subtreeparent;
2099 OffsetNumber poffset;
2100 OffsetNumber nextoffset;
2101 IndexTuple itup;
2102 IndexTupleData trunctuple;
2103
2104 page = BufferGetPage(leafbuf);
2105 opaque = BTPageGetOpaque(page);
2106
2107 Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2108 P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2109 P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2110 Assert(heaprel != NULL);
2111
2112 /*
2113 * Save info about the leaf page.
2114 */
2115 leafblkno = BufferGetBlockNumber(leafbuf);
2116 leafrightsib = opaque->btpo_next;
2117
2118 /*
2119 * Before attempting to lock the parent page, check that the right sibling
2120 * is not in half-dead state. A half-dead right sibling would have no
2121 * downlink in the parent, which would be highly confusing later when we
2122 * delete the downlink. It would fail the "right sibling of target page
2123 * is also the next child in parent page" cross-check below.
2124 */
2125 if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2126 {
2127 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2128 leafblkno, leafrightsib);
2129 return false;
2130 }
2131
2132 /*
2133 * We cannot delete a page that is the rightmost child of its immediate
2134 * parent, unless it is the only child --- in which case the parent has to
2135 * be deleted too, and the same condition applies recursively to it. We
2136 * have to check this condition all the way up before trying to delete,
2137 * and lock the parent of the root of the to-be-deleted subtree (the
2138 * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2139 * for us. We remove the downlink to the "top parent" page (subtree root
2140 * page) from the subtree parent page below.
2141 *
2142 * Initialize topparent to be leafbuf page now. The final to-be-deleted
2143 * subtree is often a degenerate one page subtree consisting only of the
2144 * leafbuf page. When that happens, the leafbuf page is the final subtree
2145 * root page/top parent page.
2146 */
2147 topparent = leafblkno;
2148 topparentrightsib = leafrightsib;
2149 if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
2150 &subtreeparent, &poffset,
2151 &topparent, &topparentrightsib))
2152 return false;
2153
2154 page = BufferGetPage(subtreeparent);
2155 opaque = BTPageGetOpaque(page);
2156
2157#ifdef USE_ASSERT_CHECKING
2158
2159 /*
2160 * This is just an assertion because _bt_lock_subtree_parent should have
2161 * guaranteed tuple has the expected contents
2162 */
2163 itemid = PageGetItemId(page, poffset);
2164 itup = (IndexTuple) PageGetItem(page, itemid);
2165 Assert(BTreeTupleGetDownLink(itup) == topparent);
2166#endif
2167
2168 nextoffset = OffsetNumberNext(poffset);
2169 itemid = PageGetItemId(page, nextoffset);
2170 itup = (IndexTuple) PageGetItem(page, itemid);
2171
2172 /*
2173 * Check that the parent-page index items we're about to delete/overwrite
2174 * in subtree parent page contain what we expect. This can fail if the
2175 * index has become corrupt for some reason. When that happens we back
2176 * out of deletion of the leafbuf subtree. (This is just like the case
2177 * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
2178 */
2179 if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2180 {
2181 ereport(LOG,
2182 (errcode(ERRCODE_INDEX_CORRUPTED),
2183 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2184 topparentrightsib, topparent,
2186 BufferGetBlockNumber(subtreeparent),
2188
2189 _bt_relbuf(rel, subtreeparent);
2190 Assert(false);
2191 return false;
2192 }
2193
2194 /*
2195 * Any insert which would have gone on the leaf block will now go to its
2196 * right sibling. In other words, the key space moves right.
2197 */
2198 PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2199
2200 /* No ereport(ERROR) until changes are logged */
2202
2203 /*
2204 * Update parent of subtree. We want to delete the downlink to the top
2205 * parent page/root of the subtree, and the *following* key. Easiest way
2206 * is to copy the right sibling's downlink over the downlink that points
2207 * to top parent page, and then delete the right sibling's original pivot
2208 * tuple.
2209 *
2210 * Lanin and Shasha make the key space move left when deleting a page,
2211 * whereas the key space moves right here. That's why we cannot simply
2212 * delete the pivot tuple with the downlink to the top parent page. See
2213 * nbtree/README.
2214 */
2215 page = BufferGetPage(subtreeparent);
2216 opaque = BTPageGetOpaque(page);
2217
2218 itemid = PageGetItemId(page, poffset);
2219 itup = (IndexTuple) PageGetItem(page, itemid);
2220 BTreeTupleSetDownLink(itup, topparentrightsib);
2221
2222 nextoffset = OffsetNumberNext(poffset);
2223 PageIndexTupleDelete(page, nextoffset);
2224
2225 /*
2226 * Mark the leaf page as half-dead, and stamp it with a link to the top
2227 * parent page. When the leaf page is also the top parent page, the link
2228 * is set to InvalidBlockNumber.
2229 */
2230 page = BufferGetPage(leafbuf);
2231 opaque = BTPageGetOpaque(page);
2232 opaque->btpo_flags |= BTP_HALF_DEAD;
2233
2235 MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2236 trunctuple.t_info = sizeof(IndexTupleData);
2237 if (topparent != leafblkno)
2238 BTreeTupleSetTopParent(&trunctuple, topparent);
2239 else
2241
2242 if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2243 IndexTupleSize(&trunctuple)))
2244 elog(ERROR, "could not overwrite high key in half-dead page");
2245
2246 /* Must mark buffers dirty before XLogInsert */
2247 MarkBufferDirty(subtreeparent);
2248 MarkBufferDirty(leafbuf);
2249
2250 /* XLOG stuff */
2251 if (RelationNeedsWAL(rel))
2252 {
2254 XLogRecPtr recptr;
2255
2256 xlrec.poffset = poffset;
2257 xlrec.leafblk = leafblkno;
2258 if (topparent != leafblkno)
2259 xlrec.topparent = topparent;
2260 else
2262
2265 XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2266
2267 page = BufferGetPage(leafbuf);
2268 opaque = BTPageGetOpaque(page);
2269 xlrec.leftblk = opaque->btpo_prev;
2270 xlrec.rightblk = opaque->btpo_next;
2271
2273
2274 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2275
2276 page = BufferGetPage(subtreeparent);
2277 PageSetLSN(page, recptr);
2278 page = BufferGetPage(leafbuf);
2279 PageSetLSN(page, recptr);
2280 }
2281
2283
2284 _bt_relbuf(rel, subtreeparent);
2285 return true;
2286}
void PageIndexTupleDelete(Page page, OffsetNumber offnum)
Definition: bufpage.c:1041
#define MemSet(start, val, len)
Definition: c.h:991
#define DEBUG1
Definition: elog.h:30
struct IndexTupleData IndexTupleData
static bool _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
Definition: nbtpage.c:1752
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTP_HALF_DEAD
Definition: nbtree.h:80
#define P_HIKEY
Definition: nbtree.h:367
static void BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
Definition: nbtree.h:626
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:562
#define P_ISROOT(opaque)
Definition: nbtree.h:221
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define SizeOfBtreeMarkPageHalfDead
Definition: nbtxlog.h:291
#define XLOG_BTREE_MARK_PAGE_HALFDEAD
Definition: nbtxlog.h:38
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3219
unsigned short t_info
Definition: itup.h:49

References _bt_lock_subtree_parent(), _bt_relbuf(), _bt_rightsib_halfdeadflag(), Assert(), BTP_HALF_DEAD, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), BTreeTupleSetDownLink(), BTreeTupleSetTopParent(), BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, IndexTupleSize(), InvalidBlockNumber, xl_btree_mark_page_halfdead::leafblk, xl_btree_mark_page_halfdead::leftblk, LOG, MarkBufferDirty(), MemSet, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageIndexTupleDelete(), PageIndexTupleOverwrite(), PageSetLSN(), xl_btree_mark_page_halfdead::poffset, PredicateLockPageCombine(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_mark_page_halfdead::rightblk, SizeOfBtreeMarkPageHalfDead, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_mark_page_halfdead::topparent, XLOG_BTREE_MARK_PAGE_HALFDEAD, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_pagedel().

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool *  heapkeyspace,
bool *  allequalimage 
)

Definition at line 739 of file nbtpage.c.

740{
741 BTMetaPageData *metad;
742
743 if (rel->rd_amcache == NULL)
744 {
745 Buffer metabuf;
746
747 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
748 metad = _bt_getmeta(rel, metabuf);
749
750 /*
751 * If there's no root page yet, _bt_getroot() doesn't expect a cache
752 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
753 * be changed to allow this case.)
754 */
755 if (metad->btm_root == P_NONE)
756 {
757 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
758 *allequalimage = metad->btm_allequalimage;
759
760 _bt_relbuf(rel, metabuf);
761 return;
762 }
763
764 /*
765 * Cache the metapage data for next time
766 *
767 * An on-the-fly version upgrade performed by _bt_upgrademetapage()
768 * can change the nbtree version for an index without invalidating any
769 * local cache. This is okay because it can only happen when moving
770 * from version 2 to version 3, both of which are !heapkeyspace
771 * versions.
772 */
774 sizeof(BTMetaPageData));
775 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
776 _bt_relbuf(rel, metabuf);
777 }
778
779 /* Get cached page */
780 metad = (BTMetaPageData *) rel->rd_amcache;
781 /* We shouldn't have cached it if any of these fail */
782 Assert(metad->btm_magic == BTREE_MAGIC);
785 Assert(!metad->btm_allequalimage ||
787 Assert(metad->btm_fastroot != P_NONE);
788
789 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
790 *allequalimage = metad->btm_allequalimage;
791}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_internal().

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState vstate 
)

Definition at line 1802 of file nbtpage.c.

1803{
1804 BlockNumber rightsib;
1805 bool rightsib_empty;
1806 Page page;
1807 BTPageOpaque opaque;
1808
1809 /*
1810 * Save original leafbuf block number from caller. Only deleted blocks
1811 * that are <= scanblkno are added to bulk delete stat's pages_deleted
1812 * count.
1813 */
1814 BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1815
1816 /*
1817 * "stack" is a search stack leading (approximately) to the target page.
1818 * It is initially NULL, but when iterating, we keep it to avoid
1819 * duplicated search effort.
1820 *
1821 * Also, when "stack" is not NULL, we have already checked that the
1822 * current page is not the right half of an incomplete split, i.e. the
1823 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1824 * when the current target page is to the right of caller's initial page
1825 * (the scanblkno page).
1826 */
1827 BTStack stack = NULL;
1828
1829 for (;;)
1830 {
1831 page = BufferGetPage(leafbuf);
1832 opaque = BTPageGetOpaque(page);
1833
1834 /*
1835 * Internal pages are never deleted directly, only as part of deleting
1836 * the whole subtree all the way down to leaf level.
1837 *
1838 * Also check for deleted pages here. Caller never passes us a fully
1839 * deleted page. Only VACUUM can delete pages, so there can't have
1840 * been a concurrent deletion. Assume that we reached any deleted
1841 * page encountered here by following a sibling link, and that the
1842 * index is corrupt.
1843 */
1844 Assert(!P_ISDELETED(opaque));
1845 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1846 {
1847 /*
1848 * Pre-9.4 page deletion only marked internal pages as half-dead,
1849 * but now we only use that flag on leaf pages. The old algorithm
1850 * was never supposed to leave half-dead pages in the tree, it was
1851 * just a transient state, but it was nevertheless possible in
1852 * error scenarios. We don't know how to deal with them here. They
1853 * are harmless as far as searches are considered, but inserts
1854 * into the deleted keyspace could add out-of-order downlinks in
1855 * the upper levels. Log a notice, hopefully the admin will notice
1856 * and reindex.
1857 */
1858 if (P_ISHALFDEAD(opaque))
1859 ereport(LOG,
1860 (errcode(ERRCODE_INDEX_CORRUPTED),
1861 errmsg("index \"%s\" contains a half-dead internal page",
1863 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1864
1865 if (P_ISDELETED(opaque))
1866 ereport(LOG,
1867 (errcode(ERRCODE_INDEX_CORRUPTED),
1868 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1869 BufferGetBlockNumber(leafbuf),
1870 scanblkno,
1872
1873 _bt_relbuf(rel, leafbuf);
1874 return;
1875 }
1876
1877 /*
1878 * We can never delete rightmost pages nor root pages. While at it,
1879 * check that page is empty, since it's possible that the leafbuf page
1880 * was empty a moment ago, but has since had some inserts.
1881 *
1882 * To keep the algorithm simple, we also never delete an incompletely
1883 * split page (they should be rare enough that this doesn't make any
1884 * meaningful difference to disk usage):
1885 *
1886 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1887 * left half of an incomplete split, but ensuring that it's not the
1888 * right half is more complicated. For that, we have to check that
1889 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1890 * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1891 * release the lock on scanblkno/leafbuf, check the left sibling, and
1892 * construct a search stack to scanblkno. On subsequent iterations,
1893 * we know we stepped right from a page that passed these tests, so
1894 * it's OK.
1895 */
1896 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1897 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1898 P_INCOMPLETE_SPLIT(opaque))
1899 {
1900 /* Should never fail to delete a half-dead page */
1901 Assert(!P_ISHALFDEAD(opaque));
1902
1903 _bt_relbuf(rel, leafbuf);
1904 return;
1905 }
1906
1907 /*
1908 * First, remove downlink pointing to the page (or a parent of the
1909 * page, if we are going to delete a taller subtree), and mark the
1910 * leafbuf page half-dead
1911 */
1912 if (!P_ISHALFDEAD(opaque))
1913 {
1914 /*
1915 * We need an approximate pointer to the page's parent page. We
1916 * use a variant of the standard search mechanism to search for
1917 * the page's high key; this will give us a link to either the
1918 * current parent or someplace to its left (if there are multiple
1919 * equal high keys, which is possible with !heapkeyspace indexes).
1920 *
1921 * Also check if this is the right-half of an incomplete split
1922 * (see comment above).
1923 */
1924 if (!stack)
1925 {
1926 BTScanInsert itup_key;
1927 ItemId itemid;
1928 IndexTuple targetkey;
1929 BlockNumber leftsib,
1930 leafblkno;
1931 Buffer sleafbuf;
1932
1933 itemid = PageGetItemId(page, P_HIKEY);
1934 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1935
1936 leftsib = opaque->btpo_prev;
1937 leafblkno = BufferGetBlockNumber(leafbuf);
1938
1939 /*
1940 * To avoid deadlocks, we'd better drop the leaf page lock
1941 * before going further.
1942 */
1943 _bt_unlockbuf(rel, leafbuf);
1944
1945 /*
1946 * Check that the left sibling of leafbuf (if any) is not
1947 * marked with INCOMPLETE_SPLIT flag before proceeding
1948 */
1949 Assert(leafblkno == scanblkno);
1950 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1951 {
1952 ReleaseBuffer(leafbuf);
1953 return;
1954 }
1955
1956 /*
1957 * We need an insertion scan key, so build one.
1958 *
1959 * _bt_search searches for the leaf page that contains any
1960 * matching non-pivot tuples, but we need it to "search" for
1961 * the high key pivot from the page that we're set to delete.
1962 * Compensate for the mismatch by having _bt_search locate the
1963 * last position < equal-to-untruncated-prefix non-pivots.
1964 */
1965 itup_key = _bt_mkscankey(rel, targetkey);
1966
1967 /* Set up a BTLessStrategyNumber-like insertion scan key */
1968 itup_key->nextkey = false;
1969 itup_key->backward = true;
1970 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
1971 /* won't need a second lock or pin on leafbuf */
1972 _bt_relbuf(rel, sleafbuf);
1973
1974 /*
1975 * Re-lock the leaf page, and start over to use our stack
1976 * within _bt_mark_page_halfdead. We must do it that way
1977 * because it's possible that leafbuf can no longer be
1978 * deleted. We need to recheck.
1979 *
1980 * Note: We can't simply hold on to the sleafbuf lock instead,
1981 * because it's barely possible that sleafbuf is not the same
1982 * page as leafbuf. This happens when leafbuf split after our
1983 * original lock was dropped, but before _bt_search finished
1984 * its descent. We rely on the assumption that we'll find
1985 * leafbuf isn't safe to delete anymore in this scenario.
1986 * (Page deletion can cope with the stack being to the left of
1987 * leafbuf, but not to the right of leafbuf.)
1988 */
1989 _bt_lockbuf(rel, leafbuf, BT_WRITE);
1990 continue;
1991 }
1992
1993 /*
1994 * See if it's safe to delete the leaf page, and determine how
1995 * many parent/internal pages above the leaf level will be
1996 * deleted. If it's safe then _bt_mark_page_halfdead will also
1997 * perform the first phase of deletion, which includes marking the
1998 * leafbuf page half-dead.
1999 */
2000 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2001 if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2002 stack))
2003 {
2004 _bt_relbuf(rel, leafbuf);
2005 return;
2006 }
2007 }
2008
2009 /*
2010 * Then unlink it from its siblings. Each call to
2011 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2012 * making it shallower. Iterate until the leafbuf page is deleted.
2013 */
2014 rightsib_empty = false;
2015 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2016 while (P_ISHALFDEAD(opaque))
2017 {
2018 /* Check for interrupts in _bt_unlink_halfdead_page */
2019 if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2020 &rightsib_empty, vstate))
2021 {
2022 /*
2023 * _bt_unlink_halfdead_page should never fail, since we
2024 * established that deletion is generally safe in
2025 * _bt_mark_page_halfdead -- index must be corrupt.
2026 *
2027 * Note that _bt_unlink_halfdead_page already released the
2028 * lock and pin on leafbuf for us.
2029 */
2030 Assert(false);
2031 return;
2032 }
2033 }
2034
2035 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2036
2037 rightsib = opaque->btpo_next;
2038
2039 _bt_relbuf(rel, leafbuf);
2040
2041 /*
2042 * Check here, as calling loops will have locks held, preventing
2043 * interrupts from being processed.
2044 */
2046
2047 /*
2048 * The page has now been deleted. If its right sibling is completely
2049 * empty, it's possible that the reason we haven't deleted it earlier
2050 * is that it was the rightmost child of the parent. Now that we
2051 * removed the downlink for this page, the right sibling might now be
2052 * the only child of the parent, and could be removed. It would be
2053 * picked up by the next vacuum anyway, but might as well try to
2054 * remove it now, so loop back to process the right sibling.
2055 *
2056 * Note: This relies on the assumption that _bt_getstackbuf() will be
2057 * able to reuse our original descent stack with a different child
2058 * block (provided that the child block is to the right of the
2059 * original leaf page reached by _bt_search()). It will even update
2060 * the descent stack each time we loop around, avoiding repeated work.
2061 */
2062 if (!rightsib_empty)
2063 break;
2064
2065 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2066 }
2067}
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2088
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2314
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:224
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:80
IndexVacuumInfo * info
Definition: nbtree.h:332
Relation heaprel
Definition: genam.h:70

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert(), BTScanInsertData::backward, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), IndexVacuumInfo::heaprel, BTVacState::info, LOG, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 1129 of file nbtpage.c.

1130{
1131 PageInit(page, size, sizeof(BTPageOpaqueData));
1132}
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42

References PageInit().

Referenced by _bt_allocbuf(), _bt_blnewpage(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().

◆ _bt_pendingfsm_add()

static void _bt_pendingfsm_add ( BTVacState vstate,
BlockNumber  target,
FullTransactionId  safexid 
)
static

Definition at line 3063 of file nbtpage.c.

3066{
3067 Assert(vstate->npendingpages <= vstate->bufsize);
3068 Assert(vstate->bufsize <= vstate->maxbufsize);
3069
3070#ifdef USE_ASSERT_CHECKING
3071
3072 /*
3073 * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3074 * array will always be in safexid order (since that is the order that we
3075 * save them in here)
3076 */
3077 if (vstate->npendingpages > 0)
3078 {
3079 FullTransactionId lastsafexid =
3080 vstate->pendingpages[vstate->npendingpages - 1].safexid;
3081
3082 Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3083 }
3084#endif
3085
3086 /*
3087 * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3088 * information about this page.
3089 *
3090 * Note that this also covers the case where we opted to not use the
3091 * optimization in _bt_pendingfsm_init().
3092 */
3093 if (vstate->npendingpages == vstate->maxbufsize)
3094 return;
3095
3096 /* Consider enlarging buffer */
3097 if (vstate->npendingpages == vstate->bufsize)
3098 {
3099 int newbufsize = vstate->bufsize * 2;
3100
3101 /* Respect work_mem */
3102 if (newbufsize > vstate->maxbufsize)
3103 newbufsize = vstate->maxbufsize;
3104
3105 vstate->bufsize = newbufsize;
3106 vstate->pendingpages =
3107 repalloc(vstate->pendingpages,
3108 sizeof(BTPendingFSM) * vstate->bufsize);
3109 }
3110
3111 /* Save metadata for newly deleted page */
3112 vstate->pendingpages[vstate->npendingpages].target = target;
3113 vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3114 vstate->npendingpages++;
3115}
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1544
FullTransactionId safexid
Definition: nbtree.h:327
BlockNumber target
Definition: nbtree.h:326
BTPendingFSM * pendingpages
Definition: nbtree.h:344
int npendingpages
Definition: nbtree.h:345
int bufsize
Definition: nbtree.h:342
int maxbufsize
Definition: nbtree.h:343
#define FullTransactionIdFollowsOrEquals(a, b)
Definition: transam.h:54

References Assert(), BTVacState::bufsize, FullTransactionIdFollowsOrEquals, BTVacState::maxbufsize, BTVacState::npendingpages, BTVacState::pendingpages, repalloc(), BTPendingFSM::safexid, and BTPendingFSM::target.

Referenced by _bt_unlink_halfdead_page().

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState vstate 
)

Definition at line 2996 of file nbtpage.c.

2997{
2998 IndexBulkDeleteResult *stats = vstate->stats;
2999 Relation heaprel = vstate->info->heaprel;
3000
3001 Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3002 Assert(heaprel != NULL);
3003
3004 if (vstate->npendingpages == 0)
3005 {
3006 /* Just free memory when nothing to do */
3007 if (vstate->pendingpages)
3008 pfree(vstate->pendingpages);
3009
3010 return;
3011 }
3012
3013#ifdef DEBUG_BTREE_PENDING_FSM
3014
3015 /*
3016 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3017 * placing pending pages in the FSM. Note that the optimization will
3018 * never be effective without some other backend concurrently consuming an
3019 * XID.
3020 */
3021 pg_usleep(5000000L);
3022#endif
3023
3024 /*
3025 * Recompute VACUUM XID boundaries.
3026 *
3027 * We don't actually care about the oldest non-removable XID. Computing
3028 * the oldest such XID has a useful side-effect that we rely on: it
3029 * forcibly updates the XID horizon state for this backend. This step is
3030 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3031 * that it is now safe to recycle newly deleted pages without this step.
3032 */
3034
3035 for (int i = 0; i < vstate->npendingpages; i++)
3036 {
3037 BlockNumber target = vstate->pendingpages[i].target;
3038 FullTransactionId safexid = vstate->pendingpages[i].safexid;
3039
3040 /*
3041 * Do the equivalent of checking BTPageIsRecyclable(), but without
3042 * accessing the page again a second time.
3043 *
3044 * Give up on finding the first non-recyclable page -- all later pages
3045 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3046 * to the array in safexid order.
3047 */
3048 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3049 break;
3050
3051 RecordFreeIndexPage(rel, target);
3052 stats->pages_free++;
3053 }
3054
3055 pfree(vstate->pendingpages);
3056}
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:2005
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4286
void pg_usleep(long microsec)
Definition: signal.c:53
IndexBulkDeleteResult * stats
Definition: nbtree.h:333
BlockNumber pages_newly_deleted
Definition: genam.h:104
BlockNumber pages_free
Definition: genam.h:106

References Assert(), GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), IndexVacuumInfo::heaprel, i, BTVacState::info, BTVacState::npendingpages, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, BTVacState::pendingpages, pfree(), pg_usleep(), RecordFreeIndexPage(), BTPendingFSM::safexid, BTVacState::stats, and BTPendingFSM::target.

Referenced by btvacuumscan().

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState vstate,
bool  cleanuponly 
)

Definition at line 2954 of file nbtpage.c.

2955{
2956 Size maxbufsize;
2957
2958 /*
2959 * Don't bother with optimization in cleanup-only case -- we don't expect
2960 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2961 * can only take place because this optimization didn't work out during
2962 * the last VACUUM.
2963 */
2964 if (cleanuponly)
2965 return;
2966
2967 /*
2968 * Cap maximum size of array so that we always respect work_mem. Avoid
2969 * int overflow here.
2970 */
2971 vstate->bufsize = 256;
2972 maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
2973 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2974 /* BTVacState.maxbufsize has type int */
2975 maxbufsize = Min(maxbufsize, INT_MAX);
2976 /* Stay sane with small work_mem */
2977 maxbufsize = Max(maxbufsize, vstate->bufsize);
2978 vstate->maxbufsize = (int) maxbufsize;
2979
2980 /* Allocate buffer, indicate that there are currently 0 pending pages */
2981 vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2982 vstate->npendingpages = 0;
2983}
#define Min(x, y)
Definition: c.h:975
#define Max(x, y)
Definition: c.h:969
#define MaxAllocSize
Definition: fe_memutils.h:22
int work_mem
Definition: globals.c:130
struct BTPendingFSM BTPendingFSM

References BTVacState::bufsize, Max, MaxAllocSize, BTVacState::maxbufsize, Min, BTVacState::npendingpages, palloc(), BTVacState::pendingpages, and work_mem.

Referenced by btvacuumscan().

◆ _bt_relandgetbuf()

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)

Definition at line 1003 of file nbtpage.c.

1004{
1005 Buffer buf;
1006
1007 Assert(BlockNumberIsValid(blkno));
1008 if (BufferIsValid(obuf))
1009 _bt_unlockbuf(rel, obuf);
1010 buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1011 _bt_lockbuf(rel, buf, access);
1012
1013 _bt_checkpage(rel, buf);
1014 return buf;
1015}
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:2658
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:352

References _bt_checkpage(), _bt_lockbuf(), _bt_unlockbuf(), Assert(), BlockNumberIsValid(), buf, BufferIsValid(), and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_lock_and_validate_left(), _bt_moveright(), _bt_search(), and _bt_stepright().

◆ _bt_relbuf()

◆ _bt_rightsib_halfdeadflag()

static bool _bt_rightsib_halfdeadflag ( Relation  rel,
BlockNumber  leafrightsib 
)
static

Definition at line 1752 of file nbtpage.c.

1753{
1754 Buffer buf;
1755 Page page;
1756 BTPageOpaque opaque;
1757 bool result;
1758
1759 Assert(leafrightsib != P_NONE);
1760
1761 buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1762 page = BufferGetPage(buf);
1763 opaque = BTPageGetOpaque(page);
1764
1765 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1766 result = P_ISHALFDEAD(opaque);
1767 _bt_relbuf(rel, buf);
1768
1769 return result;
1770}

References _bt_getbuf(), _bt_relbuf(), Assert(), BT_READ, BTPageGetOpaque, buf, BufferGetPage(), P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, and P_NONE.

Referenced by _bt_mark_page_halfdead().

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)

Definition at line 232 of file nbtpage.c.

233{
234 Buffer metabuf;
235 Page metapg;
236 BTMetaPageData *metad;
237
238 /*
239 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
240 * field started out as a TransactionId field called btm_oldest_btpo_xact.
241 * Both "versions" are just uint32 fields. It was convenient to repurpose
242 * the field when we began to use 64-bit XIDs in deleted pages.
243 *
244 * It's possible that a pg_upgrade'd database will contain an XID value in
245 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
246 * field. _bt_vacuum_needs_cleanup() may even believe that this value
247 * indicates that there are lots of pages that it needs to recycle, when
248 * in reality there are only one or two. The worst that can happen is
249 * that there will be a call to btvacuumscan a little earlier, which will
250 * set btm_last_cleanup_num_delpages to a sane value when we're called.
251 *
252 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
253 * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
254 * to be consistent.
255 */
256 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
257 metapg = BufferGetPage(metabuf);
258 metad = BTPageGetMeta(metapg);
259
260 /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
261 if (metad->btm_version >= BTREE_NOVAC_VERSION &&
262 metad->btm_last_cleanup_num_delpages == num_delpages)
263 {
264 /* Usually means index continues to have num_delpages of 0 */
265 _bt_relbuf(rel, metabuf);
266 return;
267 }
268
269 /* trade in our read lock for a write lock */
270 _bt_unlockbuf(rel, metabuf);
271 _bt_lockbuf(rel, metabuf, BT_WRITE);
272
274
275 /* upgrade meta-page if needed */
276 if (metad->btm_version < BTREE_NOVAC_VERSION)
277 _bt_upgrademetapage(metapg);
278
279 /* update cleanup-related information */
280 metad->btm_last_cleanup_num_delpages = num_delpages;
282 MarkBufferDirty(metabuf);
283
284 /* write wal record if needed */
285 if (RelationNeedsWAL(rel))
286 {
288 XLogRecPtr recptr;
289
292
294 md.version = metad->btm_version;
295 md.root = metad->btm_root;
296 md.level = metad->btm_level;
297 md.fastroot = metad->btm_fastroot;
298 md.fastlevel = metad->btm_fastlevel;
299 md.last_cleanup_num_delpages = num_delpages;
301
302 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
303
304 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
305
306 PageSetLSN(metapg, recptr);
307 }
308
310
311 _bt_relbuf(rel, metabuf);
312}
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().

◆ _bt_unlink_halfdead_page()

static bool _bt_unlink_halfdead_page ( Relation  rel,
Buffer  leafbuf,
BlockNumber  scanblkno,
bool *  rightsib_empty,
BTVacState vstate 
)
static

Definition at line 2314 of file nbtpage.c.

2316{
2317 BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2318 IndexBulkDeleteResult *stats = vstate->stats;
2319 BlockNumber leafleftsib;
2320 BlockNumber leafrightsib;
2321 BlockNumber target;
2322 BlockNumber leftsib;
2323 BlockNumber rightsib;
2324 Buffer lbuf = InvalidBuffer;
2325 Buffer buf;
2326 Buffer rbuf;
2327 Buffer metabuf = InvalidBuffer;
2328 Page metapg = NULL;
2329 BTMetaPageData *metad = NULL;
2330 ItemId itemid;
2331 Page page;
2332 BTPageOpaque opaque;
2333 FullTransactionId safexid;
2334 bool rightsib_is_rightmost;
2335 uint32 targetlevel;
2336 IndexTuple leafhikey;
2337 BlockNumber leaftopparent;
2338
2339 page = BufferGetPage(leafbuf);
2340 opaque = BTPageGetOpaque(page);
2341
2342 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2343
2344 /*
2345 * Remember some information about the leaf page.
2346 */
2347 itemid = PageGetItemId(page, P_HIKEY);
2348 leafhikey = (IndexTuple) PageGetItem(page, itemid);
2349 target = BTreeTupleGetTopParent(leafhikey);
2350 leafleftsib = opaque->btpo_prev;
2351 leafrightsib = opaque->btpo_next;
2352
2353 _bt_unlockbuf(rel, leafbuf);
2354
2355 /*
2356 * Check here, as calling loops will have locks held, preventing
2357 * interrupts from being processed.
2358 */
2360
2361 /* Unlink the current top parent of the subtree */
2362 if (!BlockNumberIsValid(target))
2363 {
2364 /* Target is leaf page (or leaf page is top parent, if you prefer) */
2365 target = leafblkno;
2366
2367 buf = leafbuf;
2368 leftsib = leafleftsib;
2369 targetlevel = 0;
2370 }
2371 else
2372 {
2373 /* Target is the internal page taken from leaf's top parent link */
2374 Assert(target != leafblkno);
2375
2376 /* Fetch the block number of the target's left sibling */
2377 buf = _bt_getbuf(rel, target, BT_READ);
2378 page = BufferGetPage(buf);
2379 opaque = BTPageGetOpaque(page);
2380 leftsib = opaque->btpo_prev;
2381 targetlevel = opaque->btpo_level;
2382 Assert(targetlevel > 0);
2383
2384 /*
2385 * To avoid deadlocks, we'd better drop the target page lock before
2386 * going further.
2387 */
2388 _bt_unlockbuf(rel, buf);
2389 }
2390
2391 /*
2392 * We have to lock the pages we need to modify in the standard order:
2393 * moving right, then up. Else we will deadlock against other writers.
2394 *
2395 * So, first lock the leaf page, if it's not the target. Then find and
2396 * write-lock the current left sibling of the target page. The sibling
2397 * that was current a moment ago could have split, so we may have to move
2398 * right.
2399 */
2400 if (target != leafblkno)
2401 _bt_lockbuf(rel, leafbuf, BT_WRITE);
2402 if (leftsib != P_NONE)
2403 {
2404 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2405 page = BufferGetPage(lbuf);
2406 opaque = BTPageGetOpaque(page);
2407 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2408 {
2409 bool leftsibvalid = true;
2410
2411 /*
2412 * Before we follow the link from the page that was the left
2413 * sibling mere moments ago, validate its right link. This
2414 * reduces the opportunities for loop to fail to ever make any
2415 * progress in the presence of index corruption.
2416 *
2417 * Note: we rely on the assumption that there can only be one
2418 * vacuum process running at a time (against the same index).
2419 */
2420 if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2421 leftsib == opaque->btpo_next)
2422 leftsibvalid = false;
2423
2424 leftsib = opaque->btpo_next;
2425 _bt_relbuf(rel, lbuf);
2426
2427 if (!leftsibvalid)
2428 {
2429 /*
2430 * This is known to fail in the field; sibling link corruption
2431 * is relatively common. Press on with vacuuming rather than
2432 * just throwing an ERROR.
2433 */
2434 ereport(LOG,
2435 (errcode(ERRCODE_INDEX_CORRUPTED),
2436 errmsg_internal("valid left sibling for deletion target could not be located: "
2437 "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
2438 leftsib, target, leafblkno, scanblkno,
2439 targetlevel, RelationGetRelationName(rel))));
2440
2441 /* Must release all pins and locks on failure exit */
2443 if (target != leafblkno)
2444 _bt_relbuf(rel, leafbuf);
2445
2446 return false;
2447 }
2448
2450
2451 /* step right one page */
2452 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2453 page = BufferGetPage(lbuf);
2454 opaque = BTPageGetOpaque(page);
2455 }
2456 }
2457 else
2458 lbuf = InvalidBuffer;
2459
2460 /* Next write-lock the target page itself */
2461 _bt_lockbuf(rel, buf, BT_WRITE);
2462 page = BufferGetPage(buf);
2463 opaque = BTPageGetOpaque(page);
2464
2465 /*
2466 * Check page is still empty etc, else abandon deletion. This is just for
2467 * paranoia's sake; a half-dead page cannot resurrect because there can be
2468 * only one vacuum process running at a time.
2469 */
2470 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2471 elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2472 target, RelationGetRelationName(rel));
2473
2474 if (opaque->btpo_prev != leftsib)
2475 ereport(ERROR,
2476 (errcode(ERRCODE_INDEX_CORRUPTED),
2477 errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2478 leftsib, opaque->btpo_prev, target,
2480
2481 if (target == leafblkno)
2482 {
2483 if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2484 !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2485 elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2486 target, RelationGetRelationName(rel));
2487
2488 /* Leaf page is also target page: don't set leaftopparent */
2489 leaftopparent = InvalidBlockNumber;
2490 }
2491 else
2492 {
2493 IndexTuple finaldataitem;
2494
2495 if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2496 P_ISLEAF(opaque))
2497 elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2498 targetlevel, target, RelationGetRelationName(rel));
2499
2500 /* Target is internal: set leaftopparent for next call here... */
2501 itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2502 finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2503 leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2504 /* ...except when it would be a redundant pointer-to-self */
2505 if (leaftopparent == leafblkno)
2506 leaftopparent = InvalidBlockNumber;
2507 }
2508
2509 /* No leaftopparent for level 0 (leaf page) or level 1 target */
2510 Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2511
2512 /*
2513 * And next write-lock the (current) right sibling.
2514 */
2515 rightsib = opaque->btpo_next;
2516 rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2517 page = BufferGetPage(rbuf);
2518 opaque = BTPageGetOpaque(page);
2519
2520 /*
2521 * Validate target's right sibling page. Its left link must point back to
2522 * the target page.
2523 */
2524 if (opaque->btpo_prev != target)
2525 {
2526 /*
2527 * This is known to fail in the field; sibling link corruption is
2528 * relatively common. Press on with vacuuming rather than just
2529 * throwing an ERROR (same approach used for left-sibling's-right-link
2530 * validation check a moment ago).
2531 */
2532 ereport(LOG,
2533 (errcode(ERRCODE_INDEX_CORRUPTED),
2534 errmsg_internal("right sibling's left-link doesn't match: "
2535 "right sibling %u of target %u with leafblkno %u "
2536 "and scanblkno %u spuriously links to non-target %u "
2537 "on level %u of index \"%s\"",
2538 rightsib, target, leafblkno,
2539 scanblkno, opaque->btpo_prev,
2540 targetlevel, RelationGetRelationName(rel))));
2541
2542 /* Must release all pins and locks on failure exit */
2543 if (BufferIsValid(lbuf))
2544 _bt_relbuf(rel, lbuf);
2545 _bt_relbuf(rel, rbuf);
2546 _bt_relbuf(rel, buf);
2547 if (target != leafblkno)
2548 _bt_relbuf(rel, leafbuf);
2549
2550 return false;
2551 }
2552
2553 rightsib_is_rightmost = P_RIGHTMOST(opaque);
2554 *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2555
2556 /*
2557 * If we are deleting the next-to-last page on the target's level, then
2558 * the rightsib is a candidate to become the new fast root. (In theory, it
2559 * might be possible to push the fast root even further down, but the odds
2560 * of doing so are slim, and the locking considerations daunting.)
2561 *
2562 * We can safely acquire a lock on the metapage here --- see comments for
2563 * _bt_newlevel().
2564 */
2565 if (leftsib == P_NONE && rightsib_is_rightmost)
2566 {
2567 page = BufferGetPage(rbuf);
2568 opaque = BTPageGetOpaque(page);
2569 if (P_RIGHTMOST(opaque))
2570 {
2571 /* rightsib will be the only one left on the level */
2572 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2573 metapg = BufferGetPage(metabuf);
2574 metad = BTPageGetMeta(metapg);
2575
2576 /*
2577 * The expected case here is btm_fastlevel == targetlevel+1; if
2578 * the fastlevel is <= targetlevel, something is wrong, and we
2579 * choose to overwrite it to fix it.
2580 */
2581 if (metad->btm_fastlevel > targetlevel + 1)
2582 {
2583 /* no update wanted */
2584 _bt_relbuf(rel, metabuf);
2585 metabuf = InvalidBuffer;
2586 }
2587 }
2588 }
2589
2590 /*
2591 * Here we begin doing the deletion.
2592 */
2593
2594 /* No ereport(ERROR) until changes are logged */
2596
2597 /*
2598 * Update siblings' side-links. Note the target page's side-links will
2599 * continue to point to the siblings. Asserts here are just rechecking
2600 * things we already verified above.
2601 */
2602 if (BufferIsValid(lbuf))
2603 {
2604 page = BufferGetPage(lbuf);
2605 opaque = BTPageGetOpaque(page);
2606 Assert(opaque->btpo_next == target);
2607 opaque->btpo_next = rightsib;
2608 }
2609 page = BufferGetPage(rbuf);
2610 opaque = BTPageGetOpaque(page);
2611 Assert(opaque->btpo_prev == target);
2612 opaque->btpo_prev = leftsib;
2613
2614 /*
2615 * If we deleted a parent of the targeted leaf page, instead of the leaf
2616 * itself, update the leaf to point to the next remaining child in the
2617 * subtree.
2618 *
2619 * Note: We rely on the fact that a buffer pin on the leaf page has been
2620 * held since leafhikey was initialized. This is safe, though only
2621 * because the page was already half-dead at that point. The leaf page
2622 * cannot have been modified by any other backend during the period when
2623 * no lock was held.
2624 */
2625 if (target != leafblkno)
2626 BTreeTupleSetTopParent(leafhikey, leaftopparent);
2627
2628 /*
2629 * Mark the page itself deleted. It can be recycled when all current
2630 * transactions are gone. Storing GetTopTransactionId() would work, but
2631 * we're in VACUUM and would not otherwise have an XID. Having already
2632 * updated links to the target, ReadNextFullTransactionId() suffices as an
2633 * upper bound. Any scan having retained a now-stale link is advertising
2634 * in its PGPROC an xmin less than or equal to the value we read here. It
2635 * will continue to do so, holding back the xmin horizon, for the duration
2636 * of that scan.
2637 */
2638 page = BufferGetPage(buf);
2639 opaque = BTPageGetOpaque(page);
2640 Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2641
2642 /*
2643 * Store upper bound XID that's used to determine when deleted page is no
2644 * longer needed as a tombstone
2645 */
2646 safexid = ReadNextFullTransactionId();
2647 BTPageSetDeleted(page, safexid);
2648 opaque->btpo_cycleid = 0;
2649
2650 /* And update the metapage, if needed */
2651 if (BufferIsValid(metabuf))
2652 {
2653 /* upgrade metapage if needed */
2654 if (metad->btm_version < BTREE_NOVAC_VERSION)
2655 _bt_upgrademetapage(metapg);
2656 metad->btm_fastroot = rightsib;
2657 metad->btm_fastlevel = targetlevel;
2658 MarkBufferDirty(metabuf);
2659 }
2660
2661 /* Must mark buffers dirty before XLogInsert */
2662 MarkBufferDirty(rbuf);
2664 if (BufferIsValid(lbuf))
2665 MarkBufferDirty(lbuf);
2666 if (target != leafblkno)
2667 MarkBufferDirty(leafbuf);
2668
2669 /* XLOG stuff */
2670 if (RelationNeedsWAL(rel))
2671 {
2673 xl_btree_metadata xlmeta;
2674 uint8 xlinfo;
2675 XLogRecPtr recptr;
2676
2678
2680 if (BufferIsValid(lbuf))
2683 if (target != leafblkno)
2685
2686 /* information stored on the target/to-be-unlinked block */
2687 xlrec.leftsib = leftsib;
2688 xlrec.rightsib = rightsib;
2689 xlrec.level = targetlevel;
2690 xlrec.safexid = safexid;
2691
2692 /* information needed to recreate the leaf block (if not the target) */
2693 xlrec.leafleftsib = leafleftsib;
2694 xlrec.leafrightsib = leafrightsib;
2695 xlrec.leaftopparent = leaftopparent;
2696
2698
2699 if (BufferIsValid(metabuf))
2700 {
2702
2704 xlmeta.version = metad->btm_version;
2705 xlmeta.root = metad->btm_root;
2706 xlmeta.level = metad->btm_level;
2707 xlmeta.fastroot = metad->btm_fastroot;
2708 xlmeta.fastlevel = metad->btm_fastlevel;
2710 xlmeta.allequalimage = metad->btm_allequalimage;
2711
2712 XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
2714 }
2715 else
2716 xlinfo = XLOG_BTREE_UNLINK_PAGE;
2717
2718 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2719
2720 if (BufferIsValid(metabuf))
2721 {
2722 PageSetLSN(metapg, recptr);
2723 }
2724 page = BufferGetPage(rbuf);
2725 PageSetLSN(page, recptr);
2726 page = BufferGetPage(buf);
2727 PageSetLSN(page, recptr);
2728 if (BufferIsValid(lbuf))
2729 {
2730 page = BufferGetPage(lbuf);
2731 PageSetLSN(page, recptr);
2732 }
2733 if (target != leafblkno)
2734 {
2735 page = BufferGetPage(leafbuf);
2736 PageSetLSN(page, recptr);
2737 }
2738 }
2739
2741
2742 /* release metapage */
2743 if (BufferIsValid(metabuf))
2744 _bt_relbuf(rel, metabuf);
2745
2746 /* release siblings */
2747 if (BufferIsValid(lbuf))
2748 _bt_relbuf(rel, lbuf);
2749 _bt_relbuf(rel, rbuf);
2750
2751 /* If the target is not leafbuf, we're done with it now -- release it */
2752 if (target != leafblkno)
2753 _bt_relbuf(rel, buf);
2754
2755 /*
2756 * Maintain pages_newly_deleted, which is simply the number of pages
2757 * deleted by the ongoing VACUUM operation.
2758 *
2759 * Maintain pages_deleted in a way that takes into account how
2760 * btvacuumpage() will count deleted pages that have yet to become
2761 * scanblkno -- only count page when it's not going to get that treatment
2762 * later on.
2763 */
2764 stats->pages_newly_deleted++;
2765 if (target <= scanblkno)
2766 stats->pages_deleted++;
2767
2768 /*
2769 * Remember information about the target page (now a newly deleted page)
2770 * in dedicated vstate space for later. The page will be considered as a
2771 * candidate to place in the FSM at the end of the current btvacuumscan()
2772 * call.
2773 */
2774 _bt_pendingfsm_add(vstate, target, safexid);
2775
2776 /* Success - hold on to lock on leafbuf (might also have been target) */
2777 return true;
2778}
uint8_t uint8
Definition: c.h:500
static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
Definition: nbtpage.c:3063
static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)
Definition: nbtree.h:620
static void BTPageSetDeleted(Page page, FullTransactionId safexid)
Definition: nbtree.h:239
#define XLOG_BTREE_UNLINK_PAGE
Definition: nbtxlog.h:35
#define XLOG_BTREE_UNLINK_PAGE_META
Definition: nbtxlog.h:36
#define SizeOfBtreeUnlinkPage
Definition: nbtxlog.h:328
BlockNumber pages_deleted
Definition: genam.h:105
FullTransactionId ReadNextFullTransactionId(void)
Definition: varsup.c:288

References _bt_getbuf(), _bt_lockbuf(), _bt_pendingfsm_add(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BlockNumberIsValid(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageSetDeleted(), BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetDownLink(), BTreeTupleGetTopParent(), BTreeTupleSetTopParent(), buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CHECK_FOR_INTERRUPTS, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBlockNumber, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_unlink_page::leafleftsib, xl_btree_unlink_page::leafrightsib, xl_btree_unlink_page::leaftopparent, xl_btree_unlink_page::leftsib, xl_btree_metadata::level, xl_btree_unlink_page::level, LOG, MarkBufferDirty(), P_FIRSTDATAKEY, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_newly_deleted, PageSetLSN(), ReadNextFullTransactionId(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, ReleaseBuffer(), xl_btree_unlink_page::rightsib, xl_btree_metadata::root, xl_btree_unlink_page::safexid, SizeOfBtreeUnlinkPage, START_CRIT_SECTION, BTVacState::stats, xl_btree_metadata::version, XLOG_BTREE_UNLINK_PAGE, XLOG_BTREE_UNLINK_PAGE_META, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_pagedel().

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1070 of file nbtpage.c.

1071{
1072 /*
1073 * Buffer is pinned and locked, which means that it is expected to be
1074 * defined and addressable. Check that proactively.
1075 */
1076 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1077
1078 /* LockBuffer() asserts that pin is held by this backend */
1079 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1080
1081 if (!RelationUsesLocalBuffers(rel))
1082 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1083}

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readfirstpage(), _bt_relandgetbuf(), _bt_relbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
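
Note that _bt_unlockbuf() releases only the lock and keeps the buffer pin, whereas _bt_relbuf() drops both. The Valgrind macros mark an unlocked shared buffer inaccessible in instrumented builds, so code that touches a page without holding a lock is caught. A hedged sketch of the usual pairing; it assumes rel and blkno name a valid nbtree index relation and block, and it compiles only inside the backend:

Buffer buf = _bt_getbuf(rel, blkno, BT_READ);   /* pin + read lock */
Page   page = BufferGetPage(buf);

/* ... inspect the page while the read lock is held ... */

_bt_unlockbuf(rel, buf);          /* drop the lock, keep the pin */
/* the pin alone keeps the buffer from being recycled */
_bt_lockbuf(rel, buf, BT_READ);   /* re-lock using the retained pin */
_bt_relbuf(rel, buf);             /* unlock and unpin in one call */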

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)

Definition at line 1109 of file nbtpage.c.

1110{
1111 /*
1112 * Buffer is pinned and locked, which means that it is expected to be
1113 * defined and addressable. Check that proactively.
1114 */
1115 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1116
1117 /* LockBuffer() asserts that pin is held by this backend */
1118 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1119 LockBufferForCleanup(buf);
1120}

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().
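
The "upgrade" is not atomic: as the References list shows, the function first releases the held lock with BUFFER_LOCK_UNLOCK and only then calls LockBufferForCleanup(), so the page can change while no lock is held. A hedged sketch of the caller-side pattern in the style of btvacuumpage(); needs_cleanup_lock is a hypothetical condition, and buf is assumed to be pinned and read-locked by this backend:

if (needs_cleanup_lock)
{
    _bt_upgradelockbufcleanup(rel, buf);   /* drop lock, wait for cleanup lock */

    /*
     * Another backend may have changed the page while no lock was held,
     * so recompute any state derived from it before proceeding.
     */
    page = BufferGetPage(buf);
    opaque = BTPageGetOpaque(page);
}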

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)

Definition at line 107 of file nbtpage.c.

108{
109 BTMetaPageData *metad;
110 BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
111
112 metad = BTPageGetMeta(page);
113 metaopaque = BTPageGetOpaque(page);
114
115 /* It must really be a meta page of an upgradable version */
116 Assert(metaopaque->btpo_flags & BTP_META);
117 Assert(metad->btm_version >= BTREE_MIN_VERSION);
118 Assert(metad->btm_version < BTREE_NOVAC_VERSION);
119
120 /* Set version number and fill extra fields added into version 3 */
121 metad->btm_version = BTREE_NOVAC_VERSION;
122 metad->btm_last_cleanup_num_delpages = 0;
123 metad->btm_last_cleanup_num_heap_tuples = -1.0;
124 /* Only a REINDEX can set this field */
125 Assert(!metad->btm_allequalimage);
126 metad->btm_allequalimage = false;
127
128 /* Adjust pd_lower (see _bt_initmetapage() for details) */
129 ((PageHeader) page)->pd_lower =
130 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
131}

References Assert(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newlevel(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
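
Callers perform this upgrade lazily, the first time they are about to modify a metapage still in a pre-version-3 format. A hedged sketch of the caller-side pattern (compare _bt_set_cleanup_info()); it assumes metapg is a write-locked metapage inside a critical section and num_delpages is supplied by the caller:

metad = BTPageGetMeta(metapg);
if (metad->btm_version < BTREE_NOVAC_VERSION)
    _bt_upgrademetapage(metapg);   /* in-place upgrade to the version 3 format */
metad->btm_last_cleanup_num_delpages = num_delpages;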

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)

Definition at line 179 of file nbtpage.c.

180{
181 Buffer metabuf;
182 Page metapg;
183 BTMetaPageData *metad;
184 uint32 btm_version;
185 BlockNumber prev_num_delpages;
186
187 /*
188 * Copy details from metapage to local variables quickly.
189 *
190 * Note that we deliberately avoid using the cached version of the metapage here.
191 */
192 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
193 metapg = BufferGetPage(metabuf);
194 metad = BTPageGetMeta(metapg);
195 btm_version = metad->btm_version;
196
197 if (btm_version < BTREE_NOVAC_VERSION)
198 {
199 /*
200 * Metapage needs to be dynamically upgraded to store fields that are
201 * only present when btm_version >= BTREE_NOVAC_VERSION
202 */
203 _bt_relbuf(rel, metabuf);
204 return true;
205 }
206
207 prev_num_delpages = metad->btm_last_cleanup_num_delpages;
208 _bt_relbuf(rel, metabuf);
209
210 /*
211 * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
212 * total size of the index. We can reasonably expect (though are not
213 * guaranteed) to be able to recycle this many pages if we decide to do a
214 * btvacuumscan call during the ongoing btvacuumcleanup. For further
215 * details see the nbtree/README section on placing deleted pages in the
216 * FSM.
217 */
218 if (prev_num_delpages > 0 &&
219 prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
220 return true;
221
222 return false;
223}

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().
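
The trigger condition compares the deleted-page count remembered by the previous VACUUM against 5% of the index's current size. A small worked example with assumed values, not taken from the source:

BlockNumber index_blocks = 1000;      /* RelationGetNumberOfBlocks(rel) */
BlockNumber prev_num_delpages = 60;   /* from btm_last_cleanup_num_delpages */

/* threshold is 1000 / 20 = 50 pages; 60 > 50, so cleanup is requested */
bool cleanup = (prev_num_delpages > 0 &&
                prev_num_delpages > index_blocks / 20);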