PostgreSQL Source Code  git master
verify_nbtree.c File Reference
#include "postgres.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "catalog/pg_opfamily_d.h"
#include "commands/tablecmds.h"
#include "common/pg_prng.h"
#include "lib/bloomfilter.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"
Include dependency graph for verify_nbtree.c:

Go to the source code of this file.

Data Structures

struct  BtreeCheckState
 
struct  BtreeLevel
 

Macros

#define InvalidBtreeLevel   ((uint32) InvalidBlockNumber)
 
#define BTreeTupleGetNKeyAtts(itup, rel)    Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))
 

Typedefs

typedef struct BtreeCheckState BtreeCheckState
 
typedef struct BtreeLevel BtreeLevel
 

Functions

 PG_FUNCTION_INFO_V1 (bt_index_check)
 
 PG_FUNCTION_INFO_V1 (bt_index_parent_check)
 
static void bt_index_check_internal (Oid indrelid, bool parentcheck, bool heapallindexed, bool rootdescend, bool checkunique)
 
static void btree_index_checkable (Relation rel)
 
static bool btree_index_mainfork_expected (Relation rel)
 
static void bt_check_every_level (Relation rel, Relation heaprel, bool heapkeyspace, bool readonly, bool heapallindexed, bool rootdescend, bool checkunique)
 
static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState *state, BtreeLevel level)
 
static bool bt_leftmost_ignoring_half_dead (BtreeCheckState *state, BlockNumber start, BTPageOpaque start_opaque)
 
static void bt_recheck_sibling_links (BtreeCheckState *state, BlockNumber btpo_prev_from_target, BlockNumber leftcurrent)
 
static bool heap_entry_is_visible (BtreeCheckState *state, ItemPointer tid)
 
static void bt_report_duplicate (BtreeCheckState *state, ItemPointer tid, BlockNumber block, OffsetNumber offset, int posting, ItemPointer nexttid, BlockNumber nblock, OffsetNumber noffset, int nposting)
 
static void bt_entry_unique_check (BtreeCheckState *state, IndexTuple itup, BlockNumber targetblock, OffsetNumber offset, int *lVis_i, ItemPointer *lVis_tid, OffsetNumber *lVis_offset, BlockNumber *lVis_block)
 
static void bt_target_page_check (BtreeCheckState *state)
 
static BTScanInsert bt_right_page_check_scankey (BtreeCheckState *state, OffsetNumber *rightfirstoffset)
 
static void bt_child_check (BtreeCheckState *state, BTScanInsert targetkey, OffsetNumber downlinkoffnum)
 
static void bt_child_highkey_check (BtreeCheckState *state, OffsetNumber target_downlinkoffnum, Page loaded_child, uint32 target_level)
 
static void bt_downlink_missing_check (BtreeCheckState *state, bool rightsplit, BlockNumber blkno, Page page)
 
static void bt_tuple_present_callback (Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate)
 
static IndexTuple bt_normalize_tuple (BtreeCheckState *state, IndexTuple itup)
 
static IndexTuple bt_posting_plain_tuple (IndexTuple itup, int n)
 
static bool bt_rootdescend (BtreeCheckState *state, IndexTuple itup)
 
static bool offset_is_negative_infinity (BTPageOpaque opaque, OffsetNumber offset)
 
static bool invariant_l_offset (BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound)
 
static bool invariant_leq_offset (BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound)
 
static bool invariant_g_offset (BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound)
 
static bool invariant_l_nontarget_offset (BtreeCheckState *state, BTScanInsert key, BlockNumber nontargetblock, Page nontarget, OffsetNumber upperbound)
 
static Page palloc_btree_page (BtreeCheckState *state, BlockNumber blocknum)
 
static BTScanInsert bt_mkscankey_pivotsearch (Relation rel, IndexTuple itup)
 
static ItemId PageGetItemIdCareful (BtreeCheckState *state, BlockNumber block, Page page, OffsetNumber offset)
 
static ItemPointer BTreeTupleGetHeapTIDCareful (BtreeCheckState *state, IndexTuple itup, bool nonpivot)
 
static ItemPointer BTreeTupleGetPointsToTID (IndexTuple itup)
 
Datum bt_index_check (PG_FUNCTION_ARGS)
 
Datum bt_index_parent_check (PG_FUNCTION_ARGS)
 
static bool bt_pivot_tuple_identical (bool heapkeyspace, IndexTuple itup1, IndexTuple itup2)
 

Variables

 PG_MODULE_MAGIC
 

Macro Definition Documentation

◆ BTreeTupleGetNKeyAtts

#define BTreeTupleGetNKeyAtts (   itup,
  rel 
)     Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))

Definition at line 53 of file verify_nbtree.c.

◆ InvalidBtreeLevel

#define InvalidBtreeLevel   ((uint32) InvalidBlockNumber)

Definition at line 52 of file verify_nbtree.c.

Typedef Documentation

◆ BtreeCheckState

◆ BtreeLevel

typedef struct BtreeLevel BtreeLevel

Function Documentation

◆ bt_check_every_level()

static void bt_check_every_level ( Relation  rel,
Relation  heaprel,
bool  heapkeyspace,
bool  readonly,
bool  heapallindexed,
bool  rootdescend,
bool  checkunique 
)
static

Definition at line 490 of file verify_nbtree.c.

493 {
495  Page metapage;
496  BTMetaPageData *metad;
497  uint32 previouslevel;
498  BtreeLevel current;
499  Snapshot snapshot = SnapshotAny;
500 
501  if (!readonly)
502  elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"",
504  else
505  elog(DEBUG1, "verifying consistency of tree structure for index \"%s\" with cross-level checks",
507 
508  /*
509  * This assertion matches the one in index_getnext_tid(). See page
510  * recycling/"visible to everyone" notes in nbtree README.
511  */
513 
514  /*
515  * Initialize state for entire verification operation
516  */
517  state = palloc0(sizeof(BtreeCheckState));
518  state->rel = rel;
519  state->heaprel = heaprel;
520  state->heapkeyspace = heapkeyspace;
521  state->readonly = readonly;
522  state->heapallindexed = heapallindexed;
523  state->rootdescend = rootdescend;
524  state->checkunique = checkunique;
525  state->snapshot = InvalidSnapshot;
526 
527  if (state->heapallindexed)
528  {
529  int64 total_pages;
530  int64 total_elems;
531  uint64 seed;
532 
533  /*
534  * Size Bloom filter based on estimated number of tuples in index,
535  * while conservatively assuming that each block must contain at least
536  * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
537  * bt_posting_plain_tuple() for definition, and details of how posting
538  * list tuples are handled.
539  */
540  total_pages = RelationGetNumberOfBlocks(rel);
541  total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
542  (int64) state->rel->rd_rel->reltuples);
543  /* Generate a random seed to avoid repetition */
545  /* Create Bloom filter to fingerprint index */
546  state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
547  state->heaptuplespresent = 0;
548 
549  /*
550  * Register our own snapshot in !readonly case, rather than asking
551  * table_index_build_scan() to do this for us later. This needs to
552  * happen before index fingerprinting begins, so we can later be
553  * certain that index fingerprinting should have reached all tuples
554  * returned by table_index_build_scan().
555  */
556  if (!state->readonly)
557  {
559 
560  /*
561  * GetTransactionSnapshot() always acquires a new MVCC snapshot in
562  * READ COMMITTED mode. A new snapshot is guaranteed to have all
563  * the entries it requires in the index.
564  *
565  * We must defend against the possibility that an old xact
566  * snapshot was returned at higher isolation levels when that
567  * snapshot is not safe for index scans of the target index. This
568  * is possible when the snapshot sees tuples that are before the
569  * index's indcheckxmin horizon. Throwing an error here should be
570  * very rare. It doesn't seem worth using a secondary snapshot to
571  * avoid this.
572  */
573  if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
575  snapshot->xmin))
576  ereport(ERROR,
578  errmsg("index \"%s\" cannot be verified using transaction snapshot",
579  RelationGetRelationName(rel))));
580  }
581  }
582 
583  /*
584  * We need a snapshot to check the uniqueness of the index. For better
585  * performance take it once per index check. If snapshot already taken
586  * reuse it.
587  */
588  if (state->checkunique)
589  {
590  state->indexinfo = BuildIndexInfo(state->rel);
591  if (state->indexinfo->ii_Unique)
592  {
593  if (snapshot != SnapshotAny)
594  state->snapshot = snapshot;
595  else
597  }
598  }
599 
600  Assert(!state->rootdescend || state->readonly);
601  if (state->rootdescend && !state->heapkeyspace)
602  ereport(ERROR,
603  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
604  errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
606  errhint("Only B-Tree version 4 indexes support rootdescend verification.")));
607 
608  /* Create context for page */
610  "amcheck context",
612  state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);
613 
614  /* Get true root block from meta-page */
616  metad = BTPageGetMeta(metapage);
617 
618  /*
619  * Certain deletion patterns can result in "skinny" B-Tree indexes, where
620  * the fast root and true root differ.
621  *
622  * Start from the true root, not the fast root, unlike conventional index
623  * scans. This approach is more thorough, and removes the risk of
624  * following a stale fast root from the meta page.
625  */
626  if (metad->btm_fastroot != metad->btm_root)
627  ereport(DEBUG1,
628  (errcode(ERRCODE_NO_DATA),
629  errmsg_internal("harmless fast root mismatch in index \"%s\"",
631  errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
632  metad->btm_fastroot, metad->btm_fastlevel,
633  metad->btm_root, metad->btm_level)));
634 
635  /*
636  * Starting at the root, verify every level. Move left to right, top to
637  * bottom. Note that there may be no pages other than the meta page (meta
638  * page can indicate that root is P_NONE when the index is totally empty).
639  */
640  previouslevel = InvalidBtreeLevel;
641  current.level = metad->btm_level;
642  current.leftmost = metad->btm_root;
643  current.istruerootlevel = true;
644  while (current.leftmost != P_NONE)
645  {
646  /*
647  * Verify this level, and get left most page for next level down, if
648  * not at leaf level
649  */
650  current = bt_check_level_from_leftmost(state, current);
651 
652  if (current.leftmost == InvalidBlockNumber)
653  ereport(ERROR,
654  (errcode(ERRCODE_INDEX_CORRUPTED),
655  errmsg("index \"%s\" has no valid pages on level below %u or first level",
656  RelationGetRelationName(rel), previouslevel)));
657 
658  previouslevel = current.level;
659  }
660 
661  /*
662  * * Check whether heap contains unindexed/malformed tuples *
663  */
664  if (state->heapallindexed)
665  {
666  IndexInfo *indexinfo = BuildIndexInfo(state->rel);
667  TableScanDesc scan;
668 
669  /*
670  * Create our own scan for table_index_build_scan(), rather than
671  * getting it to do so for us. This is required so that we can
672  * actually use the MVCC snapshot registered earlier in !readonly
673  * case.
674  *
675  * Note that table_index_build_scan() calls heap_endscan() for us.
676  */
677  scan = table_beginscan_strat(state->heaprel, /* relation */
678  snapshot, /* snapshot */
679  0, /* number of keys */
680  NULL, /* scan key */
681  true, /* buffer access strategy OK */
682  true); /* syncscan OK? */
683 
684  /*
685  * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
686  * behaves in !readonly case.
687  *
688  * It's okay that we don't actually use the same lock strength for the
689  * heap relation as any other ii_Concurrent caller would in !readonly
690  * case. We have no reason to care about a concurrent VACUUM
691  * operation, since there isn't going to be a second scan of the heap
692  * that needs to be sure that there was no concurrent recycling of
693  * TIDs.
694  */
695  indexinfo->ii_Concurrent = !state->readonly;
696 
697  /*
698  * Don't wait for uncommitted tuple xact commit/abort when index is a
699  * unique index on a catalog (or an index used by an exclusion
700  * constraint). This could otherwise happen in the readonly case.
701  */
702  indexinfo->ii_Unique = false;
703  indexinfo->ii_ExclusionOps = NULL;
704  indexinfo->ii_ExclusionProcs = NULL;
705  indexinfo->ii_ExclusionStrats = NULL;
706 
707  elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
709  RelationGetRelationName(state->heaprel));
710 
711  table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false,
712  bt_tuple_present_callback, (void *) state, scan);
713 
714  ereport(DEBUG1,
715  (errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
716  state->heaptuplespresent, RelationGetRelationName(heaprel),
717  100.0 * bloom_prop_bits_set(state->filter))));
718 
719  if (snapshot != SnapshotAny)
720  UnregisterSnapshot(snapshot);
721 
722  bloom_free(state->filter);
723  }
724 
725  /* Be tidy: */
726  if (snapshot == SnapshotAny && state->snapshot != InvalidSnapshot)
727  UnregisterSnapshot(state->snapshot);
728  MemoryContextDelete(state->targetcontext);
729 }
#define InvalidBlockNumber
Definition: block.h:33
void bloom_free(bloom_filter *filter)
Definition: bloomfilter.c:126
bloom_filter * bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
Definition: bloomfilter.c:87
double bloom_prop_bits_set(bloom_filter *filter)
Definition: bloomfilter.c:187
@ BAS_BULKREAD
Definition: bufmgr.h:35
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:229
Pointer Page
Definition: bufpage.h:78
unsigned int uint32
Definition: c.h:495
#define Max(x, y)
Definition: c.h:987
#define INT64_FORMAT
Definition: c.h:537
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1229
int errhint(const char *fmt,...)
Definition: elog.c:1316
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
int maintenance_work_mem
Definition: globals.c:129
#define HeapTupleHeaderGetXmin(tup)
Definition: htup_details.h:309
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2425
Assert(fmt[strlen(fmt) - 1] !='\n')
void * palloc0(Size size)
Definition: mcxt.c:1257
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:403
#define AllocSetContextCreate
Definition: memutils.h:126
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:150
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
#define P_NONE
Definition: nbtree.h:212
#define BTREE_METAPAGE
Definition: nbtree.h:148
uint64 pg_prng_uint64(pg_prng_state *state)
Definition: pg_prng.c:134
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
#define ERRCODE_T_R_SERIALIZATION_FAILURE
Definition: pgbench.c:76
#define RelationGetRelationName(relation)
Definition: rel.h:538
TransactionId RecentXmin
Definition: snapmgr.c:106
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:223
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:843
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:801
#define SnapshotAny
Definition: snapmgr.h:33
#define InvalidSnapshot
Definition: snapshot.h:123
uint32 btm_level
Definition: nbtree.h:108
BlockNumber btm_fastroot
Definition: nbtree.h:109
BlockNumber btm_root
Definition: nbtree.h:107
uint32 btm_fastlevel
Definition: nbtree.h:110
bool istruerootlevel
uint32 level
BlockNumber leftmost
HeapTupleHeader t_data
Definition: htup.h:68
bool ii_Unique
Definition: execnodes.h:190
uint16 * ii_ExclusionStrats
Definition: execnodes.h:186
Oid * ii_ExclusionOps
Definition: execnodes.h:184
bool ii_Concurrent
Definition: execnodes.h:195
Oid * ii_ExclusionProcs
Definition: execnodes.h:185
struct HeapTupleData * rd_indextuple
Definition: rel.h:193
Form_pg_index rd_index
Definition: rel.h:191
TransactionId xmin
Definition: snapshot.h:157
Definition: regguts.h:323
static TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, bool allow_strat, bool allow_sync)
Definition: tableam.h:925
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1772
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
#define TransactionIdIsValid(xid)
Definition: transam.h:41
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
static void bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate)
#define InvalidBtreeLevel
Definition: verify_nbtree.c:52
static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
#define IsolationUsesXactSnapshot()
Definition: xact.h:51

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, Assert(), BAS_BULKREAD, bloom_create(), bloom_free(), bloom_prop_bits_set(), bt_check_level_from_leftmost(), bt_tuple_present_callback(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTPageGetMeta, BTREE_METAPAGE, BuildIndexInfo(), CurrentMemoryContext, DEBUG1, elog(), ereport, errcode(), ERRCODE_T_R_SERIALIZATION_FAILURE, errdetail_internal(), errhint(), errmsg(), errmsg_internal(), ERROR, GetAccessStrategy(), GetTransactionSnapshot(), HeapTupleHeaderGetXmin, IndexInfo::ii_Concurrent, IndexInfo::ii_ExclusionOps, IndexInfo::ii_ExclusionProcs, IndexInfo::ii_ExclusionStrats, IndexInfo::ii_Unique, INT64_FORMAT, InvalidBlockNumber, InvalidBtreeLevel, InvalidSnapshot, IsolationUsesXactSnapshot, BtreeLevel::istruerootlevel, BtreeLevel::leftmost, BtreeLevel::level, maintenance_work_mem, Max, MaxTIDsPerBTreePage, MemoryContextDelete(), P_NONE, palloc0(), palloc_btree_page(), pg_global_prng_state, pg_prng_uint64(), RelationData::rd_index, RelationData::rd_indextuple, RecentXmin, RegisterSnapshot(), RelationGetNumberOfBlocks, RelationGetRelationName, SnapshotAny, HeapTupleData::t_data, table_beginscan_strat(), table_index_build_scan(), TransactionIdIsValid, TransactionIdPrecedes(), UnregisterSnapshot(), and SnapshotData::xmin.

Referenced by bt_index_check_internal().

◆ bt_check_level_from_leftmost()

static BtreeLevel bt_check_level_from_leftmost ( BtreeCheckState state,
BtreeLevel  level 
)
static

Definition at line 750 of file verify_nbtree.c.

751 {
752  /* State to establish early, concerning entire level */
753  BTPageOpaque opaque;
754  MemoryContext oldcontext;
755  BtreeLevel nextleveldown;
756 
757  /* Variables for iterating across level using right links */
758  BlockNumber leftcurrent = P_NONE;
759  BlockNumber current = level.leftmost;
760 
761  /* Initialize return state */
762  nextleveldown.leftmost = InvalidBlockNumber;
763  nextleveldown.level = InvalidBtreeLevel;
764  nextleveldown.istruerootlevel = false;
765 
766  /* Use page-level context for duration of this call */
767  oldcontext = MemoryContextSwitchTo(state->targetcontext);
768 
769  elog(DEBUG1, "verifying level %u%s", level.level,
770  level.istruerootlevel ?
771  " (true root level)" : level.level == 0 ? " (leaf level)" : "");
772 
773  state->prevrightlink = InvalidBlockNumber;
774  state->previncompletesplit = false;
775 
776  do
777  {
778  /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
780 
781  /* Initialize state for this iteration */
782  state->targetblock = current;
783  state->target = palloc_btree_page(state, state->targetblock);
784  state->targetlsn = PageGetLSN(state->target);
785 
786  opaque = BTPageGetOpaque(state->target);
787 
788  if (P_IGNORE(opaque))
789  {
790  /*
791  * Since there cannot be a concurrent VACUUM operation in readonly
792  * mode, and since a page has no links within other pages
793  * (siblings and parent) once it is marked fully deleted, it
794  * should be impossible to land on a fully deleted page in
795  * readonly mode. See bt_child_check() for further details.
796  *
797  * The bt_child_check() P_ISDELETED() check is repeated here so
798  * that pages that are only reachable through sibling links get
799  * checked.
800  */
801  if (state->readonly && P_ISDELETED(opaque))
802  ereport(ERROR,
803  (errcode(ERRCODE_INDEX_CORRUPTED),
804  errmsg("downlink or sibling link points to deleted block in index \"%s\"",
806  errdetail_internal("Block=%u left block=%u left link from block=%u.",
807  current, leftcurrent, opaque->btpo_prev)));
808 
809  if (P_RIGHTMOST(opaque))
810  ereport(ERROR,
811  (errcode(ERRCODE_INDEX_CORRUPTED),
812  errmsg("block %u fell off the end of index \"%s\"",
813  current, RelationGetRelationName(state->rel))));
814  else
815  ereport(DEBUG1,
816  (errcode(ERRCODE_NO_DATA),
817  errmsg_internal("block %u of index \"%s\" concurrently deleted",
818  current, RelationGetRelationName(state->rel))));
819  goto nextpage;
820  }
821  else if (nextleveldown.leftmost == InvalidBlockNumber)
822  {
823  /*
824  * A concurrent page split could make the caller supplied leftmost
825  * block no longer contain the leftmost page, or no longer be the
826  * true root, but where that isn't possible due to heavyweight
827  * locking, check that the first valid page meets caller's
828  * expectations.
829  */
830  if (state->readonly)
831  {
832  if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
833  ereport(ERROR,
834  (errcode(ERRCODE_INDEX_CORRUPTED),
835  errmsg("block %u is not leftmost in index \"%s\"",
836  current, RelationGetRelationName(state->rel))));
837 
838  if (level.istruerootlevel && !P_ISROOT(opaque))
839  ereport(ERROR,
840  (errcode(ERRCODE_INDEX_CORRUPTED),
841  errmsg("block %u is not true root in index \"%s\"",
842  current, RelationGetRelationName(state->rel))));
843  }
844 
845  /*
846  * Before beginning any non-trivial examination of level, prepare
847  * state for next bt_check_level_from_leftmost() invocation for
848  * the next level for the next level down (if any).
849  *
850  * There should be at least one non-ignorable page per level,
851  * unless this is the leaf level, which is assumed by caller to be
852  * final level.
853  */
854  if (!P_ISLEAF(opaque))
855  {
856  IndexTuple itup;
857  ItemId itemid;
858 
859  /* Internal page -- downlink gets leftmost on next level */
860  itemid = PageGetItemIdCareful(state, state->targetblock,
861  state->target,
862  P_FIRSTDATAKEY(opaque));
863  itup = (IndexTuple) PageGetItem(state->target, itemid);
864  nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
865  nextleveldown.level = opaque->btpo_level - 1;
866  }
867  else
868  {
869  /*
870  * Leaf page -- final level caller must process.
871  *
872  * Note that this could also be the root page, if there has
873  * been no root page split yet.
874  */
875  nextleveldown.leftmost = P_NONE;
876  nextleveldown.level = InvalidBtreeLevel;
877  }
878 
879  /*
880  * Finished setting up state for this call/level. Control will
881  * never end up back here in any future loop iteration for this
882  * level.
883  */
884  }
885 
886  /*
887  * Sibling links should be in mutual agreement. There arises
888  * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
889  * of the parent's low-key downlink is half-dead. (A half-dead page
890  * has no downlink from its parent.) Under heavyweight locking, the
891  * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
892  * Without heavyweight locking, validation of the P_NONE case remains
893  * unimplemented.
894  */
895  if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
896  bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
897 
898  /* Check level */
899  if (level.level != opaque->btpo_level)
900  ereport(ERROR,
901  (errcode(ERRCODE_INDEX_CORRUPTED),
902  errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
904  errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
905  current, level.level, opaque->btpo_level)));
906 
907  /* Verify invariants for page */
909 
910 nextpage:
911 
912  /* Try to detect circular links */
913  if (current == leftcurrent || current == opaque->btpo_prev)
914  ereport(ERROR,
915  (errcode(ERRCODE_INDEX_CORRUPTED),
916  errmsg("circular link chain found in block %u of index \"%s\"",
917  current, RelationGetRelationName(state->rel))));
918 
919  leftcurrent = current;
920  current = opaque->btpo_next;
921 
922  if (state->lowkey)
923  {
924  Assert(state->readonly);
925  pfree(state->lowkey);
926  state->lowkey = NULL;
927  }
928 
929  /*
930  * Copy current target high key as the low key of right sibling.
931  * Allocate memory in upper level context, so it would be cleared
932  * after reset of target context.
933  *
934  * We only need the low key in corner cases of checking child high
935  * keys. We use high key only when incomplete split on the child level
936  * falls to the boundary of pages on the target level. See
937  * bt_child_highkey_check() for details. So, typically we won't end
938  * up doing anything with low key, but it's simpler for general case
939  * high key verification to always have it available.
940  *
941  * The correctness of managing low key in the case of concurrent
942  * splits wasn't investigated yet. Thankfully we only need low key
943  * for readonly verification and concurrent splits won't happen.
944  */
945  if (state->readonly && !P_RIGHTMOST(opaque))
946  {
947  IndexTuple itup;
948  ItemId itemid;
949 
950  itemid = PageGetItemIdCareful(state, state->targetblock,
951  state->target, P_HIKEY);
952  itup = (IndexTuple) PageGetItem(state->target, itemid);
953 
954  state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup));
955  memcpy(state->lowkey, itup, IndexTupleSize(itup));
956  }
957 
958  /* Free page and associated memory for this iteration */
959  MemoryContextReset(state->targetcontext);
960  }
961  while (current != P_NONE);
962 
963  if (state->lowkey)
964  {
965  Assert(state->readonly);
966  pfree(state->lowkey);
967  state->lowkey = NULL;
968  }
969 
970  /* Don't change context for caller */
971  MemoryContextSwitchTo(oldcontext);
972 
973  return nextleveldown;
974 }
uint32 BlockNumber
Definition: block.h:31
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:330
void pfree(void *pointer)
Definition: mcxt.c:1456
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1021
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define P_HIKEY
Definition: nbtree.h:367
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
#define P_ISROOT(opaque)
Definition: nbtree.h:221
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint32 btpo_level
Definition: nbtree.h:66
static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, BlockNumber start, BTPageOpaque start_opaque)
static void bt_target_page_check(BtreeCheckState *state)
static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, OffsetNumber offset)
static void bt_recheck_sibling_links(BtreeCheckState *state, BlockNumber btpo_prev_from_target, BlockNumber leftcurrent)

References Assert(), bt_leftmost_ignoring_half_dead(), bt_recheck_sibling_links(), bt_target_page_check(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), CHECK_FOR_INTERRUPTS, DEBUG1, elog(), ereport, errcode(), errdetail_internal(), errmsg(), errmsg_internal(), ERROR, IndexTupleSize, InvalidBlockNumber, InvalidBtreeLevel, BtreeLevel::istruerootlevel, BtreeLevel::leftmost, BtreeLevel::level, MemoryContextAlloc(), MemoryContextReset(), MemoryContextSwitchTo(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISDELETED, P_ISLEAF, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetLSN(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_check_every_level().

◆ bt_child_check()

static void bt_child_check ( BtreeCheckState state,
BTScanInsert  targetkey,
OffsetNumber  downlinkoffnum 
)
static

Definition at line 2484 of file verify_nbtree.c.

2486 {
2487  ItemId itemid;
2488  IndexTuple itup;
2489  BlockNumber childblock;
2490  OffsetNumber offset;
2491  OffsetNumber maxoffset;
2492  Page child;
2493  BTPageOpaque copaque;
2494  BTPageOpaque topaque;
2495 
2496  itemid = PageGetItemIdCareful(state, state->targetblock,
2497  state->target, downlinkoffnum);
2498  itup = (IndexTuple) PageGetItem(state->target, itemid);
2499  childblock = BTreeTupleGetDownLink(itup);
2500 
2501  /*
2502  * Caller must have ShareLock on target relation, because of
2503  * considerations around page deletion by VACUUM.
2504  *
2505  * NB: In general, page deletion deletes the right sibling's downlink, not
2506  * the downlink of the page being deleted; the deleted page's downlink is
2507  * reused for its sibling. The key space is thereby consolidated between
2508  * the deleted page and its right sibling. (We cannot delete a parent
2509  * page's rightmost child unless it is the last child page, and we intend
2510  * to also delete the parent itself.)
2511  *
2512  * If this verification happened without a ShareLock, the following race
2513  * condition could cause false positives:
2514  *
2515  * In general, concurrent page deletion might occur, including deletion of
2516  * the left sibling of the child page that is examined here. If such a
2517  * page deletion were to occur, closely followed by an insertion into the
2518  * newly expanded key space of the child, a window for the false positive
2519  * opens up: the stale parent/target downlink originally followed to get
2520  * to the child legitimately ceases to be a lower bound on all items in
2521  * the page, since the key space was concurrently expanded "left".
2522  * (Insertion followed the "new" downlink for the child, not our now-stale
2523  * downlink, which was concurrently physically removed in target/parent as
2524  * part of deletion's first phase.)
2525  *
2526  * While we use various techniques elsewhere to perform cross-page
2527  * verification for !readonly callers, a similar trick seems difficult
2528  * here. The tricks used by bt_recheck_sibling_links and by
2529  * bt_right_page_check_scankey both involve verification of a same-level,
2530  * cross-sibling invariant. Cross-level invariants are far more squishy,
2531  * though. The nbtree REDO routines do not actually couple buffer locks
2532  * across levels during page splits, so making any cross-level check work
2533  * reliably in !readonly mode may be impossible.
2534  */
2535  Assert(state->readonly);
2536 
2537  /*
2538  * Verify child page has the downlink key from target page (its parent) as
2539  * a lower bound; downlink must be strictly less than all keys on the
2540  * page.
2541  *
2542  * Check all items, rather than checking just the first and trusting that
2543  * the operator class obeys the transitive law.
2544  */
2545  topaque = BTPageGetOpaque(state->target);
2546  child = palloc_btree_page(state, childblock);
2547  copaque = BTPageGetOpaque(child);
2548  maxoffset = PageGetMaxOffsetNumber(child);
2549 
2550  /*
2551  * Since we've already loaded the child block, combine this check with
2552  * check for downlink connectivity.
2553  */
2554  bt_child_highkey_check(state, downlinkoffnum,
2555  child, topaque->btpo_level);
2556 
2557  /*
2558  * Since there cannot be a concurrent VACUUM operation in readonly mode,
2559  * and since a page has no links within other pages (siblings and parent)
2560  * once it is marked fully deleted, it should be impossible to land on a
2561  * fully deleted page.
2562  *
2563  * It does not quite make sense to enforce that the page cannot even be
2564  * half-dead, despite the fact the downlink is modified at the same stage
2565  * that the child leaf page is marked half-dead. That's incorrect because
2566  * there may occasionally be multiple downlinks from a chain of pages
2567  * undergoing deletion, where multiple successive calls are made to
2568  * _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark
2569  * the leaf page as fully dead. While _bt_mark_page_halfdead() usually
2570  * removes the downlink to the leaf page that is marked half-dead, that's
2571  * not guaranteed, so it's possible we'll land on a half-dead page with a
2572  * downlink due to an interrupted multi-level page deletion.
2573  *
2574  * We go ahead with our checks if the child page is half-dead. It's safe
2575  * to do so because we do not test the child's high key, so it does not
2576  * matter that the original high key will have been replaced by a dummy
2577  * truncated high key within _bt_mark_page_halfdead(). All other page
2578  * items are left intact on a half-dead page, so there is still something
2579  * to test.
2580  */
2581  if (P_ISDELETED(copaque))
2582  ereport(ERROR,
2583  (errcode(ERRCODE_INDEX_CORRUPTED),
2584  errmsg("downlink to deleted page found in index \"%s\"",
2586  errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.",
2587  state->targetblock, childblock,
2588  LSN_FORMAT_ARGS(state->targetlsn))));
2589 
2590  for (offset = P_FIRSTDATAKEY(copaque);
2591  offset <= maxoffset;
2592  offset = OffsetNumberNext(offset))
2593  {
2594  /*
2595  * Skip comparison of target page key against "negative infinity"
2596  * item, if any. Checking it would indicate that it's not a strict
2597  * lower bound, but that's only because of the hard-coding for
2598  * negative infinity items within _bt_compare().
2599  *
2600  * If nbtree didn't truncate negative infinity tuples during internal
2601  * page splits then we'd expect child's negative infinity key to be
2602  * equal to the scankey/downlink from target/parent (it would be a
2603  * "low key" in this hypothetical scenario, and so it would still need
2604  * to be treated as a special case here).
2605  *
2606  * Negative infinity items can be thought of as a strict lower bound
2607  * that works transitively, with the last non-negative-infinity pivot
2608  * followed during a descent from the root as its "true" strict lower
2609  * bound. Only a small number of negative infinity items are truly
2610  * negative infinity; those that are the first items of leftmost
2611  * internal pages. In more general terms, a negative infinity item is
2612  * only negative infinity with respect to the subtree that the page is
2613  * at the root of.
2614  *
2615  * See also: bt_rootdescend(), which can even detect transitive
2616  * inconsistencies on cousin leaf pages.
2617  */
2618  if (offset_is_negative_infinity(copaque, offset))
2619  continue;
2620 
2621  if (!invariant_l_nontarget_offset(state, targetkey, childblock, child,
2622  offset))
2623  ereport(ERROR,
2624  (errcode(ERRCODE_INDEX_CORRUPTED),
2625  errmsg("down-link lower bound invariant violated for index \"%s\"",
2627  errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.",
2628  state->targetblock, childblock, offset,
2629  LSN_FORMAT_ARGS(state->targetlsn))));
2630  }
2631 
2632  pfree(child);
2633 }
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
static void bt_child_highkey_check(BtreeCheckState *state, OffsetNumber target_downlinkoffnum, Page loaded_child, uint32 target_level)
static bool invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, BlockNumber nontargetblock, Page nontarget, OffsetNumber upperbound)
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43

References Assert(), bt_child_highkey_check(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), ereport, errcode(), errdetail_internal(), errmsg(), ERROR, invariant_l_nontarget_offset(), LSN_FORMAT_ARGS, offset_is_negative_infinity(), OffsetNumberNext, P_FIRSTDATAKEY, P_ISDELETED, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_target_page_check().

◆ bt_child_highkey_check()

static void bt_child_highkey_check ( BtreeCheckState state,
OffsetNumber  target_downlinkoffnum,
Page  loaded_child,
uint32  target_level 
)
static

Definition at line 2237 of file verify_nbtree.c.

2241 {
2242  BlockNumber blkno = state->prevrightlink;
2243  Page page;
2244  BTPageOpaque opaque;
2245  bool rightsplit = state->previncompletesplit;
2246  bool first = true;
2247  ItemId itemid;
2248  IndexTuple itup;
2249  BlockNumber downlink;
2250 
2251  if (OffsetNumberIsValid(target_downlinkoffnum))
2252  {
2253  itemid = PageGetItemIdCareful(state, state->targetblock,
2254  state->target, target_downlinkoffnum);
2255  itup = (IndexTuple) PageGetItem(state->target, itemid);
2256  downlink = BTreeTupleGetDownLink(itup);
2257  }
2258  else
2259  {
2260  downlink = P_NONE;
2261  }
2262 
2263  /*
2264  * If no previous rightlink is memorized for current level just below
2265  * target page's level, we are about to start from the leftmost page. We
2266  * can't follow rightlinks from previous page, because there is no
2267  * previous page. But we still can match high key.
2268  *
2269  * So we initialize variables for the loop above like there is previous
2270  * page referencing current child. Also we imply previous page to not
2271  * have incomplete split flag, that would make us require downlink for
2272  * current child. That's correct, because leftmost page on the level
2273  * should always have parent downlink.
2274  */
2275  if (!BlockNumberIsValid(blkno))
2276  {
2277  blkno = downlink;
2278  rightsplit = false;
2279  }
2280 
2281  /* Move to the right on the child level */
2282  while (true)
2283  {
2284  /*
2285  * Did we traverse the whole tree level and this is check for pages to
2286  * the right of rightmost downlink?
2287  */
2288  if (blkno == P_NONE && downlink == P_NONE)
2289  {
2290  state->prevrightlink = InvalidBlockNumber;
2291  state->previncompletesplit = false;
2292  return;
2293  }
2294 
2295  /* Did we traverse the whole tree level and don't find next downlink? */
2296  if (blkno == P_NONE)
2297  ereport(ERROR,
2298  (errcode(ERRCODE_INDEX_CORRUPTED),
2299  errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
2300  state->prevrightlink, downlink,
2301  RelationGetRelationName(state->rel))));
2302 
2303  /* Load page contents */
2304  if (blkno == downlink && loaded_child)
2305  page = loaded_child;
2306  else
2307  page = palloc_btree_page(state, blkno);
2308 
2309  opaque = BTPageGetOpaque(page);
2310 
2311  /* The first page we visit at the level should be leftmost */
2312  if (first && !BlockNumberIsValid(state->prevrightlink) &&
2313  !bt_leftmost_ignoring_half_dead(state, blkno, opaque))
2314  ereport(ERROR,
2315  (errcode(ERRCODE_INDEX_CORRUPTED),
2316  errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",
2318  errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
2319  state->targetblock, blkno,
2320  LSN_FORMAT_ARGS(state->targetlsn))));
2321 
2322  /* Do level sanity check */
2323  if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
2324  opaque->btpo_level != target_level - 1)
2325  ereport(ERROR,
2326  (errcode(ERRCODE_INDEX_CORRUPTED),
2327  errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
2329  errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
2330  blkno, target_level - 1, opaque->btpo_level)));
2331 
2332  /* Try to detect circular links */
2333  if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
2334  ereport(ERROR,
2335  (errcode(ERRCODE_INDEX_CORRUPTED),
2336  errmsg("circular link chain found in block %u of index \"%s\"",
2337  blkno, RelationGetRelationName(state->rel))));
2338 
2339  if (blkno != downlink && !P_IGNORE(opaque))
2340  {
2341  /* blkno probably has missing parent downlink */
2342  bt_downlink_missing_check(state, rightsplit, blkno, page);
2343  }
2344 
2345  rightsplit = P_INCOMPLETE_SPLIT(opaque);
2346 
2347  /*
2348  * If we visit page with high key, check that it is equal to the
2349  * target key next to corresponding downlink.
2350  */
2351  if (!rightsplit && !P_RIGHTMOST(opaque))
2352  {
2353  BTPageOpaque topaque;
2354  IndexTuple highkey;
2355  OffsetNumber pivotkey_offset;
2356 
2357  /* Get high key */
2358  itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY);
2359  highkey = (IndexTuple) PageGetItem(page, itemid);
2360 
2361  /*
2362  * There might be two situations when we examine high key. If
2363  * current child page is referenced by given target downlink, we
2364  * should look to the next offset number for matching key from
2365  * target page.
2366  *
2367  * Alternatively, we're following rightlinks somewhere in the
2368  * middle between page referenced by previous target's downlink
2369  * and the page referenced by current target's downlink. If
2370  * current child page hasn't incomplete split flag set, then its
2371  * high key should match to the target's key of current offset
2372  * number. This happens when a previous call here (to
2373  * bt_child_highkey_check()) found an incomplete split, and we
2374  * reach a right sibling page without a downlink -- the right
2375  * sibling page's high key still needs to be matched to a
2376  * separator key on the parent/target level.
2377  *
2378  * Don't apply OffsetNumberNext() to target_downlinkoffnum when we
2379  * already had to step right on the child level. Our traversal of
2380  * the child level must try to move in perfect lockstep behind (to
2381  * the left of) the target/parent level traversal.
2382  */
2383  if (blkno == downlink)
2384  pivotkey_offset = OffsetNumberNext(target_downlinkoffnum);
2385  else
2386  pivotkey_offset = target_downlinkoffnum;
2387 
2388  topaque = BTPageGetOpaque(state->target);
2389 
2390  if (!offset_is_negative_infinity(topaque, pivotkey_offset))
2391  {
2392  /*
2393  * If we're looking for the next pivot tuple in target page,
2394  * but there is no more pivot tuples, then we should match to
2395  * high key instead.
2396  */
2397  if (pivotkey_offset > PageGetMaxOffsetNumber(state->target))
2398  {
2399  if (P_RIGHTMOST(topaque))
2400  ereport(ERROR,
2401  (errcode(ERRCODE_INDEX_CORRUPTED),
2402  errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"",
2404  errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
2405  state->targetblock, blkno,
2406  LSN_FORMAT_ARGS(state->targetlsn))));
2407  pivotkey_offset = P_HIKEY;
2408  }
2409  itemid = PageGetItemIdCareful(state, state->targetblock,
2410  state->target, pivotkey_offset);
2411  itup = (IndexTuple) PageGetItem(state->target, itemid);
2412  }
2413  else
2414  {
2415  /*
2416  * We cannot try to match child's high key to a negative
2417  * infinity key in target, since there is nothing to compare.
2418  * However, it's still possible to match child's high key
2419  * outside of target page. The reason why we're are is that
2420  * bt_child_highkey_check() was previously called for the
2421  * cousin page of 'loaded_child', which is incomplete split.
2422  * So, now we traverse to the right of that cousin page and
2423  * current child level page under consideration still belongs
2424  * to the subtree of target's left sibling. Thus, we need to
2425  * match child's high key to it's left uncle page high key.
2426  * Thankfully we saved it, it's called a "low key" of target
2427  * page.
2428  */
2429  if (!state->lowkey)
2430  ereport(ERROR,
2431  (errcode(ERRCODE_INDEX_CORRUPTED),
2432  errmsg("can't find left sibling high key in index \"%s\"",
2434  errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
2435  state->targetblock, blkno,
2436  LSN_FORMAT_ARGS(state->targetlsn))));
2437  itup = state->lowkey;
2438  }
2439 
2440  if (!bt_pivot_tuple_identical(state->heapkeyspace, highkey, itup))
2441  {
2442  ereport(ERROR,
2443  (errcode(ERRCODE_INDEX_CORRUPTED),
2444  errmsg("mismatch between parent key and child high key in index \"%s\"",
2446  errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
2447  state->targetblock, blkno,
2448  LSN_FORMAT_ARGS(state->targetlsn))));
2449  }
2450  }
2451 
2452  /* Exit if we already found next downlink */
2453  if (blkno == downlink)
2454  {
2455  state->prevrightlink = opaque->btpo_next;
2456  state->previncompletesplit = rightsplit;
2457  return;
2458  }
2459 
2460  /* Traverse to the next page using rightlink */
2461  blkno = opaque->btpo_next;
2462 
2463  /* Free page contents if it's allocated by us */
2464  if (page != loaded_child)
2465  pfree(page);
2466  first = false;
2467  }
2468 }
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define P_HAS_FULLXID(opaque)
Definition: nbtree.h:228
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
static bool bt_pivot_tuple_identical(bool heapkeyspace, IndexTuple itup1, IndexTuple itup2)
static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, BlockNumber blkno, Page page)

References BlockNumberIsValid(), bt_downlink_missing_check(), bt_leftmost_ignoring_half_dead(), bt_pivot_tuple_identical(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), ereport, errcode(), errdetail_internal(), errmsg(), ERROR, InvalidBlockNumber, LSN_FORMAT_ARGS, offset_is_negative_infinity(), OffsetNumberIsValid, OffsetNumberNext, P_HAS_FULLXID, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_child_check(), and bt_target_page_check().

◆ bt_downlink_missing_check()

static void bt_downlink_missing_check ( BtreeCheckState state,
bool  rightsplit,
BlockNumber  blkno,
Page  page 
)
static

Definition at line 2649 of file verify_nbtree.c.

2651 {
2652  BTPageOpaque opaque = BTPageGetOpaque(page);
2653  ItemId itemid;
2654  IndexTuple itup;
2655  Page child;
2656  BTPageOpaque copaque;
2657  uint32 level;
2658  BlockNumber childblk;
2659  XLogRecPtr pagelsn;
2660 
2661  Assert(state->readonly);
2662  Assert(!P_IGNORE(opaque));
2663 
2664  /* No next level up with downlinks to fingerprint from the true root */
2665  if (P_ISROOT(opaque))
2666  return;
2667 
2668  pagelsn = PageGetLSN(page);
2669 
2670  /*
2671  * Incomplete (interrupted) page splits can account for the lack of a
2672  * downlink. Some inserting transaction should eventually complete the
2673  * page split in passing, when it notices that the left sibling page is
2674  * P_INCOMPLETE_SPLIT().
2675  *
2676  * In general, VACUUM is not prepared for there to be no downlink to a
2677  * page that it deletes. This is the main reason why the lack of a
2678  * downlink can be reported as corruption here. It's not obvious that an
2679  * invalid missing downlink can result in wrong answers to queries,
2680  * though, since index scans that land on the child may end up
2681  * consistently moving right. The handling of concurrent page splits (and
2682  * page deletions) within _bt_moveright() cannot distinguish
2683  * inconsistencies that last for a moment from inconsistencies that are
2684  * permanent and irrecoverable.
2685  *
2686  * VACUUM isn't even prepared to delete pages that have no downlink due to
2687  * an incomplete page split, but it can detect and reason about that case
2688  * by design, so it shouldn't be taken to indicate corruption. See
2689  * _bt_pagedel() for full details.
2690  */
2691  if (rightsplit)
2692  {
2693  ereport(DEBUG1,
2694  (errcode(ERRCODE_NO_DATA),
2695  errmsg_internal("harmless interrupted page split detected in index \"%s\"",
2697  errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
2698  blkno, opaque->btpo_level,
2699  opaque->btpo_prev,
2700  LSN_FORMAT_ARGS(pagelsn))));
2701  return;
2702  }
2703 
2704  /*
2705  * Page under check is probably the "top parent" of a multi-level page
2706  * deletion. We'll need to descend the subtree to make sure that
2707  * descendant pages are consistent with that, though.
2708  *
2709  * If the page (which must be non-ignorable) is a leaf page, then clearly
2710  * it can't be the top parent. The lack of a downlink is probably a
2711  * symptom of a broad problem that could just as easily cause
2712  * inconsistencies anywhere else.
2713  */
2714  if (P_ISLEAF(opaque))
2715  ereport(ERROR,
2716  (errcode(ERRCODE_INDEX_CORRUPTED),
2717  errmsg("leaf index block lacks downlink in index \"%s\"",
2719  errdetail_internal("Block=%u page lsn=%X/%X.",
2720  blkno,
2721  LSN_FORMAT_ARGS(pagelsn))));
2722 
2723  /* Descend from the given page, which is an internal page */
2724  elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
2726 
2727  level = opaque->btpo_level;
2728  itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
2729  itup = (IndexTuple) PageGetItem(page, itemid);
2730  childblk = BTreeTupleGetDownLink(itup);
2731  for (;;)
2732  {
2734 
2735  child = palloc_btree_page(state, childblk);
2736  copaque = BTPageGetOpaque(child);
2737 
2738  if (P_ISLEAF(copaque))
2739  break;
2740 
2741  /* Do an extra sanity check in passing on internal pages */
2742  if (copaque->btpo_level != level - 1)
2743  ereport(ERROR,
2744  (errcode(ERRCODE_INDEX_CORRUPTED),
2745  errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
2747  errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
2748  blkno, childblk,
2749  level - 1, copaque->btpo_level)));
2750 
2751  level = copaque->btpo_level;
2752  itemid = PageGetItemIdCareful(state, childblk, child,
2753  P_FIRSTDATAKEY(copaque));
2754  itup = (IndexTuple) PageGetItem(child, itemid);
2755  childblk = BTreeTupleGetDownLink(itup);
2756  /* Be slightly more pro-active in freeing this memory, just in case */
2757  pfree(child);
2758  }
2759 
2760  /*
2761  * Since there cannot be a concurrent VACUUM operation in readonly mode,
2762  * and since a page has no links within other pages (siblings and parent)
2763  * once it is marked fully deleted, it should be impossible to land on a
2764  * fully deleted page. See bt_child_check() for further details.
2765  *
2766  * The bt_child_check() P_ISDELETED() check is repeated here because
2767  * bt_child_check() does not visit pages reachable through negative
2768  * infinity items. Besides, bt_child_check() is unwilling to descend
2769  * multiple levels. (The similar bt_child_check() P_ISDELETED() check
2770  * within bt_check_level_from_leftmost() won't reach the page either,
2771  * since the leaf's live siblings should have their sibling links updated
2772  * to bypass the deletion target page when it is marked fully dead.)
2773  *
2774  * If this error is raised, it might be due to a previous multi-level page
2775  * deletion that failed to realize that it wasn't yet safe to mark the
2776  * leaf page as fully dead. A "dangling downlink" will still remain when
2777  * this happens. The fact that the dangling downlink's page (the leaf's
2778  * parent/ancestor page) lacked a downlink is incidental.
2779  */
2780  if (P_ISDELETED(copaque))
2781  ereport(ERROR,
2782  (errcode(ERRCODE_INDEX_CORRUPTED),
2783  errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
2785  errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.",
2786  blkno, childblk,
2787  LSN_FORMAT_ARGS(pagelsn))));
2788 
2789  /*
2790  * Iff leaf page is half-dead, its high key top parent link should point
2791  * to what VACUUM considered to be the top parent page at the instant it
2792  * was interrupted. Provided the high key link actually points to the
2793  * page under check, the missing downlink we detected is consistent with
2794  * there having been an interrupted multi-level page deletion. This means
2795  * that the subtree with the page under check at its root (a page deletion
2796  * chain) is in a consistent state, enabling VACUUM to resume deleting the
2797  * entire chain the next time it encounters the half-dead leaf page.
2798  */
2799  if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
2800  {
2801  itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
2802  itup = (IndexTuple) PageGetItem(child, itemid);
2803  if (BTreeTupleGetTopParent(itup) == blkno)
2804  return;
2805  }
2806 
2807  ereport(ERROR,
2808  (errcode(ERRCODE_INDEX_CORRUPTED),
2809  errmsg("internal index block lacks downlink in index \"%s\"",
2811  errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
2812  blkno, opaque->btpo_level,
2813  LSN_FORMAT_ARGS(pagelsn))));
2814 }
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:224
static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)
Definition: nbtree.h:620
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), BTreeTupleGetTopParent(), CHECK_FOR_INTERRUPTS, DEBUG1, elog(), ereport, errcode(), errdetail_internal(), errmsg(), errmsg_internal(), ERROR, LSN_FORMAT_ARGS, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetLSN(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_child_highkey_check().

◆ bt_entry_unique_check()

static void bt_entry_unique_check ( BtreeCheckState state,
IndexTuple  itup,
BlockNumber  targetblock,
OffsetNumber  offset,
int *  lVis_i,
ItemPointer lVis_tid,
OffsetNumber lVis_offset,
BlockNumber lVis_block 
)
static

Definition at line 1038 of file verify_nbtree.c.

1042 {
1043  ItemPointer tid;
1044  bool has_visible_entry = false;
1045 
1046  Assert(targetblock != P_NONE);
1047 
1048  /*
1049  * Current tuple has posting list. Report duplicate if TID of any posting
1050  * list entry is visible and lVis_tid is valid.
1051  */
1052  if (BTreeTupleIsPosting(itup))
1053  {
1054  for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
1055  {
1056  tid = BTreeTupleGetPostingN(itup, i);
1057  if (heap_entry_is_visible(state, tid))
1058  {
1059  has_visible_entry = true;
1060  if (ItemPointerIsValid(*lVis_tid))
1061  {
1063  *lVis_tid, *lVis_block,
1064  *lVis_offset, *lVis_i,
1065  tid, targetblock,
1066  offset, i);
1067  }
1068 
1069  /*
1070  * Prevent double reporting unique constraint violation
1071  * between the posting list entries of the first tuple on the
1072  * page after cross-page check.
1073  */
1074  if (*lVis_block != targetblock && ItemPointerIsValid(*lVis_tid))
1075  return;
1076 
1077  *lVis_i = i;
1078  *lVis_tid = tid;
1079  *lVis_offset = offset;
1080  *lVis_block = targetblock;
1081  }
1082  }
1083  }
1084 
1085  /*
1086  * Current tuple has no posting list. If TID is visible save info about it
1087  * for the next comparisons in the loop in bt_page_check(). Report
1088  * duplicate if lVis_tid is already valid.
1089  */
1090  else
1091  {
1092  tid = BTreeTupleGetHeapTID(itup);
1093  if (heap_entry_is_visible(state, tid))
1094  {
1095  has_visible_entry = true;
1096  if (ItemPointerIsValid(*lVis_tid))
1097  {
1099  *lVis_tid, *lVis_block,
1100  *lVis_offset, *lVis_i,
1101  tid, targetblock,
1102  offset, -1);
1103  }
1104  *lVis_i = -1;
1105  *lVis_tid = tid;
1106  *lVis_offset = offset;
1107  *lVis_block = targetblock;
1108  }
1109  }
1110 
1111  if (!has_visible_entry && *lVis_block != InvalidBlockNumber &&
1112  *lVis_block != targetblock)
1113  {
1114  char *posting = "";
1115 
1116  if (*lVis_i >= 0)
1117  posting = psprintf(" posting %u", *lVis_i);
1118  ereport(DEBUG1,
1119  (errcode(ERRCODE_NO_DATA),
1120  errmsg("index uniqueness can not be checked for index tid=(%u,%u) in index \"%s\"",
1121  targetblock, offset,
1123  errdetail("It doesn't have visible heap tids and key is equal to the tid=(%u,%u)%s (points to heap tid=(%u,%u)).",
1124  *lVis_block, *lVis_offset, posting,
1127  errhint("VACUUM the table and repeat the check.")));
1128  }
1129 }
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int i
Definition: isn.c:73
static OffsetNumber ItemPointerGetOffsetNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:114
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:93
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
static void bt_report_duplicate(BtreeCheckState *state, ItemPointer tid, BlockNumber block, OffsetNumber offset, int posting, ItemPointer nexttid, BlockNumber nblock, OffsetNumber noffset, int nposting)
static bool heap_entry_is_visible(BtreeCheckState *state, ItemPointer tid)

References Assert(), bt_report_duplicate(), BTreeTupleGetHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), DEBUG1, ereport, errcode(), errdetail(), errhint(), errmsg(), heap_entry_is_visible(), i, InvalidBlockNumber, ItemPointerGetBlockNumberNoCheck(), ItemPointerGetOffsetNumberNoCheck(), ItemPointerIsValid(), P_NONE, psprintf(), and RelationGetRelationName.

Referenced by bt_target_page_check().

◆ bt_index_check()

Datum bt_index_check ( PG_FUNCTION_ARGS  )

Definition at line 229 of file verify_nbtree.c.

230 {
231  Oid indrelid = PG_GETARG_OID(0);
232  bool heapallindexed = false;
233  bool checkunique = false;
234 
235  if (PG_NARGS() >= 2)
236  heapallindexed = PG_GETARG_BOOL(1);
237  if (PG_NARGS() == 3)
238  checkunique = PG_GETARG_BOOL(2);
239 
240  bt_index_check_internal(indrelid, false, heapallindexed, false, checkunique);
241 
242  PG_RETURN_VOID();
243 }
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_NARGS()
Definition: fmgr.h:203
#define PG_GETARG_BOOL(n)
Definition: fmgr.h:274
unsigned int Oid
Definition: postgres_ext.h:31
static void bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, bool rootdescend, bool checkunique)

References bt_index_check_internal(), PG_GETARG_BOOL, PG_GETARG_OID, PG_NARGS, and PG_RETURN_VOID.

◆ bt_index_check_internal()

static void bt_index_check_internal ( Oid  indrelid,
bool  parentcheck,
bool  heapallindexed,
bool  rootdescend,
bool  checkunique 
)
static

Definition at line 278 of file verify_nbtree.c.

280 {
281  Oid heapid;
282  Relation indrel;
283  Relation heaprel;
284  LOCKMODE lockmode;
285  Oid save_userid;
286  int save_sec_context;
287  int save_nestlevel;
288 
289  if (parentcheck)
290  lockmode = ShareLock;
291  else
292  lockmode = AccessShareLock;
293 
294  /*
295  * We must lock table before index to avoid deadlocks. However, if the
296  * passed indrelid isn't an index then IndexGetRelation() will fail.
297  * Rather than emitting a not-very-helpful error message, postpone
298  * complaining, expecting that the is-it-an-index test below will fail.
299  *
300  * In hot standby mode this will raise an error when parentcheck is true.
301  */
302  heapid = IndexGetRelation(indrelid, true);
303  if (OidIsValid(heapid))
304  {
305  heaprel = table_open(heapid, lockmode);
306 
307  /*
308  * Switch to the table owner's userid, so that any index functions are
309  * run as that user. Also lock down security-restricted operations
310  * and arrange to make GUC variable changes local to this command.
311  */
312  GetUserIdAndSecContext(&save_userid, &save_sec_context);
313  SetUserIdAndSecContext(heaprel->rd_rel->relowner,
314  save_sec_context | SECURITY_RESTRICTED_OPERATION);
315  save_nestlevel = NewGUCNestLevel();
316  }
317  else
318  {
319  heaprel = NULL;
320  /* Set these just to suppress "uninitialized variable" warnings */
321  save_userid = InvalidOid;
322  save_sec_context = -1;
323  save_nestlevel = -1;
324  }
325 
326  /*
327  * Open the target index relations separately (like relation_openrv(), but
328  * with heap relation locked first to prevent deadlocking). In hot
329  * standby mode this will raise an error when parentcheck is true.
330  *
331  * There is no need for the usual indcheckxmin usability horizon test
332  * here, even in the heapallindexed case, because index undergoing
333  * verification only needs to have entries for a new transaction snapshot.
334  * (If this is a parentcheck verification, there is no question about
335  * committed or recently dead heap tuples lacking index entries due to
336  * concurrent activity.)
337  */
338  indrel = index_open(indrelid, lockmode);
339 
340  /*
341  * Since we did the IndexGetRelation call above without any lock, it's
342  * barely possible that a race against an index drop/recreation could have
343  * netted us the wrong table.
344  */
345  if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
346  ereport(ERROR,
348  errmsg("could not open parent table of index \"%s\"",
349  RelationGetRelationName(indrel))));
350 
351  /* Relation suitable for checking as B-Tree? */
352  btree_index_checkable(indrel);
353 
354  if (btree_index_mainfork_expected(indrel))
355  {
356  bool heapkeyspace,
357  allequalimage;
358 
359  if (!smgrexists(RelationGetSmgr(indrel), MAIN_FORKNUM))
360  ereport(ERROR,
361  (errcode(ERRCODE_INDEX_CORRUPTED),
362  errmsg("index \"%s\" lacks a main relation fork",
363  RelationGetRelationName(indrel))));
364 
365  /* Extract metadata from metapage, and sanitize it in passing */
366  _bt_metaversion(indrel, &heapkeyspace, &allequalimage);
367  if (allequalimage && !heapkeyspace)
368  ereport(ERROR,
369  (errcode(ERRCODE_INDEX_CORRUPTED),
370  errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version",
371  RelationGetRelationName(indrel))));
372  if (allequalimage && !_bt_allequalimage(indrel, false))
373  {
374  bool has_interval_ops = false;
375 
376  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(indrel); i++)
377  if (indrel->rd_opfamily[i] == INTERVAL_BTREE_FAM_OID)
378  has_interval_ops = true;
379  ereport(ERROR,
380  (errcode(ERRCODE_INDEX_CORRUPTED),
381  errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe",
382  RelationGetRelationName(indrel)),
383  has_interval_ops
384  ? errhint("This is known of \"interval\" indexes last built on a version predating 2023-11.")
385  : 0));
386  }
387 
388  /* Check index, possibly against table it is an index on */
389  bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
390  heapallindexed, rootdescend, checkunique);
391  }
392 
393  /* Roll back any GUC changes executed by index functions */
394  AtEOXact_GUC(false, save_nestlevel);
395 
396  /* Restore userid and security context */
397  SetUserIdAndSecContext(save_userid, save_sec_context);
398 
399  /*
400  * Release locks early. That's ok here because nothing in the called
401  * routines will trigger shared cache invalidations to be sent, so we can
402  * relax the usual pattern of only releasing locks after commit.
403  */
404  index_close(indrel, lockmode);
405  if (heaprel)
406  table_close(heaprel, lockmode);
407 }
#define OidIsValid(objectId)
Definition: c.h:764
int NewGUCNestLevel(void)
Definition: guc.c:2231
void AtEOXact_GUC(bool isCommit, int nestLevel)
Definition: guc.c:2245
Oid IndexGetRelation(Oid indexId, bool missing_ok)
Definition: index.c:3536
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:158
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:132
int LOCKMODE
Definition: lockdefs.h:26
#define AccessShareLock
Definition: lockdefs.h:36
#define ShareLock
Definition: lockdefs.h:40
#define SECURITY_RESTRICTED_OPERATION
Definition: miscadmin.h:316
void GetUserIdAndSecContext(Oid *userid, int *sec_context)
Definition: miscinit.c:629
void SetUserIdAndSecContext(Oid userid, int sec_context)
Definition: miscinit.c:636
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:2729
#define ERRCODE_UNDEFINED_TABLE
Definition: pgbench.c:78
#define InvalidOid
Definition: postgres_ext.h:36
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:572
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:523
@ MAIN_FORKNUM
Definition: relpath.h:50
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:253
Oid * rd_opfamily
Definition: rel.h:206
Form_pg_class rd_rel
Definition: rel.h:111
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, bool readonly, bool heapallindexed, bool rootdescend, bool checkunique)
static bool btree_index_mainfork_expected(Relation rel)
static void btree_index_checkable(Relation rel)

References _bt_allequalimage(), _bt_metaversion(), AccessShareLock, AtEOXact_GUC(), bt_check_every_level(), btree_index_checkable(), btree_index_mainfork_expected(), ereport, errcode(), ERRCODE_UNDEFINED_TABLE, errhint(), errmsg(), ERROR, GetUserIdAndSecContext(), i, index_close(), index_open(), IndexGetRelation(), IndexRelationGetNumberOfKeyAttributes, InvalidOid, MAIN_FORKNUM, NewGUCNestLevel(), OidIsValid, RelationData::rd_opfamily, RelationData::rd_rel, RelationGetRelationName, RelationGetSmgr(), SECURITY_RESTRICTED_OPERATION, SetUserIdAndSecContext(), ShareLock, smgrexists(), table_close(), and table_open().

Referenced by bt_index_check(), and bt_index_parent_check().

◆ bt_index_parent_check()

Datum bt_index_parent_check ( PG_FUNCTION_ARGS  )

Definition at line 255 of file verify_nbtree.c.

256 {
257  Oid indrelid = PG_GETARG_OID(0);
258  bool heapallindexed = false;
259  bool rootdescend = false;
260  bool checkunique = false;
261 
262  if (PG_NARGS() >= 2)
263  heapallindexed = PG_GETARG_BOOL(1);
264  if (PG_NARGS() >= 3)
265  rootdescend = PG_GETARG_BOOL(2);
266  if (PG_NARGS() == 4)
267  checkunique = PG_GETARG_BOOL(3);
268 
269  bt_index_check_internal(indrelid, true, heapallindexed, rootdescend, checkunique);
270 
271  PG_RETURN_VOID();
272 }

References bt_index_check_internal(), PG_GETARG_BOOL, PG_GETARG_OID, PG_NARGS, and PG_RETURN_VOID.

◆ bt_leftmost_ignoring_half_dead()

static bool bt_leftmost_ignoring_half_dead ( BtreeCheckState state,
BlockNumber  start,
BTPageOpaque  start_opaque 
)
static

Definition at line 1138 of file verify_nbtree.c.

1141 {
1142  BlockNumber reached = start_opaque->btpo_prev,
1143  reached_from = start;
1144  bool all_half_dead = true;
1145 
1146  /*
1147  * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
1148  * potentially observe nbtree/README "Page deletion and backwards scans".
1149  */
1150  Assert(state->readonly);
1151 
1152  while (reached != P_NONE && all_half_dead)
1153  {
1154  Page page = palloc_btree_page(state, reached);
1155  BTPageOpaque reached_opaque = BTPageGetOpaque(page);
1156 
1158 
1159  /*
1160  * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
1161  * writes that side-links will continue to point to the siblings.
1162  * Check btpo_next for that property.
1163  */
1164  all_half_dead = P_ISHALFDEAD(reached_opaque) &&
1165  reached != start &&
1166  reached != reached_from &&
1167  reached_opaque->btpo_next == reached_from;
1168  if (all_half_dead)
1169  {
1170  XLogRecPtr pagelsn = PageGetLSN(page);
1171 
1172  /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
1173  ereport(DEBUG1,
1174  (errcode(ERRCODE_NO_DATA),
1175  errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
1177  errdetail_internal("Block=%u right block=%u page lsn=%X/%X.",
1178  reached, reached_from,
1179  LSN_FORMAT_ARGS(pagelsn))));
1180 
1181  reached_from = reached;
1182  reached = reached_opaque->btpo_prev;
1183  }
1184 
1185  pfree(page);
1186  }
1187 
1188  return all_half_dead;
1189 }

References Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, CHECK_FOR_INTERRUPTS, DEBUG1, ereport, errcode(), errdetail_internal(), errmsg_internal(), LSN_FORMAT_ARGS, P_ISHALFDEAD, P_NONE, PageGetLSN(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_check_level_from_leftmost(), and bt_child_highkey_check().

◆ bt_mkscankey_pivotsearch()

static BTScanInsert bt_mkscankey_pivotsearch ( Relation  rel,
IndexTuple  itup 
)
inlinestatic

Definition at line 3522 of file verify_nbtree.c.

3523 {
3524  BTScanInsert skey;
3525 
3526  skey = _bt_mkscankey(rel, itup);
3527  skey->backward = true;
3528 
3529  return skey;
3530 }
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:82

References _bt_mkscankey(), and BTScanInsertData::backward.

Referenced by bt_right_page_check_scankey(), and bt_target_page_check().

◆ bt_normalize_tuple()

static IndexTuple bt_normalize_tuple ( BtreeCheckState state,
IndexTuple  itup 
)
static

Definition at line 2940 of file verify_nbtree.c.

2941 {
2942  TupleDesc tupleDescriptor = RelationGetDescr(state->rel);
2943  Datum normalized[INDEX_MAX_KEYS];
2944  bool isnull[INDEX_MAX_KEYS];
2945  bool toast_free[INDEX_MAX_KEYS];
2946  bool formnewtup = false;
2947  IndexTuple reformed;
2948  int i;
2949 
2950  /* Caller should only pass "logical" non-pivot tuples here */
2951  Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup));
2952 
2953  /* Easy case: It's immediately clear that tuple has no varlena datums */
2954  if (!IndexTupleHasVarwidths(itup))
2955  return itup;
2956 
2957  for (i = 0; i < tupleDescriptor->natts; i++)
2958  {
2959  Form_pg_attribute att;
2960 
2961  att = TupleDescAttr(tupleDescriptor, i);
2962 
2963  /* Assume untoasted/already normalized datum initially */
2964  toast_free[i] = false;
2965  normalized[i] = index_getattr(itup, att->attnum,
2966  tupleDescriptor,
2967  &isnull[i]);
2968  if (att->attbyval || att->attlen != -1 || isnull[i])
2969  continue;
2970 
2971  /*
2972  * Callers always pass a tuple that could safely be inserted into the
2973  * index without further processing, so an external varlena header
2974  * should never be encountered here
2975  */
2976  if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))
2977  ereport(ERROR,
2978  (errcode(ERRCODE_INDEX_CORRUPTED),
2979  errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
2980  ItemPointerGetBlockNumber(&(itup->t_tid)),
2982  RelationGetRelationName(state->rel))));
2983  else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))
2984  {
2985  formnewtup = true;
2986  normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));
2987  toast_free[i] = true;
2988  }
2989  }
2990 
2991  /* Easier case: Tuple has varlena datums, none of which are compressed */
2992  if (!formnewtup)
2993  return itup;
2994 
2995  /*
2996  * Hard case: Tuple had compressed varlena datums that necessitate
2997  * creating normalized version of the tuple from uncompressed input datums
2998  * (normalized input datums). This is rather naive, but shouldn't be
2999  * necessary too often.
3000  *
3001  * Note that we rely on deterministic index_form_tuple() TOAST compression
3002  * of normalized input.
3003  */
3004  reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
3005  reformed->t_tid = itup->t_tid;
3006 
3007  /* Cannot leak memory here */
3008  for (i = 0; i < tupleDescriptor->natts; i++)
3009  if (toast_free[i])
3010  pfree(DatumGetPointer(normalized[i]));
3011 
3012  return reformed;
3013 }
#define PG_DETOAST_DATUM(datum)
Definition: fmgr.h:240
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
#define IndexTupleHasVarwidths(itup)
Definition: itup.h:72
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:209
#define INDEX_MAX_KEYS
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define RelationGetDescr(relation)
Definition: rel.h:530
ItemPointerData t_tid
Definition: itup.h:37
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
#define VARATT_IS_COMPRESSED(PTR)
Definition: varatt.h:288
#define VARATT_IS_EXTERNAL(PTR)
Definition: varatt.h:289

References Assert(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), DatumGetPointer(), ereport, errcode(), errmsg(), ERROR, i, index_form_tuple(), index_getattr(), INDEX_MAX_KEYS, IndexTupleHasVarwidths, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), TupleDescData::natts, pfree(), PG_DETOAST_DATUM, PointerGetDatum(), RelationGetDescr, RelationGetRelationName, IndexTupleData::t_tid, TupleDescAttr, VARATT_IS_COMPRESSED, and VARATT_IS_EXTERNAL.

Referenced by bt_target_page_check(), and bt_tuple_present_callback().

◆ bt_pivot_tuple_identical()

static bool bt_pivot_tuple_identical ( bool  heapkeyspace,
IndexTuple  itup1,
IndexTuple  itup2 
)
static

Definition at line 2164 of file verify_nbtree.c.

2165 {
2166  if (IndexTupleSize(itup1) != IndexTupleSize(itup2))
2167  return false;
2168 
2169  if (heapkeyspace)
2170  {
2171  /*
2172  * Offset number will contain important information in heapkeyspace
2173  * indexes: the number of attributes left in the pivot tuple following
2174  * suffix truncation. Don't skip over it (compare it too).
2175  */
2176  if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid,
2177  IndexTupleSize(itup1) -
2178  offsetof(ItemPointerData, ip_posid)) != 0)
2179  return false;
2180  }
2181  else
2182  {
2183  /*
2184  * Cannot rely on offset number field having consistent value across
2185  * levels on pg_upgrade'd !heapkeyspace indexes. Compare contents of
2186  * tuple starting from just after item pointer (i.e. after block
2187  * number and offset number).
2188  */
2189  if (memcmp(&itup1->t_info, &itup2->t_info,
2190  IndexTupleSize(itup1) -
2191  offsetof(IndexTupleData, t_info)) != 0)
2192  return false;
2193  }
2194 
2195  return true;
2196 }
unsigned short t_info
Definition: itup.h:49
OffsetNumber ip_posid
Definition: itemptr.h:39

References IndexTupleSize, ItemPointerData::ip_posid, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by bt_child_highkey_check().

◆ bt_posting_plain_tuple()

static IndexTuple bt_posting_plain_tuple ( IndexTuple  itup,
int  n 
)
inlinestatic

Definition at line 3030 of file verify_nbtree.c.

3031 {
3032  Assert(BTreeTupleIsPosting(itup));
3033 
3034  /* Returns non-posting-list tuple */
3035  return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1);
3036 }
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864

References _bt_form_posting(), Assert(), BTreeTupleGetPostingN(), and BTreeTupleIsPosting().

Referenced by bt_target_page_check().

◆ bt_recheck_sibling_links()

static void bt_recheck_sibling_links ( BtreeCheckState state,
BlockNumber  btpo_prev_from_target,
BlockNumber  leftcurrent 
)
static

Definition at line 1227 of file verify_nbtree.c.

1230 {
1231  /* passing metapage to BTPageGetOpaque() would give irrelevant findings */
1232  Assert(leftcurrent != P_NONE);
1233 
1234  if (!state->readonly)
1235  {
1236  Buffer lbuf;
1237  Buffer newtargetbuf;
1238  Page page;
1239  BTPageOpaque opaque;
1240  BlockNumber newtargetblock;
1241 
1242  /* Couple locks in the usual order for nbtree: Left to right */
1243  lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent,
1244  RBM_NORMAL, state->checkstrategy);
1245  LockBuffer(lbuf, BT_READ);
1246  _bt_checkpage(state->rel, lbuf);
1247  page = BufferGetPage(lbuf);
1248  opaque = BTPageGetOpaque(page);
1249  if (P_ISDELETED(opaque))
1250  {
1251  /*
1252  * Cannot reason about concurrently deleted page -- the left link
1253  * in the page to the right is expected to point to some other
1254  * page to the left (not leftcurrent page).
1255  *
1256  * Note that we deliberately don't give up with a half-dead page.
1257  */
1258  UnlockReleaseBuffer(lbuf);
1259  return;
1260  }
1261 
1262  newtargetblock = opaque->btpo_next;
1263  /* Avoid self-deadlock when newtargetblock == leftcurrent */
1264  if (newtargetblock != leftcurrent)
1265  {
1266  newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM,
1267  newtargetblock, RBM_NORMAL,
1268  state->checkstrategy);
1269  LockBuffer(newtargetbuf, BT_READ);
1270  _bt_checkpage(state->rel, newtargetbuf);
1271  page = BufferGetPage(newtargetbuf);
1272  opaque = BTPageGetOpaque(page);
1273  /* btpo_prev_from_target may have changed; update it */
1274  btpo_prev_from_target = opaque->btpo_prev;
1275  }
1276  else
1277  {
1278  /*
1279  * leftcurrent right sibling points back to leftcurrent block.
1280  * Index is corrupt. Easiest way to handle this is to pretend
1281  * that we actually read from a distinct page that has an invalid
1282  * block number in its btpo_prev.
1283  */
1284  newtargetbuf = InvalidBuffer;
1285  btpo_prev_from_target = InvalidBlockNumber;
1286  }
1287 
1288  /*
1289  * No need to check P_ISDELETED here, since new target block cannot be
1290  * marked deleted as long as we hold a lock on lbuf
1291  */
1292  if (BufferIsValid(newtargetbuf))
1293  UnlockReleaseBuffer(newtargetbuf);
1294  UnlockReleaseBuffer(lbuf);
1295 
1296  if (btpo_prev_from_target == leftcurrent)
1297  {
1298  /* Report split in left sibling, not target (or new target) */
1299  ereport(DEBUG1,
1300  (errcode(ERRCODE_INTERNAL_ERROR),
1301  errmsg_internal("harmless concurrent page split detected in index \"%s\"",
1303  errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.",
1304  leftcurrent, newtargetblock,
1305  state->targetblock)));
1306  return;
1307  }
1308 
1309  /*
1310  * Index is corrupt. Make sure that we report correct target page.
1311  *
1312  * This could have changed in cases where there was a concurrent page
1313  * split, as well as index corruption (at least in theory). Note that
1314  * btpo_prev_from_target was already updated above.
1315  */
1316  state->targetblock = newtargetblock;
1317  }
1318 
1319  ereport(ERROR,
1320  (errcode(ERRCODE_INDEX_CORRUPTED),
1321  errmsg("left link/right link pair in index \"%s\" not in agreement",
1323  errdetail_internal("Block=%u left block=%u left link from block=%u.",
1324  state->targetblock, leftcurrent,
1325  btpo_prev_from_target)));
1326 }
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4590
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4808
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:782
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
@ RBM_NORMAL
Definition: bufmgr.h:44
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
#define BT_READ
Definition: nbtree.h:719

References _bt_checkpage(), Assert(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetPage(), BufferIsValid(), DEBUG1, ereport, errcode(), errdetail_internal(), errmsg(), errmsg_internal(), ERROR, InvalidBlockNumber, InvalidBuffer, LockBuffer(), MAIN_FORKNUM, P_ISDELETED, P_NONE, RBM_NORMAL, ReadBufferExtended(), RelationGetRelationName, and UnlockReleaseBuffer().

Referenced by bt_check_level_from_leftmost().

◆ bt_report_duplicate()

static void bt_report_duplicate ( BtreeCheckState state,
ItemPointer  tid,
BlockNumber  block,
OffsetNumber  offset,
int  posting,
ItemPointer  nexttid,
BlockNumber  nblock,
OffsetNumber  noffset,
int  nposting 
)
static

Definition at line 997 of file verify_nbtree.c.

1002 {
1003  char *htid,
1004  *nhtid,
1005  *itid,
1006  *nitid = "",
1007  *pposting = "",
1008  *pnposting = "";
1009 
1010  htid = psprintf("tid=(%u,%u)",
1013  nhtid = psprintf("tid=(%u,%u)",
1016  itid = psprintf("tid=(%u,%u)", block, offset);
1017 
1018  if (nblock != block || noffset != offset)
1019  nitid = psprintf(" tid=(%u,%u)", nblock, noffset);
1020 
1021  if (posting >= 0)
1022  pposting = psprintf(" posting %u", posting);
1023 
1024  if (nposting >= 0)
1025  pnposting = psprintf(" posting %u", nposting);
1026 
1027  ereport(ERROR,
1028  (errcode(ERRCODE_INDEX_CORRUPTED),
1029  errmsg("index uniqueness is violated for index \"%s\"",
1031  errdetail("Index %s%s and%s%s (point to heap %s and %s) page lsn=%X/%X.",
1032  itid, pposting, nitid, pnposting, htid, nhtid,
1033  LSN_FORMAT_ARGS(state->targetlsn))));
1034 }

References ereport, errcode(), errdetail(), errmsg(), ERROR, ItemPointerGetBlockNumberNoCheck(), ItemPointerGetOffsetNumberNoCheck(), LSN_FORMAT_ARGS, psprintf(), and RelationGetRelationName.

Referenced by bt_entry_unique_check().

◆ bt_right_page_check_scankey()

static BTScanInsert bt_right_page_check_scankey ( BtreeCheckState state,
OffsetNumber rightfirstoffset 
)
static

Definition at line 1957 of file verify_nbtree.c.

1958 {
1959  BTPageOpaque opaque;
1960  ItemId rightitem;
1961  IndexTuple firstitup;
1962  BlockNumber targetnext;
1963  Page rightpage;
1964  OffsetNumber nline;
1965 
1966  /* Determine target's next block number */
1967  opaque = BTPageGetOpaque(state->target);
1968 
1969  /* If target is already rightmost, no right sibling; nothing to do here */
1970  if (P_RIGHTMOST(opaque))
1971  return NULL;
1972 
1973  /*
1974  * General notes on concurrent page splits and page deletion:
1975  *
1976  * Routines like _bt_search() don't require *any* page split interlock
1977  * when descending the tree, including something very light like a buffer
1978  * pin. That's why it's okay that we don't either. This avoidance of any
1979  * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao
1980  * algorithm, in fact.
1981  *
1982  * That leaves deletion. A deleted page won't actually be recycled by
1983  * VACUUM early enough for us to fail to at least follow its right link
1984  * (or left link, or downlink) and find its sibling, because recycling
1985  * does not occur until no possible index scan could land on the page.
1986  * Index scans can follow links with nothing more than their snapshot as
1987  * an interlock and be sure of at least that much. (See page
1988  * recycling/"visible to everyone" notes in nbtree README.)
1989  *
1990  * Furthermore, it's okay if we follow a rightlink and find a half-dead or
1991  * dead (ignorable) page one or more times. There will either be a
1992  * further right link to follow that leads to a live page before too long
1993  * (before passing by parent's rightmost child), or we will find the end
1994  * of the entire level instead (possible when parent page is itself the
1995  * rightmost on its level).
1996  */
1997  targetnext = opaque->btpo_next;
1998  for (;;)
1999  {
2001 
2002  rightpage = palloc_btree_page(state, targetnext);
2003  opaque = BTPageGetOpaque(rightpage);
2004 
2005  if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
2006  break;
2007 
2008  /*
2009  * We landed on a deleted or half-dead sibling page. Step right until
2010  * we locate a live sibling page.
2011  */
2012  ereport(DEBUG2,
2013  (errcode(ERRCODE_NO_DATA),
2014  errmsg_internal("level %u sibling page in block %u of index \"%s\" was found deleted or half dead",
2015  opaque->btpo_level, targetnext, RelationGetRelationName(state->rel)),
2016  errdetail_internal("Deleted page found when building scankey from right sibling.")));
2017 
2018  targetnext = opaque->btpo_next;
2019 
2020  /* Be slightly more pro-active in freeing this memory, just in case */
2021  pfree(rightpage);
2022  }
2023 
2024  /*
2025  * No ShareLock held case -- why it's safe to proceed.
2026  *
2027  * Problem:
2028  *
2029  * We must avoid false positive reports of corruption when caller treats
2030  * item returned here as an upper bound on target's last item. In
2031  * general, false positives are disallowed. Avoiding them here when
2032  * caller is !readonly is subtle.
2033  *
2034  * A concurrent page deletion by VACUUM of the target page can result in
2035  * the insertion of items on to this right sibling page that would
2036  * previously have been inserted on our target page. There might have
2037  * been insertions that followed the target's downlink after it was made
2038  * to point to right sibling instead of target by page deletion's first
2039  * phase. The inserters insert items that would belong on target page.
2040  * This race is very tight, but it's possible. This is our only problem.
2041  *
2042  * Non-problems:
2043  *
2044  * We are not hindered by a concurrent page split of the target; we'll
2045  * never land on the second half of the page anyway. A concurrent split
2046  * of the right page will also not matter, because the first data item
2047  * remains the same within the left half, which we'll reliably land on. If
2048  * we had to skip over ignorable/deleted pages, it cannot matter because
2049  * their key space has already been atomically merged with the first
2050  * non-ignorable page we eventually find (doesn't matter whether the page
2051  * we eventually find is a true sibling or a cousin of target, which we go
2052  * into below).
2053  *
2054  * Solution:
2055  *
2056  * Caller knows that it should reverify that target is not ignorable
2057  * (half-dead or deleted) when cross-page sibling item comparison appears
2058  * to indicate corruption (invariant fails). This detects the single race
2059  * condition that exists for caller. This is correct because the
2060  * continued existence of target block as non-ignorable (not half-dead or
2061  * deleted) implies that target page was not merged into from the right by
2062  * deletion; the key space at or after target never moved left. Target's
2063  * parent either has the same downlink to target as before, or a <
2064  * downlink due to deletion at the left of target. Target either has the
2065  * same highkey as before, or a highkey < before when there is a page
2066  * split. (The rightmost concurrently-split-from-target-page page will
2067  * still have the same highkey as target was originally found to have,
2068  * which for our purposes is equivalent to target's highkey itself never
2069  * changing, since we reliably skip over
2070  * concurrently-split-from-target-page pages.)
2071  *
2072  * In simpler terms, we allow that the key space of the target may expand
2073  * left (the key space can move left on the left side of target only), but
2074  * the target key space cannot expand right and get ahead of us without
2075  * our detecting it. The key space of the target cannot shrink, unless it
2076  * shrinks to zero due to the deletion of the original page, our canary
2077  * condition. (To be very precise, we're a bit stricter than that because
2078  * it might just have been that the target page split and only the
2079  * original target page was deleted. We can be more strict, just not more
2080  * lax.)
2081  *
2082  * Top level tree walk caller moves on to next page (makes it the new
2083  * target) following recovery from this race. (cf. The rationale for
2084  * child/downlink verification needing a ShareLock within
2085  * bt_child_check(), where page deletion is also the main source of
2086  * trouble.)
2087  *
2088  * Note that it doesn't matter if right sibling page here is actually a
2089  * cousin page, because in order for the key space to be readjusted in a
2090  * way that causes us issues in next level up (guiding problematic
2091  * concurrent insertions to the cousin from the grandparent rather than to
2092  * the sibling from the parent), there'd have to be page deletion of
2093  * target's parent page (affecting target's parent's downlink in target's
2094  * grandparent page). Internal page deletion only occurs when there are
2095  * no child pages (they were all fully deleted), and caller is checking
2096  * that the target's parent has at least one non-deleted (so
2097  * non-ignorable) child: the target page. (Note that the first phase of
2098  * deletion atomically marks the page to be deleted half-dead/ignorable at
2099  * the same time downlink in its parent is removed, so caller will
2100  * definitely not fail to detect that this happened.)
2101  *
2102  * This trick is inspired by the method backward scans use for dealing
2103  * with concurrent page splits; concurrent page deletion is a problem that
2104  * similarly receives special consideration sometimes (it's possible that
2105  * the backwards scan will re-read its "original" block after failing to
2106  * find a right-link to it, having already moved in the opposite direction
2107  * (right/"forwards") a few times to try to locate one). Just like us,
2108  * that happens only to determine if there was a concurrent page deletion
2109  * of a reference page, and just like us if there was a page deletion of
2110  * that reference page it means we can move on from caring about the
2111  * reference page. See the nbtree README for a full description of how
2112  * that works.
2113  */
2114  nline = PageGetMaxOffsetNumber(rightpage);
2115 
2116  /*
2117  * Get first data item, if any
2118  */
2119  if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
2120  {
2121  /* Return first data item (if any) */
2122  rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
2123  P_FIRSTDATAKEY(opaque));
2124  *rightfirstoffset = P_FIRSTDATAKEY(opaque);
2125  }
2126  else if (!P_ISLEAF(opaque) &&
2127  nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
2128  {
2129  /*
2130  * Return first item after the internal page's "negative infinity"
2131  * item
2132  */
2133  rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
2134  OffsetNumberNext(P_FIRSTDATAKEY(opaque)));
2135  }
2136  else
2137  {
2138  /*
2139  * No first item. Page is probably empty leaf page, but it's also
2140  * possible that it's an internal page with only a negative infinity
2141  * item.
2142  */
2143  ereport(DEBUG2,
2144  (errcode(ERRCODE_NO_DATA),
2145  errmsg_internal("%s block %u of index \"%s\" has no first data item",
2146  P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
2147  RelationGetRelationName(state->rel))));
2148  return NULL;
2149  }
2150 
2151  /*
2152  * Return first real item scankey. Note that this relies on right page
2153  * memory remaining allocated.
2154  */
2155  firstitup = (IndexTuple) PageGetItem(rightpage, rightitem);
2156  return bt_mkscankey_pivotsearch(state->rel, firstitup);
2157 }
#define DEBUG2
Definition: elog.h:29
static BTScanInsert bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)

References bt_mkscankey_pivotsearch(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, CHECK_FOR_INTERRUPTS, DEBUG2, ereport, errcode(), errdetail_internal(), errmsg_internal(), OffsetNumberNext, P_FIRSTDATAKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_target_page_check().

◆ bt_rootdescend()

static bool bt_rootdescend ( BtreeCheckState state,
IndexTuple  itup 
)
static

Definition at line 3063 of file verify_nbtree.c.

3064 {
3065  BTScanInsert key;
3066  BTStack stack;
3067  Buffer lbuf;
3068  bool exists;
3069 
3070  key = _bt_mkscankey(state->rel, itup);
3071  Assert(key->heapkeyspace && key->scantid != NULL);
3072 
3073  /*
3074  * Search from root.
3075  *
3076  * Ideally, we would arrange to only move right within _bt_search() when
3077  * an interrupted page split is detected (i.e. when the incomplete split
3078  * bit is found to be set), but for now we accept the possibility that
3079  * that could conceal an inconsistency.
3080  */
3081  Assert(state->readonly && state->rootdescend);
3082  exists = false;
3083  stack = _bt_search(state->rel, NULL, key, &lbuf, BT_READ);
3084 
3085  if (BufferIsValid(lbuf))
3086  {
3087  BTInsertStateData insertstate;
3088  OffsetNumber offnum;
3089  Page page;
3090 
3091  insertstate.itup = itup;
3092  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
3093  insertstate.itup_key = key;
3094  insertstate.postingoff = 0;
3095  insertstate.bounds_valid = false;
3096  insertstate.buf = lbuf;
3097 
3098  /* Get matching tuple on leaf page */
3099  offnum = _bt_binsrch_insert(state->rel, &insertstate);
3100  /* Compare first >= matching item on leaf page, if any */
3101  page = BufferGetPage(lbuf);
3102  /* Should match on first heap TID when tuple has a posting list */
3103  if (offnum <= PageGetMaxOffsetNumber(page) &&
3104  insertstate.postingoff <= 0 &&
3105  _bt_compare(state->rel, key, page, offnum) == 0)
3106  exists = true;
3107  _bt_relbuf(state->rel, lbuf);
3108  }
3109 
3110  _bt_freestack(stack);
3111  pfree(key);
3112 
3113  return exists;
3114 }
#define MAXALIGN(LEN)
Definition: c.h:800
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:96
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:468
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:682
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:174
bool bounds_valid
Definition: nbtree.h:823
IndexTuple itup
Definition: nbtree.h:811
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_insert(), _bt_compare(), _bt_freestack(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), Assert(), BTInsertStateData::bounds_valid, BT_READ, BTInsertStateData::buf, BufferGetPage(), BufferIsValid(), IndexTupleSize, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, sort-test::key, MAXALIGN, PageGetMaxOffsetNumber(), pfree(), and BTInsertStateData::postingoff.

Referenced by bt_target_page_check().

◆ bt_target_page_check()

static void bt_target_page_check ( BtreeCheckState state)
static

Definition at line 1367 of file verify_nbtree.c.

1368 {
1369  OffsetNumber offset;
1370  OffsetNumber max;
1371  BTPageOpaque topaque;
1372 
1373  /* last visible entry info for checking indexes with unique constraint */
1374  int lVis_i = -1; /* the position of last visible item for
1375  * posting tuple. for non-posting tuple (-1) */
1376  ItemPointer lVis_tid = NULL;
1377  BlockNumber lVis_block = InvalidBlockNumber;
1378  OffsetNumber lVis_offset = InvalidOffsetNumber;
1379 
1380  topaque = BTPageGetOpaque(state->target);
1381  max = PageGetMaxOffsetNumber(state->target);
1382 
1383  elog(DEBUG2, "verifying %u items on %s block %u", max,
1384  P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);
1385 
1386  /*
1387  * Check the number of attributes in high key. Note, rightmost page
1388  * doesn't contain a high key, so nothing to check
1389  */
1390  if (!P_RIGHTMOST(topaque))
1391  {
1392  ItemId itemid;
1393  IndexTuple itup;
1394 
1395  /* Verify line pointer before checking tuple */
1396  itemid = PageGetItemIdCareful(state, state->targetblock,
1397  state->target, P_HIKEY);
1398  if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
1399  P_HIKEY))
1400  {
1401  itup = (IndexTuple) PageGetItem(state->target, itemid);
1402  ereport(ERROR,
1403  (errcode(ERRCODE_INDEX_CORRUPTED),
1404  errmsg("wrong number of high key index tuple attributes in index \"%s\"",
1406  errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.",
1407  state->targetblock,
1408  BTreeTupleGetNAtts(itup, state->rel),
1409  P_ISLEAF(topaque) ? "heap" : "index",
1410  LSN_FORMAT_ARGS(state->targetlsn))));
1411  }
1412  }
1413 
1414  /*
1415  * Loop over page items, starting from first non-highkey item, not high
1416  * key (if any). Most tests are not performed for the "negative infinity"
1417  * real item (if any).
1418  */
1419  for (offset = P_FIRSTDATAKEY(topaque);
1420  offset <= max;
1421  offset = OffsetNumberNext(offset))
1422  {
1423  ItemId itemid;
1424  IndexTuple itup;
1425  size_t tupsize;
1426  BTScanInsert skey;
1427  bool lowersizelimit;
1428  ItemPointer scantid;
1429 
1431 
1432  itemid = PageGetItemIdCareful(state, state->targetblock,
1433  state->target, offset);
1434  itup = (IndexTuple) PageGetItem(state->target, itemid);
1435  tupsize = IndexTupleSize(itup);
1436 
1437  /*
1438  * lp_len should match the IndexTuple reported length exactly, since
1439  * lp_len is completely redundant in indexes, and both sources of
1440  * tuple length are MAXALIGN()'d. nbtree does not use lp_len all that
1441  * frequently, and is surprisingly tolerant of corrupt lp_len fields.
1442  */
1443  if (tupsize != ItemIdGetLength(itemid))
1444  ereport(ERROR,
1445  (errcode(ERRCODE_INDEX_CORRUPTED),
1446  errmsg("index tuple size does not equal lp_len in index \"%s\"",
1448  errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.",
1449  state->targetblock, offset,
1450  tupsize, ItemIdGetLength(itemid),
1451  LSN_FORMAT_ARGS(state->targetlsn)),
1452  errhint("This could be a torn page problem.")));
1453 
1454  /* Check the number of index tuple attributes */
1455  if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
1456  offset))
1457  {
1458  ItemPointer tid;
1459  char *itid,
1460  *htid;
1461 
1462  itid = psprintf("(%u,%u)", state->targetblock, offset);
1463  tid = BTreeTupleGetPointsToTID(itup);
1464  htid = psprintf("(%u,%u)",
1467 
1468  ereport(ERROR,
1469  (errcode(ERRCODE_INDEX_CORRUPTED),
1470  errmsg("wrong number of index tuple attributes in index \"%s\"",
1472  errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
1473  itid,
1474  BTreeTupleGetNAtts(itup, state->rel),
1475  P_ISLEAF(topaque) ? "heap" : "index",
1476  htid,
1477  LSN_FORMAT_ARGS(state->targetlsn))));
1478  }
1479 
1480  /*
1481  * Don't try to generate scankey using "negative infinity" item on
1482  * internal pages. They are always truncated to zero attributes.
1483  */
1484  if (offset_is_negative_infinity(topaque, offset))
1485  {
1486  /*
1487  * We don't call bt_child_check() for "negative infinity" items.
1488  * But if we're performing downlink connectivity check, we do it
1489  * for every item including "negative infinity" one.
1490  */
1491  if (!P_ISLEAF(topaque) && state->readonly)
1492  {
1494  offset,
1495  NULL,
1496  topaque->btpo_level);
1497  }
1498  continue;
1499  }
1500 
1501  /*
1502  * Readonly callers may optionally verify that non-pivot tuples can
1503  * each be found by an independent search that starts from the root.
1504  * Note that we deliberately don't do individual searches for each
1505  * TID, since the posting list itself is validated by other checks.
1506  */
1507  if (state->rootdescend && P_ISLEAF(topaque) &&
1508  !bt_rootdescend(state, itup))
1509  {
1511  char *itid,
1512  *htid;
1513 
1514  itid = psprintf("(%u,%u)", state->targetblock, offset);
1515  htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid),
1517 
1518  ereport(ERROR,
1519  (errcode(ERRCODE_INDEX_CORRUPTED),
1520  errmsg("could not find tuple using search from root page in index \"%s\"",
1522  errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
1523  itid, htid,
1524  LSN_FORMAT_ARGS(state->targetlsn))));
1525  }
1526 
1527  /*
1528  * If tuple is a posting list tuple, make sure posting list TIDs are
1529  * in order
1530  */
1531  if (BTreeTupleIsPosting(itup))
1532  {
1533  ItemPointerData last;
1534  ItemPointer current;
1535 
1536  ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
1537 
1538  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1539  {
1540 
1541  current = BTreeTupleGetPostingN(itup, i);
1542 
1543  if (ItemPointerCompare(current, &last) <= 0)
1544  {
1545  char *itid = psprintf("(%u,%u)", state->targetblock, offset);
1546 
1547  ereport(ERROR,
1548  (errcode(ERRCODE_INDEX_CORRUPTED),
1549  errmsg_internal("posting list contains misplaced TID in index \"%s\"",
1551  errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.",
1552  itid, i,
1553  LSN_FORMAT_ARGS(state->targetlsn))));
1554  }
1555 
1556  ItemPointerCopy(current, &last);
1557  }
1558  }
1559 
1560  /* Build insertion scankey for current page offset */
1561  skey = bt_mkscankey_pivotsearch(state->rel, itup);
1562 
1563  /*
1564  * Make sure tuple size does not exceed the relevant BTREE_VERSION
1565  * specific limit.
1566  *
1567  * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned
1568  * a small amount of space from BTMaxItemSize() in order to ensure
1569  * that suffix truncation always has enough space to add an explicit
1570  * heap TID back to a tuple -- we pessimistically assume that every
1571  * newly inserted tuple will eventually need to have a heap TID
1572  * appended during a future leaf page split, when the tuple becomes
1573  * the basis of the new high key (pivot tuple) for the leaf page.
1574  *
1575  * Since the reclaimed space is reserved for that purpose, we must not
1576  * enforce the slightly lower limit when the extra space has been used
1577  * as intended. In other words, there is only a cross-version
1578  * difference in the limit on tuple size within leaf pages.
1579  *
1580  * Still, we're particular about the details within BTREE_VERSION 4
1581  * internal pages. Pivot tuples may only use the extra space for its
1582  * designated purpose. Enforce the lower limit for pivot tuples when
1583  * an explicit heap TID isn't actually present. (In all other cases
1584  * suffix truncation is guaranteed to generate a pivot tuple that's no
1585  * larger than the firstright tuple provided to it by its caller.)
1586  */
1587  lowersizelimit = skey->heapkeyspace &&
1588  (P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL);
1589  if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
1590  BTMaxItemSizeNoHeapTid(state->target)))
1591  {
1593  char *itid,
1594  *htid;
1595 
1596  itid = psprintf("(%u,%u)", state->targetblock, offset);
1597  htid = psprintf("(%u,%u)",
1600 
1601  ereport(ERROR,
1602  (errcode(ERRCODE_INDEX_CORRUPTED),
1603  errmsg("index row size %zu exceeds maximum for index \"%s\"",
1604  tupsize, RelationGetRelationName(state->rel)),
1605  errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
1606  itid,
1607  P_ISLEAF(topaque) ? "heap" : "index",
1608  htid,
1609  LSN_FORMAT_ARGS(state->targetlsn))));
1610  }
1611 
1612  /* Fingerprint leaf page tuples (those that point to the heap) */
1613  if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
1614  {
1615  IndexTuple norm;
1616 
1617  if (BTreeTupleIsPosting(itup))
1618  {
1619  /* Fingerprint all elements as distinct "plain" tuples */
1620  for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
1621  {
1622  IndexTuple logtuple;
1623 
1624  logtuple = bt_posting_plain_tuple(itup, i);
1625  norm = bt_normalize_tuple(state, logtuple);
1626  bloom_add_element(state->filter, (unsigned char *) norm,
1627  IndexTupleSize(norm));
1628  /* Be tidy */
1629  if (norm != logtuple)
1630  pfree(norm);
1631  pfree(logtuple);
1632  }
1633  }
1634  else
1635  {
1636  norm = bt_normalize_tuple(state, itup);
1637  bloom_add_element(state->filter, (unsigned char *) norm,
1638  IndexTupleSize(norm));
1639  /* Be tidy */
1640  if (norm != itup)
1641  pfree(norm);
1642  }
1643  }
1644 
1645  /*
1646  * * High key check *
1647  *
1648  * If there is a high key (if this is not the rightmost page on its
1649  * entire level), check that high key actually is upper bound on all
1650  * page items. If this is a posting list tuple, we'll need to set
1651  * scantid to be highest TID in posting list.
1652  *
1653  * We prefer to check all items against high key rather than checking
1654  * just the last and trusting that the operator class obeys the
1655  * transitive law (which implies that all previous items also
1656  * respected the high key invariant if they pass the item order
1657  * check).
1658  *
1659  * Ideally, we'd compare every item in the index against every other
1660  * item in the index, and not trust opclass obedience of the
1661  * transitive law to bridge the gap between children and their
1662  * grandparents (as well as great-grandparents, and so on). We don't
1663  * go to those lengths because that would be prohibitively expensive,
1664  * and probably not markedly more effective in practice.
1665  *
1666  * On the leaf level, we check that the key is <= the highkey.
1667  * However, on non-leaf levels we check that the key is < the highkey,
1668  * because the high key is "just another separator" rather than a copy
1669  * of some existing key item; we expect it to be unique among all keys
1670  * on the same level. (Suffix truncation will sometimes produce a
1671  * leaf highkey that is an untruncated copy of the lastleft item, but
1672  * never any other item, which necessitates weakening the leaf level
1673  * check to <=.)
1674  *
1675  * Full explanation for why a highkey is never truly a copy of another
1676  * item from the same level on internal levels:
1677  *
1678  * While the new left page's high key is copied from the first offset
1679  * on the right page during an internal page split, that's not the
1680  * full story. In effect, internal pages are split in the middle of
1681  * the firstright tuple, not between the would-be lastleft and
1682  * firstright tuples: the firstright key ends up on the left side as
1683  * left's new highkey, and the firstright downlink ends up on the
1684  * right side as right's new "negative infinity" item. The negative
1685  * infinity tuple is truncated to zero attributes, so we're only left
1686  * with the downlink. In other words, the copying is just an
1687  * implementation detail of splitting in the middle of a (pivot)
1688  * tuple. (See also: "Notes About Data Representation" in the nbtree
1689  * README.)
1690  */
1691  scantid = skey->scantid;
1692  if (state->heapkeyspace && BTreeTupleIsPosting(itup))
1693  skey->scantid = BTreeTupleGetMaxHeapTID(itup);
1694 
1695  if (!P_RIGHTMOST(topaque) &&
1696  !(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
1697  invariant_l_offset(state, skey, P_HIKEY)))
1698  {
1700  char *itid,
1701  *htid;
1702 
1703  itid = psprintf("(%u,%u)", state->targetblock, offset);
1704  htid = psprintf("(%u,%u)",
1707 
1708  ereport(ERROR,
1709  (errcode(ERRCODE_INDEX_CORRUPTED),
1710  errmsg("high key invariant violated for index \"%s\"",
1712  errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
1713  itid,
1714  P_ISLEAF(topaque) ? "heap" : "index",
1715  htid,
1716  LSN_FORMAT_ARGS(state->targetlsn))));
1717  }
1718  /* Reset, in case scantid was set to (itup) posting tuple's max TID */
1719  skey->scantid = scantid;
1720 
1721  /*
1722  * * Item order check *
1723  *
1724  * Check that items are stored on page in logical order, by checking
1725  * current item is strictly less than next item (if any).
1726  */
1727  if (OffsetNumberNext(offset) <= max &&
1728  !invariant_l_offset(state, skey, OffsetNumberNext(offset)))
1729  {
1730  ItemPointer tid;
1731  char *itid,
1732  *htid,
1733  *nitid,
1734  *nhtid;
1735 
1736  itid = psprintf("(%u,%u)", state->targetblock, offset);
1737  tid = BTreeTupleGetPointsToTID(itup);
1738  htid = psprintf("(%u,%u)",
1741  nitid = psprintf("(%u,%u)", state->targetblock,
1742  OffsetNumberNext(offset));
1743 
1744  /* Reuse itup to get pointed-to heap location of second item */
1745  itemid = PageGetItemIdCareful(state, state->targetblock,
1746  state->target,
1747  OffsetNumberNext(offset));
1748  itup = (IndexTuple) PageGetItem(state->target, itemid);
1749  tid = BTreeTupleGetPointsToTID(itup);
1750  nhtid = psprintf("(%u,%u)",
1753 
1754  ereport(ERROR,
1755  (errcode(ERRCODE_INDEX_CORRUPTED),
1756  errmsg("item order invariant violated for index \"%s\"",
1758  errdetail_internal("Lower index tid=%s (points to %s tid=%s) "
1759  "higher index tid=%s (points to %s tid=%s) "
1760  "page lsn=%X/%X.",
1761  itid,
1762  P_ISLEAF(topaque) ? "heap" : "index",
1763  htid,
1764  nitid,
1765  P_ISLEAF(topaque) ? "heap" : "index",
1766  nhtid,
1767  LSN_FORMAT_ARGS(state->targetlsn))));
1768  }
1769 
1770  /*
1771  * If the index is unique verify entries uniqueness by checking the
1772  * heap tuples visibility.
1773  */
1774  if (state->checkunique && state->indexinfo->ii_Unique &&
1775  P_ISLEAF(topaque) && !skey->anynullkeys)
1776  bt_entry_unique_check(state, itup, state->targetblock, offset,
1777  &lVis_i, &lVis_tid, &lVis_offset,
1778  &lVis_block);
1779 
1780  if (state->checkunique && state->indexinfo->ii_Unique &&
1781  P_ISLEAF(topaque) && OffsetNumberNext(offset) <= max)
1782  {
1783  /* Save current scankey tid */
1784  scantid = skey->scantid;
1785 
1786  /*
1787  * Invalidate scankey tid to make _bt_compare compare only keys in
1788  * the item to report equality even if heap TIDs are different
1789  */
1790  skey->scantid = NULL;
1791 
1792  /*
1793  * If next key tuple is different, invalidate last visible entry
1794  * data (whole index tuple or last posting in index tuple). Key
1795  * containing null value does not violate unique constraint and
1796  * treated as different to any other key.
1797  */
1798  if (_bt_compare(state->rel, skey, state->target,
1799  OffsetNumberNext(offset)) != 0 || skey->anynullkeys)
1800  {
1801  lVis_i = -1;
1802  lVis_tid = NULL;
1803  lVis_block = InvalidBlockNumber;
1804  lVis_offset = InvalidOffsetNumber;
1805  }
1806  skey->scantid = scantid; /* Restore saved scan key state */
1807  }
1808 
1809  /*
1810  * * Last item check *
1811  *
1812  * Check last item against next/right page's first data item's when
1813  * last item on page is reached. This additional check will detect
1814  * transposed pages iff the supposed right sibling page happens to
1815  * belong before target in the key space. (Otherwise, a subsequent
1816  * heap verification will probably detect the problem.)
1817  *
1818  * This check is similar to the item order check that will have
1819  * already been performed for every other "real" item on target page
1820  * when last item is checked. The difference is that the next item
1821  * (the item that is compared to target's last item) needs to come
1822  * from the next/sibling page. There may not be such an item
1823  * available from sibling for various reasons, though (e.g., target is
1824  * the rightmost page on level).
1825  */
1826  if (offset == max)
1827  {
1828  BTScanInsert rightkey;
1829  BlockNumber rightblock_number;
1830 
1831  /* first offset on a right index page (log only) */
1832  OffsetNumber rightfirstoffset = InvalidOffsetNumber;
1833 
1834  /* Get item in next/right page */
1835  rightkey = bt_right_page_check_scankey(state, &rightfirstoffset);
1836 
1837  if (rightkey &&
1838  !invariant_g_offset(state, rightkey, max))
1839  {
1840  /*
1841  * As explained at length in bt_right_page_check_scankey(),
1842  * there is a known !readonly race that could account for
1843  * apparent violation of invariant, which we must check for
1844  * before actually proceeding with raising error. Our canary
1845  * condition is that target page was deleted.
1846  */
1847  if (!state->readonly)
1848  {
1849  /* Get fresh copy of target page */
1850  state->target = palloc_btree_page(state, state->targetblock);
1851  /* Note that we deliberately do not update target LSN */
1852  topaque = BTPageGetOpaque(state->target);
1853 
1854  /*
1855  * All !readonly checks now performed; just return
1856  */
1857  if (P_IGNORE(topaque))
1858  return;
1859  }
1860 
1861  ereport(ERROR,
1862  (errcode(ERRCODE_INDEX_CORRUPTED),
1863  errmsg("cross page item order invariant violated for index \"%s\"",
1865  errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.",
1866  state->targetblock, offset,
1867  LSN_FORMAT_ARGS(state->targetlsn))));
1868  }
1869 
1870  /*
1871  * If index has unique constraint make sure that no more than one
1872  * found equal items is visible.
1873  */
1874  rightblock_number = topaque->btpo_next;
1875  if (state->checkunique && state->indexinfo->ii_Unique &&
1876  rightkey && P_ISLEAF(topaque) && rightblock_number != P_NONE)
1877  {
1878  elog(DEBUG2, "check cross page unique condition");
1879 
1880  /*
1881  * Make _bt_compare compare only index keys without heap TIDs.
1882  * rightkey->scantid is modified destructively but it is ok
1883  * for it is not used later.
1884  */
1885  rightkey->scantid = NULL;
1886 
1887  /* The first key on the next page is the same */
1888  if (_bt_compare(state->rel, rightkey, state->target, max) == 0 && !rightkey->anynullkeys)
1889  {
1890  elog(DEBUG2, "cross page equal keys");
1891  state->target = palloc_btree_page(state,
1892  rightblock_number);
1893  topaque = BTPageGetOpaque(state->target);
1894 
1895  if (P_IGNORE(topaque) || !P_ISLEAF(topaque))
1896  break;
1897 
1898  itemid = PageGetItemIdCareful(state, rightblock_number,
1899  state->target,
1900  rightfirstoffset);
1901  itup = (IndexTuple) PageGetItem(state->target, itemid);
1902 
1903  bt_entry_unique_check(state, itup, rightblock_number, rightfirstoffset,
1904  &lVis_i, &lVis_tid, &lVis_offset,
1905  &lVis_block);
1906  }
1907  }
1908  }
1909 
1910  /*
1911  * * Downlink check *
1912  *
1913  * Additional check of child items iff this is an internal page and
1914  * caller holds a ShareLock. This happens for every downlink (item)
1915  * in target excluding the negative-infinity downlink (again, this is
1916  * because it has no useful value to compare).
1917  */
1918  if (!P_ISLEAF(topaque) && state->readonly)
1919  bt_child_check(state, skey, offset);
1920  }
1921 
1922  /*
1923  * Special case bt_child_highkey_check() call
1924  *
1925  * We don't pass a real downlink, but we've to finish the level
1926  * processing. If condition is satisfied, we've already processed all the
1927  * downlinks from the target level. But there still might be pages to the
1928  * right of the child page pointer to by our rightmost downlink. And they
1929  * might have missing downlinks. This final call checks for them.
1930  */
1931  if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
1932  {
1934  NULL, topaque->btpo_level);
1935  }
1936 }
void bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)
Definition: bloomfilter.c:135
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:169
#define BTMaxItemSize(page)
Definition: nbtree.h:164
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2511
#define InvalidOffsetNumber
Definition: off.h:26
ItemPointer scantid
Definition: nbtree.h:791
bool heapkeyspace
Definition: nbtree.h:786
bool anynullkeys
Definition: nbtree.h:788
static bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound)
static ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup)
static IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n)
static void bt_entry_unique_check(BtreeCheckState *state, IndexTuple itup, BlockNumber targetblock, OffsetNumber offset, int *lVis_i, ItemPointer *lVis_tid, OffsetNumber *lVis_offset, BlockNumber *lVis_block)
static bool invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound)
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state, OffsetNumber *rightfirstoffset)
static bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound)
static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, OffsetNumber downlinkoffnum)

References _bt_check_natts(), _bt_compare(), BTScanInsertData::anynullkeys, bloom_add_element(), bt_child_check(), bt_child_highkey_check(), bt_entry_unique_check(), bt_mkscankey_pivotsearch(), bt_normalize_tuple(), bt_posting_plain_tuple(), bt_right_page_check_scankey(), bt_rootdescend(), BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPointsToTID(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), CHECK_FOR_INTERRUPTS, DEBUG2, elog(), ereport, errcode(), errdetail_internal(), errhint(), errmsg(), errmsg_internal(), ERROR, BTScanInsertData::heapkeyspace, i, IndexTupleSize, InvalidBlockNumber, InvalidOffsetNumber, invariant_g_offset(), invariant_l_offset(), invariant_leq_offset(), ItemIdGetLength, ItemIdIsDead, ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetBlockNumber(), ItemPointerGetBlockNumberNoCheck(), ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), LSN_FORMAT_ARGS, offset_is_negative_infinity(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), psprintf(), RelationGetRelationName, and BTScanInsertData::scantid.

Referenced by bt_check_level_from_leftmost().

◆ bt_tuple_present_callback()

static void bt_tuple_present_callback ( Relation  index,
ItemPointer  tid,
Datum values,
bool isnull,
bool  tupleIsAlive,
void *  checkstate 
)
static

Definition at line 2872 of file verify_nbtree.c.

2874 {
2875  BtreeCheckState *state = (BtreeCheckState *) checkstate;
2876  IndexTuple itup,
2877  norm;
2878 
2879  Assert(state->heapallindexed);
2880 
2881  /* Generate a normalized index tuple for fingerprinting */
2882  itup = index_form_tuple(RelationGetDescr(index), values, isnull);
2883  itup->t_tid = *tid;
2884  norm = bt_normalize_tuple(state, itup);
2885 
2886  /* Probe Bloom filter -- tuple should be present */
2887  if (bloom_lacks_element(state->filter, (unsigned char *) norm,
2888  IndexTupleSize(norm)))
2889  ereport(ERROR,
2891  errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
2892  ItemPointerGetBlockNumber(&(itup->t_tid)),
2894  RelationGetRelationName(state->heaprel),
2896  !state->readonly
2897  ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")
2898  : 0));
2899 
2900  state->heaptuplespresent++;
2901  pfree(itup);
2902  /* Cannot leak memory here */
2903  if (norm != itup)
2904  pfree(norm);
2905 }
bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)
Definition: bloomfilter.c:157
static Datum values[MAXATTR]
Definition: bootstrap.c:156
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
Definition: type.h:95

References Assert(), bloom_lacks_element(), bt_normalize_tuple(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, index_form_tuple(), IndexTupleSize, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), pfree(), RelationGetDescr, RelationGetRelationName, IndexTupleData::t_tid, and values.

Referenced by bt_check_every_level().

◆ btree_index_checkable()

static void btree_index_checkable ( Relation  rel)
inlinestatic

Definition at line 418 of file verify_nbtree.c.

419 {
420  if (rel->rd_rel->relkind != RELKIND_INDEX ||
421  rel->rd_rel->relam != BTREE_AM_OID)
422  ereport(ERROR,
423  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
424  errmsg("only B-Tree indexes are supported as targets for verification"),
425  errdetail("Relation \"%s\" is not a B-Tree index.",
426  RelationGetRelationName(rel))));
427 
428  if (RELATION_IS_OTHER_TEMP(rel))
429  ereport(ERROR,
430  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
431  errmsg("cannot access temporary tables of other sessions"),
432  errdetail("Index \"%s\" is associated with temporary relation.",
433  RelationGetRelationName(rel))));
434 
435  if (!rel->rd_index->indisvalid)
436  ereport(ERROR,
437  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
438  errmsg("cannot check index \"%s\"",
440  errdetail("Index is not valid.")));
441 }
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:659

References ereport, errcode(), errdetail(), errmsg(), ERROR, RelationData::rd_index, RelationData::rd_rel, RELATION_IS_OTHER_TEMP, and RelationGetRelationName.

Referenced by bt_index_check_internal().

◆ btree_index_mainfork_expected()

static bool btree_index_mainfork_expected ( Relation  rel)
inlinestatic

Definition at line 452 of file verify_nbtree.c.

453 {
454  if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED ||
456  return true;
457 
458  ereport(DEBUG1,
459  (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
460  errmsg("cannot verify unlogged index \"%s\" during recovery, skipping",
461  RelationGetRelationName(rel))));
462 
463  return false;
464 }
bool RecoveryInProgress(void)
Definition: xlog.c:6039

References DEBUG1, ereport, errcode(), errmsg(), RelationData::rd_rel, RecoveryInProgress(), and RelationGetRelationName.

Referenced by bt_index_check_internal().

◆ BTreeTupleGetHeapTIDCareful()

static ItemPointer BTreeTupleGetHeapTIDCareful ( BtreeCheckState state,
IndexTuple  itup,
bool  nonpivot 
)
inlinestatic

Definition at line 3586 of file verify_nbtree.c.

3588 {
3589  ItemPointer htid;
3590 
3591  /*
3592  * Caller determines whether this is supposed to be a pivot or non-pivot
3593  * tuple using page type and item offset number. Verify that tuple
3594  * metadata agrees with this.
3595  */
3596  Assert(state->heapkeyspace);
3597  if (BTreeTupleIsPivot(itup) && nonpivot)
3598  ereport(ERROR,
3599  (errcode(ERRCODE_INDEX_CORRUPTED),
3600  errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple",
3601  state->targetblock,
3602  RelationGetRelationName(state->rel))));
3603 
3604  if (!BTreeTupleIsPivot(itup) && !nonpivot)
3605  ereport(ERROR,
3606  (errcode(ERRCODE_INDEX_CORRUPTED),
3607  errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple",
3608  state->targetblock,
3609  RelationGetRelationName(state->rel))));
3610 
3611  htid = BTreeTupleGetHeapTID(itup);
3612  if (!ItemPointerIsValid(htid) && nonpivot)
3613  ereport(ERROR,
3614  (errcode(ERRCODE_INDEX_CORRUPTED),
3615  errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
3616  state->targetblock,
3617  RelationGetRelationName(state->rel))));
3618 
3619  return htid;
3620 }

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleIsPivot(), ereport, errcode(), errmsg(), errmsg_internal(), ERROR, ItemPointerIsValid(), and RelationGetRelationName.

Referenced by invariant_l_nontarget_offset(), and invariant_l_offset().

◆ BTreeTupleGetPointsToTID()

static ItemPointer BTreeTupleGetPointsToTID ( IndexTuple  itup)
inlinestatic

Definition at line 3634 of file verify_nbtree.c.

3635 {
3636  /*
3637  * Rely on the assumption that !heapkeyspace internal page data items will
3638  * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't
3639  * recognize it as a pivot tuple, but everything still works out because
3640  * the t_tid field is still returned
3641  */
3642  if (!BTreeTupleIsPivot(itup))
3643  return BTreeTupleGetHeapTID(itup);
3644 
3645  /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */
3646  return &itup->t_tid;
3647 }

References BTreeTupleGetHeapTID(), BTreeTupleIsPivot(), and IndexTupleData::t_tid.

Referenced by bt_target_page_check().

◆ heap_entry_is_visible()

static bool heap_entry_is_visible ( BtreeCheckState state,
ItemPointer  tid 
)
static

Definition at line 978 of file verify_nbtree.c.

979 {
980  bool tid_visible;
981 
982  TupleTableSlot *slot = table_slot_create(state->heaprel, NULL);
983 
984  tid_visible = table_tuple_fetch_row_version(state->heaprel,
985  tid, state->snapshot, slot);
986  if (slot != NULL)
988 
989  return tid_visible;
990 }
void ExecDropSingleTupleTableSlot(TupleTableSlot *slot)
Definition: execTuples.c:1253
TupleTableSlot * table_slot_create(Relation relation, List **reglist)
Definition: tableam.c:91
static bool table_tuple_fetch_row_version(Relation rel, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot)
Definition: tableam.h:1283

References ExecDropSingleTupleTableSlot(), table_slot_create(), and table_tuple_fetch_row_version().

Referenced by bt_entry_unique_check().

◆ invariant_g_offset()

static bool invariant_g_offset ( BtreeCheckState state,
BTScanInsert  key,
OffsetNumber  lowerbound 
)
inlinestatic

Definition at line 3248 of file verify_nbtree.c.

3250 {
3251  int32 cmp;
3252 
3253  Assert(!key->nextkey && key->backward);
3254 
3255  cmp = _bt_compare(state->rel, key, state->target, lowerbound);
3256 
3257  /* pg_upgrade'd indexes may legally have equal sibling tuples */
3258  if (!key->heapkeyspace)
3259  return cmp >= 0;
3260 
3261  /*
3262  * No need to consider the possibility that scankey has attributes that we
3263  * need to force to be interpreted as negative infinity. _bt_compare() is
3264  * able to determine that scankey is greater than negative infinity. The
3265  * distinction between "==" and "<" isn't interesting here, since
3266  * corruption is indicated either way.
3267  */
3268  return cmp > 0;
3269 }
signed int int32
Definition: c.h:483
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743

References _bt_compare(), Assert(), cmp(), and sort-test::key.

Referenced by bt_target_page_check().

◆ invariant_l_nontarget_offset()

static bool invariant_l_nontarget_offset ( BtreeCheckState state,
BTScanInsert  key,
BlockNumber  nontargetblock,
Page  nontarget,
OffsetNumber  upperbound 
)
inlinestatic

Definition at line 3284 of file verify_nbtree.c.

3287 {
3288  ItemId itemid;
3289  int32 cmp;
3290 
3291  Assert(!key->nextkey && key->backward);
3292 
3293  /* Verify line pointer before checking tuple */
3294  itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
3295  upperbound);
3296  cmp = _bt_compare(state->rel, key, nontarget, upperbound);
3297 
3298  /* pg_upgrade'd indexes may legally have equal sibling tuples */
3299  if (!key->heapkeyspace)
3300  return cmp <= 0;
3301 
3302  /* See invariant_l_offset() for an explanation of this extra step */
3303  if (cmp == 0)
3304  {
3305  IndexTuple child;
3306  int uppnkeyatts;
3307  ItemPointer childheaptid;
3308  BTPageOpaque copaque;
3309  bool nonpivot;
3310 
3311  child = (IndexTuple) PageGetItem(nontarget, itemid);
3312  copaque = BTPageGetOpaque(nontarget);
3313  nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque);
3314 
3315  /* Get number of keys + heap TID for child/non-target item */
3316  uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel);
3317  childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot);
3318 
3319  /* Heap TID is tiebreaker key attribute */
3320  if (key->keysz == uppnkeyatts)
3321  return key->scantid == NULL && childheaptid != NULL;
3322 
3323  return key->keysz < uppnkeyatts;
3324  }
3325 
3326  return cmp < 0;
3327 }
static ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot)
#define BTreeTupleGetNKeyAtts(itup, rel)
Definition: verify_nbtree.c:53

References _bt_compare(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTIDCareful(), BTreeTupleGetNKeyAtts, cmp(), sort-test::key, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), and PageGetItemIdCareful().

Referenced by bt_child_check().

◆ invariant_l_offset()

static bool invariant_l_offset ( BtreeCheckState state,
BTScanInsert  key,
OffsetNumber  upperbound 
)
inlinestatic

Definition at line 3162 of file verify_nbtree.c.

3164 {
3165  ItemId itemid;
3166  int32 cmp;
3167 
3168  Assert(!key->nextkey && key->backward);
3169 
3170  /* Verify line pointer before checking tuple */
3171  itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
3172  upperbound);
3173  /* pg_upgrade'd indexes may legally have equal sibling tuples */
3174  if (!key->heapkeyspace)
3175  return invariant_leq_offset(state, key, upperbound);
3176 
3177  cmp = _bt_compare(state->rel, key, state->target, upperbound);
3178 
3179  /*
3180  * _bt_compare() is capable of determining that a scankey with a
3181  * filled-out attribute is greater than pivot tuples where the comparison
3182  * is resolved at a truncated attribute (value of attribute in pivot is
3183  * minus infinity). However, it is not capable of determining that a
3184  * scankey is _less than_ a tuple on the basis of a comparison resolved at
3185  * _scankey_ minus infinity attribute. Complete an extra step to simulate
3186  * having minus infinity values for omitted scankey attribute(s).
3187  */
3188  if (cmp == 0)
3189  {
3190  BTPageOpaque topaque;
3191  IndexTuple ritup;
3192  int uppnkeyatts;
3193  ItemPointer rheaptid;
3194  bool nonpivot;
3195 
3196  ritup = (IndexTuple) PageGetItem(state->target, itemid);
3197  topaque = BTPageGetOpaque(state->target);
3198  nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque);
3199 
3200  /* Get number of keys + heap TID for item to the right */
3201  uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel);
3202  rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot);
3203 
3204  /* Heap TID is tiebreaker key attribute */
3205  if (key->keysz == uppnkeyatts)
3206  return key->scantid == NULL && rheaptid != NULL;
3207 
3208  return key->keysz < uppnkeyatts;
3209  }
3210 
3211  return cmp < 0;
3212 }

References _bt_compare(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTIDCareful(), BTreeTupleGetNKeyAtts, cmp(), invariant_leq_offset(), sort-test::key, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), and PageGetItemIdCareful().

Referenced by bt_target_page_check().

◆ invariant_leq_offset()

static bool invariant_leq_offset ( BtreeCheckState state,
BTScanInsert  key,
OffsetNumber  upperbound 
)
inlinestatic

Definition at line 3225 of file verify_nbtree.c.

3227 {
3228  int32 cmp;
3229 
3230  Assert(!key->nextkey && key->backward);
3231 
3232  cmp = _bt_compare(state->rel, key, state->target, upperbound);
3233 
3234  return cmp <= 0;
3235 }

References _bt_compare(), Assert(), cmp(), and sort-test::key.

Referenced by bt_target_page_check(), and invariant_l_offset().

◆ offset_is_negative_infinity()

static bool offset_is_negative_infinity ( BTPageOpaque  opaque,
OffsetNumber  offset 
)
inlinestatic

Definition at line 3127 of file verify_nbtree.c.

3128 {
3129  /*
3130  * For internal pages only, the first item after high key, if any, is
3131  * negative infinity item. Internal pages always have a negative infinity
3132  * item, whereas leaf pages never have one. This implies that negative
3133  * infinity item is either first or second line item, or there is none
3134  * within page.
3135  *
3136  * Negative infinity items are a special case among pivot tuples. They
3137  * always have zero attributes, while all other pivot tuples always have
3138  * nkeyatts attributes.
3139  *
3140  * Right-most pages don't have a high key, but could be said to
3141  * conceptually have a "positive infinity" high key. Thus, there is a
3142  * symmetry between down link items in parent pages, and high keys in
3143  * children. Together, they represent the part of the key space that
3144  * belongs to each page in the index. For example, all children of the
3145  * root page will have negative infinity as a lower bound from root
3146  * negative infinity downlink, and positive infinity as an upper bound
3147  * (implicitly, from "imaginary" positive infinity high key in root).
3148  */
3149  return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
3150 }

References P_FIRSTDATAKEY, and P_ISLEAF.

Referenced by bt_child_check(), bt_child_highkey_check(), and bt_target_page_check().

◆ PageGetItemIdCareful()

static ItemId PageGetItemIdCareful ( BtreeCheckState state,
BlockNumber  block,
Page  page,
OffsetNumber  offset 
)
static

Definition at line 3546 of file verify_nbtree.c.

3548 {
3549  ItemId itemid = PageGetItemId(page, offset);
3550 
3551  if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) >
3552  BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData)))
3553  ereport(ERROR,
3554  (errcode(ERRCODE_INDEX_CORRUPTED),
3555  errmsg("line pointer points past end of tuple space in index \"%s\"",
3557  errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
3558  block, offset, ItemIdGetOffset(itemid),
3559  ItemIdGetLength(itemid),
3560  ItemIdGetFlags(itemid))));
3561 
3562  /*
3563  * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree
3564  * never uses either. Verify that line pointer has storage, too, since
3565  * even LP_DEAD items should within nbtree.
3566  */
3567  if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) ||
3568  ItemIdGetLength(itemid) == 0)
3569  ereport(ERROR,
3570  (errcode(ERRCODE_INDEX_CORRUPTED),
3571  errmsg("invalid line pointer storage in index \"%s\"",
3573  errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
3574  block, offset, ItemIdGetOffset(itemid),
3575  ItemIdGetLength(itemid),
3576  ItemIdGetFlags(itemid))));
3577 
3578  return itemid;
3579 }
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
#define ItemIdGetOffset(itemId)
Definition: itemid.h:65
#define ItemIdIsUsed(itemId)
Definition: itemid.h:92
#define ItemIdIsRedirected(itemId)
Definition: itemid.h:106
#define ItemIdGetFlags(itemId)
Definition: itemid.h:71

References ereport, errcode(), errdetail_internal(), errmsg(), ERROR, ItemIdGetFlags, ItemIdGetLength, ItemIdGetOffset, ItemIdIsRedirected, ItemIdIsUsed, MAXALIGN, PageGetItemId(), and RelationGetRelationName.

Referenced by bt_check_level_from_leftmost(), bt_child_check(), bt_child_highkey_check(), bt_downlink_missing_check(), bt_right_page_check_scankey(), bt_target_page_check(), invariant_l_nontarget_offset(), and invariant_l_offset().

◆ palloc_btree_page()

static Page palloc_btree_page ( BtreeCheckState state,
BlockNumber  blocknum 
)
static

Definition at line 3344 of file verify_nbtree.c.

3345 {
3346  Buffer buffer;
3347  Page page;
3348  BTPageOpaque opaque;
3349  OffsetNumber maxoffset;
3350 
3351  page = palloc(BLCKSZ);
3352 
3353  /*
3354  * We copy the page into local storage to avoid holding pin on the buffer
3355  * longer than we must.
3356  */
3357  buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL,
3358  state->checkstrategy);
3359  LockBuffer(buffer, BT_READ);
3360 
3361  /*
3362  * Perform the same basic sanity checking that nbtree itself performs for
3363  * every page:
3364  */
3365  _bt_checkpage(state->rel, buffer);
3366 
3367  /* Only use copy of page in palloc()'d memory */
3368  memcpy(page, BufferGetPage(buffer), BLCKSZ);
3369  UnlockReleaseBuffer(buffer);
3370 
3371  opaque = BTPageGetOpaque(page);
3372 
3373  if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE)
3374  ereport(ERROR,
3375  (errcode(ERRCODE_INDEX_CORRUPTED),
3376  errmsg("invalid meta page found at block %u in index \"%s\"",
3377  blocknum, RelationGetRelationName(state->rel))));
3378 
3379  /* Check page from block that ought to be meta page */
3380  if (blocknum == BTREE_METAPAGE)
3381  {
3382  BTMetaPageData *metad = BTPageGetMeta(page);
3383 
3384  if (!P_ISMETA(opaque) ||
3385  metad->btm_magic != BTREE_MAGIC)
3386  ereport(ERROR,
3387  (errcode(ERRCODE_INDEX_CORRUPTED),
3388  errmsg("index \"%s\" meta page is corrupt",
3389  RelationGetRelationName(state->rel))));
3390 
3391  if (metad->btm_version < BTREE_MIN_VERSION ||
3392  metad->btm_version > BTREE_VERSION)
3393  ereport(ERROR,
3394  (errcode(ERRCODE_INDEX_CORRUPTED),
3395  errmsg("version mismatch in index \"%s\": file version %d, "
3396  "current version %d, minimum supported version %d",
3398  metad->btm_version, BTREE_VERSION,
3399  BTREE_MIN_VERSION)));
3400 
3401  /* Finished with metapage checks */
3402  return page;
3403  }
3404 
3405  /*
3406  * Deleted pages that still use the old 32-bit XID representation have no
3407  * sane "level" field because they type pun the field, but all other pages
3408  * (including pages deleted on Postgres 14+) have a valid value.
3409  */
3410  if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
3411  {
3412  /* Okay, no reason not to trust btpo_level field from page */
3413 
3414  if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
3415  ereport(ERROR,
3416  (errcode(ERRCODE_INDEX_CORRUPTED),
3417  errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
3418  opaque->btpo_level, blocknum,
3419  RelationGetRelationName(state->rel))));
3420 
3421  if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
3422  ereport(ERROR,
3423  (errcode(ERRCODE_INDEX_CORRUPTED),
3424  errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
3425  blocknum,
3426  RelationGetRelationName(state->rel))));
3427  }
3428 
3429  /*
3430  * Sanity checks for number of items on page.
3431  *
3432  * As noted at the beginning of _bt_binsrch(), an internal page must have
3433  * children, since there must always be a negative infinity downlink
3434  * (there may also be a highkey). In the case of non-rightmost leaf
3435  * pages, there must be at least a highkey. The exceptions are deleted
3436  * pages, which contain no items.
3437  *
3438  * This is correct when pages are half-dead, since internal pages are
3439  * never half-dead, and leaf pages must have a high key when half-dead
3440  * (the rightmost page can never be deleted). It's also correct with
3441  * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything
3442  * about the target page other than setting the page as fully dead, and
3443  * setting its xact field. In particular, it doesn't change the sibling
3444  * links in the deletion target itself, since they're required when index
3445  * scans land on the deletion target, and then need to move right (or need
3446  * to move left, in the case of backward index scans).
3447  */
3448  maxoffset = PageGetMaxOffsetNumber(page);
3449  if (maxoffset > MaxIndexTuplesPerPage)
3450  ereport(ERROR,
3451  (errcode(ERRCODE_INDEX_CORRUPTED),
3452  errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
3453  blocknum, RelationGetRelationName(state->rel),
3455 
3456  if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
3457  ereport(ERROR,
3458  (errcode(ERRCODE_INDEX_CORRUPTED),
3459  errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
3460  blocknum, RelationGetRelationName(state->rel))));
3461 
3462  if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
3463  ereport(ERROR,
3464  (errcode(ERRCODE_INDEX_CORRUPTED),
3465  errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
3466  blocknum, RelationGetRelationName(state->rel))));
3467 
3468  /*
3469  * In general, internal pages are never marked half-dead, except on
3470  * versions of Postgres prior to 9.4, where it can be valid transient
3471  * state. This state is nonetheless treated as corruption by VACUUM on
3472  * from version 9.4 on, so do the same here. See _bt_pagedel() for full
3473  * details.
3474  */
3475  if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
3476  ereport(ERROR,
3477  (errcode(ERRCODE_INDEX_CORRUPTED),
3478  errmsg("internal page block %u in index \"%s\" is half-dead",
3479  blocknum, RelationGetRelationName(state->rel)),
3480  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
3481 
3482  /*
3483  * Check that internal pages have no garbage items, and that no page has
3484  * an invalid combination of deletion-related page level flags
3485  */
3486  if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
3487  ereport(ERROR,
3488  (errcode(ERRCODE_INDEX_CORRUPTED),
3489  errmsg_internal("internal page block %u in index \"%s\" has garbage items",
3490  blocknum, RelationGetRelationName(state->rel))));
3491 
3492  if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
3493  ereport(ERROR,
3494  (errcode(ERRCODE_INDEX_CORRUPTED),
3495  errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
3496  blocknum, RelationGetRelationName(state->rel))));
3497 
3498  if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
3499  ereport(ERROR,
3500  (errcode(ERRCODE_INDEX_CORRUPTED),
3501  errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
3502  blocknum, RelationGetRelationName(state->rel))));
3503 
3504  return page;
3505 }
#define MaxIndexTuplesPerPage
Definition: itup.h:165
void * palloc(Size size)
Definition: mcxt.c:1226
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:226
#define P_ISMETA(opaque)
Definition: nbtree.h:223
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTREE_VERSION
Definition: nbtree.h:150
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105

References _bt_checkpage(), BT_READ, BTMetaPageData::btm_magic, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), ERROR, LockBuffer(), MAIN_FORKNUM, MaxIndexTuplesPerPage, P_FIRSTDATAKEY, P_HAS_FULLXID, P_HAS_GARBAGE, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISMETA, P_RIGHTMOST, PageGetMaxOffsetNumber(), palloc(), RBM_NORMAL, ReadBufferExtended(), RelationGetRelationName, and UnlockReleaseBuffer().

Referenced by bt_check_every_level(), bt_check_level_from_leftmost(), bt_child_check(), bt_child_highkey_check(), bt_downlink_missing_check(), bt_leftmost_ignoring_half_dead(), bt_right_page_check_scankey(), and bt_target_page_check().

◆ PG_FUNCTION_INFO_V1() [1/2]

PG_FUNCTION_INFO_V1 ( bt_index_check  )

◆ PG_FUNCTION_INFO_V1() [2/2]

PG_FUNCTION_INFO_V1 ( bt_index_parent_check  )

Variable Documentation

◆ PG_MODULE_MAGIC

PG_MODULE_MAGIC

Definition at line 46 of file verify_nbtree.c.