/*-------------------------------------------------------------------------
 *
 * verify_nbtree.c
 *		Verifies the integrity of nbtree indexes based on invariants.
 *
 * For B-Tree indexes, verification includes checking that each page in the
 * target index has items in logical order as reported by an insertion scankey
 * (the insertion scankey sort-wise NULL semantics are needed for
 * verification).
 *
 * When index-to-heap verification is requested, a Bloom filter is used to
 * fingerprint all tuples in the target index, as the index is traversed to
 * verify its structure.  A heap scan later uses Bloom filter probes to verify
 * that every visible heap tuple has a matching index tuple.
 *
 *
 * Copyright (c) 2017-2019, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/amcheck/verify_nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/tablecmds.h"
#include "lib/bloomfilter.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"


PG_MODULE_MAGIC;

/*
 * A B-Tree cannot possibly have this many levels, since there must be one
 * block per level, which is bound by the range of BlockNumber:
 */
#define InvalidBtreeLevel	((uint32) InvalidBlockNumber)
#define BTreeTupleGetNKeyAtts(itup, rel) \
	Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))
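
/*
 * Illustrative example of BTreeTupleGetNKeyAtts() (numbers are hypothetical):
 * in an index with three key columns, a pivot tuple that was suffix-truncated
 * down to a single attribute yields Min(3, 1) == 1, the number of key
 * attributes actually present in the tuple.
 */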

/*
 * State associated with verifying a B-Tree index
 *
 * target is the point of reference for a verification operation.
 *
 * Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
 * they are current target's child pages).  Conceptually, problems are only
 * ever found in the current target page (or for a particular heap tuple during
 * heapallindexed verification).  Each page found by verification's left/right,
 * top/bottom scan becomes the target exactly once.
 */
typedef struct BtreeCheckState
{
	/*
	 * Unchanging state, established at start of verification:
	 */

	/* B-Tree Index Relation and associated heap relation */
	Relation	rel;
	Relation	heaprel;
	/* rel is heapkeyspace index? */
	bool		heapkeyspace;
	/* ShareLock held on heap/index, rather than AccessShareLock? */
	bool		readonly;
	/* Also verifying heap has no unindexed tuples? */
	bool		heapallindexed;
	/* Also making sure non-pivot tuples can be found by new search? */
	bool		rootdescend;
	/* Per-page context */
	MemoryContext targetcontext;
	/* Buffer access strategy */
	BufferAccessStrategy checkstrategy;

	/*
	 * Mutable state, for verification of particular page:
	 */

	/* Current target page */
	Page		target;
	/* Target block number */
	BlockNumber targetblock;
	/* Target page's LSN */
	XLogRecPtr	targetlsn;

	/*
	 * Mutable state, for optional heapallindexed verification:
	 */

	/* Bloom filter fingerprints B-Tree index */
	bloom_filter *filter;
	/* Bloom filter fingerprints downlink blocks within tree */
	bloom_filter *downlinkfilter;
	/* Right half of incomplete split marker */
	bool		rightsplit;
	/* Debug counter */
	int64		heaptuplespresent;
} BtreeCheckState;
/*
 * Starting point for verifying an entire B-Tree index level
 */
typedef struct BtreeLevel
{
	/* Level number (0 is leaf page level). */
	uint32		level;

	/* Left most block on level.  Scan of level begins here. */
	BlockNumber leftmost;

	/* Is this level reported as "true" root level by meta page? */
	bool		istruerootlevel;
} BtreeLevel;

PG_FUNCTION_INFO_V1(bt_index_check);
PG_FUNCTION_INFO_V1(bt_index_parent_check);

static void bt_index_check_internal(Oid indrelid, bool parentcheck,
									bool heapallindexed, bool rootdescend);
static inline void btree_index_checkable(Relation rel);
static void bt_check_every_level(Relation rel, Relation heaprel,
								 bool heapkeyspace, bool readonly, bool heapallindexed,
								 bool rootdescend);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
											   BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
							  BlockNumber childblock);
static void bt_downlink_missing_check(BtreeCheckState *state);
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
									  Datum *values, bool *isnull,
									  bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
									 IndexTuple itup);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
											   OffsetNumber offset);
static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
									  OffsetNumber upperbound);
static inline bool invariant_leq_offset(BtreeCheckState *state,
										BTScanInsert key,
										OffsetNumber upperbound);
static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
									  OffsetNumber lowerbound);
static inline bool invariant_l_nontarget_offset(BtreeCheckState *state,
												BTScanInsert key,
												BlockNumber nontargetblock,
												Page nontarget,
												OffsetNumber upperbound);
static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel,
													IndexTuple itup);
static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block,
								   Page page, OffsetNumber offset);
static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
													  IndexTuple itup, bool nonpivot);

/*
 * bt_index_check(index regclass, heapallindexed boolean)
 *
 * Verify integrity of B-Tree index.
 *
 * Acquires AccessShareLock on heap & index relations.  Does not consider
 * invariants that exist between parent/child pages.  Optionally verifies
 * that heap does not contain any unindexed or incorrectly indexed tuples.
 */
Datum
bt_index_check(PG_FUNCTION_ARGS)
{
	Oid			indrelid = PG_GETARG_OID(0);
	bool		heapallindexed = false;

	if (PG_NARGS() == 2)
		heapallindexed = PG_GETARG_BOOL(1);

	bt_index_check_internal(indrelid, false, heapallindexed, false);

	PG_RETURN_VOID();
}
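
/*
 * Example SQL-level usage (illustrative only; the index name is
 * hypothetical):
 *
 *		CREATE EXTENSION amcheck;
 *		SELECT bt_index_check('some_btree_index'::regclass);
 *		SELECT bt_index_check('some_btree_index'::regclass, true);
 */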

/*
 * bt_index_parent_check(index regclass, heapallindexed boolean)
 *
 * Verify integrity of B-Tree index.
 *
 * Acquires ShareLock on heap & index relations.  Verifies that downlinks in
 * parent pages are valid lower bounds on child pages.  Optionally verifies
 * that heap does not contain any unindexed or incorrectly indexed tuples.
 */
Datum
bt_index_parent_check(PG_FUNCTION_ARGS)
{
	Oid			indrelid = PG_GETARG_OID(0);
	bool		heapallindexed = false;
	bool		rootdescend = false;

	if (PG_NARGS() >= 2)
		heapallindexed = PG_GETARG_BOOL(1);
	if (PG_NARGS() == 3)
		rootdescend = PG_GETARG_BOOL(2);

	bt_index_check_internal(indrelid, true, heapallindexed, rootdescend);

	PG_RETURN_VOID();
}
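
/*
 * Example SQL-level usage (illustrative only; the index name is
 * hypothetical).  The second argument requests heapallindexed verification,
 * and the third requests rootdescend verification:
 *
 *		SELECT bt_index_parent_check('some_btree_index'::regclass, true, true);
 */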

/*
 * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
 */
static void
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
						bool rootdescend)
{
	Oid			heapid;
	Relation	indrel;
	Relation	heaprel;
	bool		heapkeyspace;
	LOCKMODE	lockmode;

	if (parentcheck)
		lockmode = ShareLock;
	else
		lockmode = AccessShareLock;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indrelid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 *
	 * In hot standby mode this will raise an error when parentcheck is true.
	 */
	heapid = IndexGetRelation(indrelid, true);
	if (OidIsValid(heapid))
		heaprel = table_open(heapid, lockmode);
	else
		heaprel = NULL;

	/*
	 * Open the target index relations separately (like relation_openrv(), but
	 * with heap relation locked first to prevent deadlocking).  In hot
	 * standby mode this will raise an error when parentcheck is true.
	 *
	 * There is no need for the usual indcheckxmin usability horizon test
	 * here, even in the heapallindexed case, because index undergoing
	 * verification only needs to have entries for a new transaction snapshot.
	 * (If this is a parentcheck verification, there is no question about
	 * committed or recently dead heap tuples lacking index entries due to
	 * concurrent activity.)
	 */
	indrel = index_open(indrelid, lockmode);

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could have
	 * netted us the wrong table.
	 */
	if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index %s",
						RelationGetRelationName(indrel))));

	/* Relation suitable for checking as B-Tree? */
	btree_index_checkable(indrel);

	/* Check index, possibly against table it is an index on */
	heapkeyspace = _bt_heapkeyspace(indrel);
	bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
						 heapallindexed, rootdescend);

	/*
	 * Release locks early.  That's ok here because nothing in the called
	 * routines will trigger shared cache invalidations to be sent, so we can
	 * relax the usual pattern of only releasing locks after commit.
	 */
	index_close(indrel, lockmode);
	if (heaprel)
		table_close(heaprel, lockmode);
}

/*
 * Basic checks about the suitability of a relation for checking as a B-Tree
 * index.
 *
 * NB: Intentionally not checking permissions, the function is normally not
 * callable by non-superusers.  If granted, it's useful to be able to check a
 * whole cluster.
 */
static inline void
btree_index_checkable(Relation rel)
{
	if (rel->rd_rel->relkind != RELKIND_INDEX ||
		rel->rd_rel->relam != BTREE_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("only B-Tree indexes are supported as targets for verification"),
				 errdetail("Relation \"%s\" is not a B-Tree index.",
						   RelationGetRelationName(rel))));

	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions"),
				 errdetail("Index \"%s\" is associated with temporary relation.",
						   RelationGetRelationName(rel))));

	if (!rel->rd_index->indisvalid)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot check index \"%s\"",
						RelationGetRelationName(rel)),
				 errdetail("Index is not valid.")));
}

/*
 * Main entry point for B-Tree SQL-callable functions.  Walks the B-Tree in
 * logical order, verifying invariants as it goes.  Optionally, verification
 * checks if the heap relation contains any tuples that are not represented in
 * the index but should be.
 *
 * It is the caller's responsibility to acquire appropriate heavyweight lock on
 * the index relation, and advise us if extra checks are safe when a ShareLock
 * is held.  (A lock of the same type must also have been acquired on the heap
 * relation.)
 *
 * A ShareLock is generally assumed to prevent any kind of physical
 * modification to the index structure, including modifications that VACUUM may
 * make.  This does not include setting of the LP_DEAD bit by concurrent index
 * scans, although that is just metadata that is not able to directly affect
 * any check performed here.  Any concurrent process that might act on the
 * LP_DEAD bit being set (recycle space) requires a heavyweight lock that
 * cannot be held while we hold a ShareLock.  (Besides, even if that could
 * happen, the ad-hoc recycling when a page might otherwise split is performed
 * per-page, and requires an exclusive buffer lock, which wouldn't cause us
 * trouble.  _bt_delitems_vacuum() may only delete leaf items, and so the extra
 * parent/child check cannot be affected.)
 */
static void
bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
					 bool readonly, bool heapallindexed, bool rootdescend)
{
	BtreeCheckState *state;
	Page		metapage;
	BTMetaPageData *metad;
	uint32		previouslevel;
	BtreeLevel	current;
	Snapshot	snapshot = SnapshotAny;

	/*
	 * RecentGlobalXmin assertion matches index_getnext_tid().  See note on
	 * RecentGlobalXmin/B-Tree page deletion.
	 */
	Assert(TransactionIdIsValid(RecentGlobalXmin));

	/*
	 * Initialize state for entire verification operation
	 */
	state = palloc0(sizeof(BtreeCheckState));
	state->rel = rel;
	state->heaprel = heaprel;
	state->heapkeyspace = heapkeyspace;
	state->readonly = readonly;
	state->heapallindexed = heapallindexed;
	state->rootdescend = rootdescend;

	if (state->heapallindexed)
	{
		int64		total_elems;
		uint64		seed;

		/* Size Bloom filter based on estimated number of tuples in index */
		total_elems = (int64) state->rel->rd_rel->reltuples;
		/* Random seed relies on backend srandom() call to avoid repetition */
		seed = random();
		/* Create Bloom filter to fingerprint index */
		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
		state->heaptuplespresent = 0;

		/*
		 * Register our own snapshot in !readonly case, rather than asking
		 * table_index_build_scan() to do this for us later.  This needs to
		 * happen before index fingerprinting begins, so we can later be
		 * certain that index fingerprinting should have reached all tuples
		 * returned by table_index_build_scan().
		 *
		 * In readonly case, we also check for problems with missing
		 * downlinks.  A second Bloom filter is used for this.
		 */
		if (!state->readonly)
		{
			snapshot = RegisterSnapshot(GetTransactionSnapshot());

			/*
			 * GetTransactionSnapshot() always acquires a new MVCC snapshot in
			 * READ COMMITTED mode.  A new snapshot is guaranteed to have all
			 * the entries it requires in the index.
			 *
			 * We must defend against the possibility that an old xact
			 * snapshot was returned at higher isolation levels when that
			 * snapshot is not safe for index scans of the target index.  This
			 * is possible when the snapshot sees tuples that are before the
			 * index's indcheckxmin horizon.  Throwing an error here should be
			 * very rare.  It doesn't seem worth using a secondary snapshot to
			 * avoid this.
			 */
			if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
				!TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
									   snapshot->xmin))
				ereport(ERROR,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("index \"%s\" cannot be verified using transaction snapshot",
								RelationGetRelationName(rel))));
		}
		else
		{
			int64		total_pages;

			/*
			 * Extra readonly downlink check.
			 *
			 * In readonly case, we know that there cannot be a concurrent
			 * page split or a concurrent page deletion, which gives us the
			 * opportunity to verify that every non-ignorable page had a
			 * downlink one level up.  We must be tolerant of interrupted page
			 * splits and page deletions, though.  This is taken care of in
			 * bt_downlink_missing_check().
			 */
			total_pages = (int64) state->rel->rd_rel->relpages;
			state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
		}
	}
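
	/*
	 * Summary of the heapallindexed flow (illustrative comment): while the
	 * tree is walked, bt_target_page_check() fingerprints every non-dead
	 * leaf tuple with bloom_add_element(), and bt_tuple_present_callback()
	 * later probes with bloom_lacks_element() for every tuple returned by
	 * the heap scan.  Bloom filters have no false negatives, so a heap tuple
	 * reported as absent really is unindexed; false positives merely allow
	 * some corruption to escape detection.
	 */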

	Assert(!state->rootdescend || state->readonly);
	if (state->rootdescend && !state->heapkeyspace)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
						RelationGetRelationName(rel)),
				 errhint("Only B-Tree version 4 indexes support rootdescend verification.")));

	/* Create context for page */
	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
												 "amcheck context",
												 ALLOCSET_DEFAULT_SIZES);
	state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);

	/* Get true root block from meta-page */
	metapage = palloc_btree_page(state, BTREE_METAPAGE);
	metad = BTPageGetMeta(metapage);

	/*
	 * Certain deletion patterns can result in "skinny" B-Tree indexes, where
	 * the fast root and true root differ.
	 *
	 * Start from the true root, not the fast root, unlike conventional index
	 * scans.  This approach is more thorough, and removes the risk of
	 * following a stale fast root from the meta page.
	 */
	if (metad->btm_fastroot != metad->btm_root)
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("harmless fast root mismatch in index %s",
						RelationGetRelationName(rel)),
				 errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
									metad->btm_fastroot, metad->btm_fastlevel,
									metad->btm_root, metad->btm_level)));

	/*
	 * Starting at the root, verify every level.  Move left to right, top to
	 * bottom.  Note that there may be no pages other than the meta page (meta
	 * page can indicate that root is P_NONE when the index is totally empty).
	 */
	previouslevel = InvalidBtreeLevel;
	current.level = metad->btm_level;
	current.leftmost = metad->btm_root;
	current.istruerootlevel = true;
	while (current.leftmost != P_NONE)
	{
		/*
		 * Leftmost page on level cannot be right half of incomplete split.
		 * This can go stale immediately in !readonly case.
		 */
		state->rightsplit = false;

		/*
		 * Verify this level, and get left most page for next level down, if
		 * not at leaf level
		 */
		current = bt_check_level_from_leftmost(state, current);

		if (current.leftmost == InvalidBlockNumber)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has no valid pages on level below %u or first level",
							RelationGetRelationName(rel), previouslevel)));

		previouslevel = current.level;
	}

	/*
	 * * Check whether heap contains unindexed/malformed tuples *
	 */
	if (state->heapallindexed)
	{
		IndexInfo  *indexinfo = BuildIndexInfo(state->rel);
		TableScanDesc scan;

		/* Report on extra downlink checks performed in readonly case */
		if (state->readonly)
		{
			ereport(DEBUG1,
					(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
									 RelationGetRelationName(rel),
									 100.0 * bloom_prop_bits_set(state->downlinkfilter))));
			bloom_free(state->downlinkfilter);
		}

		/*
		 * Create our own scan for table_index_build_scan(), rather than
		 * getting it to do so for us.  This is required so that we can
		 * actually use the MVCC snapshot registered earlier in !readonly
		 * case.
		 *
		 * Note that table_index_build_scan() calls heap_endscan() for us.
		 */
		scan = table_beginscan_strat(state->heaprel,	/* relation */
									 snapshot,	/* snapshot */
									 0, /* number of keys */
									 NULL,	/* scan key */
									 true,	/* buffer access strategy OK */
									 true); /* syncscan OK? */

		/*
		 * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
		 * behaves in !readonly case.
		 *
		 * It's okay that we don't actually use the same lock strength for the
		 * heap relation as any other ii_Concurrent caller would in !readonly
		 * case.  We have no reason to care about a concurrent VACUUM
		 * operation, since there isn't going to be a second scan of the heap
		 * that needs to be sure that there was no concurrent recycling of
		 * TIDs.
		 */
		indexinfo->ii_Concurrent = !state->readonly;

		/*
		 * Don't wait for uncommitted tuple xact commit/abort when index is a
		 * unique index on a catalog (or an index used by an exclusion
		 * constraint).  This could otherwise happen in the readonly case.
		 */
		indexinfo->ii_Unique = false;
		indexinfo->ii_ExclusionOps = NULL;
		indexinfo->ii_ExclusionProcs = NULL;
		indexinfo->ii_ExclusionStrats = NULL;

		elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
			 RelationGetRelationName(state->rel),
			 RelationGetRelationName(state->heaprel));

		table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false,
							   bt_tuple_present_callback, (void *) state, scan);

		ereport(DEBUG1,
				(errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
								 state->heaptuplespresent, RelationGetRelationName(heaprel),
								 100.0 * bloom_prop_bits_set(state->filter))));

		if (snapshot != SnapshotAny)
			UnregisterSnapshot(snapshot);

		bloom_free(state->filter);
	}

	/* Be tidy: */
	MemoryContextDelete(state->targetcontext);
}

/*
 * Given a left-most block at some level, move right, verifying each page
 * individually (with more verification across pages for "readonly"
 * callers).  Caller should pass the true root page as the leftmost initially,
 * working their way down by passing what is returned for the last call here
 * until level 0 (leaf page level) is reached.
 *
 * Returns state for next call, if any.  This includes left-most block number
 * one level lower that should be passed on next level/call, which is set to
 * P_NONE on last call here (when leaf level is verified).  Level numbers
 * follow the nbtree convention: higher levels have higher numbers, because new
 * levels are added only due to a root page split.  Note that prior to the
 * first root page split, the root is also a leaf page, so there is always a
 * level 0 (leaf level), and it's always the last level processed.
 *
 * Note on memory management:  State's per-page context is reset here, between
 * each call to bt_target_page_check().
 */
static BtreeLevel
bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
{
	/* State to establish early, concerning entire level */
	BTPageOpaque opaque;
	MemoryContext oldcontext;
	BtreeLevel	nextleveldown;

	/* Variables for iterating across level using right links */
	BlockNumber leftcurrent = P_NONE;
	BlockNumber current = level.leftmost;

	/* Initialize return state */
	nextleveldown.leftmost = InvalidBlockNumber;
	nextleveldown.level = InvalidBtreeLevel;
	nextleveldown.istruerootlevel = false;

	/* Use page-level context for duration of this call */
	oldcontext = MemoryContextSwitchTo(state->targetcontext);

	elog(DEBUG2, "verifying level %u%s", level.level,
		 level.istruerootlevel ?
		 " (true root level)" : level.level == 0 ? " (leaf level)" : "");

	do
	{
		/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
		CHECK_FOR_INTERRUPTS();

		/* Initialize state for this iteration */
		state->targetblock = current;
		state->target = palloc_btree_page(state, state->targetblock);
		state->targetlsn = PageGetLSN(state->target);

		opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

		if (P_IGNORE(opaque))
		{
			/*
			 * Since there cannot be a concurrent VACUUM operation in readonly
			 * mode, and since a page has no links within other pages
			 * (siblings and parent) once it is marked fully deleted, it
			 * should be impossible to land on a fully deleted page in
			 * readonly mode.  See bt_downlink_check() for further details.
			 *
			 * The bt_downlink_check() P_ISDELETED() check is repeated here so
			 * that pages that are only reachable through sibling links get
			 * checked.
			 */
			if (state->readonly && P_ISDELETED(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("downlink or sibling link points to deleted block in index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Block=%u left block=%u left link from block=%u.",
											current, leftcurrent, opaque->btpo_prev)));

			if (P_RIGHTMOST(opaque))
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("block %u fell off the end of index \"%s\"",
								current, RelationGetRelationName(state->rel))));
			else
				ereport(DEBUG1,
						(errcode(ERRCODE_NO_DATA),
						 errmsg("block %u of index \"%s\" ignored",
								current, RelationGetRelationName(state->rel))));
			goto nextpage;
		}
		else if (nextleveldown.leftmost == InvalidBlockNumber)
		{
			/*
			 * A concurrent page split could make the caller supplied leftmost
			 * block no longer contain the leftmost page, or no longer be the
			 * true root, but where that isn't possible due to heavyweight
			 * locking, check that the first valid page meets caller's
			 * expectations.
			 */
			if (state->readonly)
			{
				if (!P_LEFTMOST(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not leftmost in index \"%s\"",
									current, RelationGetRelationName(state->rel))));

				if (level.istruerootlevel && !P_ISROOT(opaque))
					ereport(ERROR,
							(errcode(ERRCODE_INDEX_CORRUPTED),
							 errmsg("block %u is not true root in index \"%s\"",
									current, RelationGetRelationName(state->rel))));
			}

			/*
			 * Before beginning any non-trivial examination of level, prepare
			 * state for next bt_check_level_from_leftmost() invocation for
			 * the next level down (if any).
			 *
			 * There should be at least one non-ignorable page per level,
			 * unless this is the leaf level, which is assumed by caller to be
			 * final level.
			 */
			if (!P_ISLEAF(opaque))
			{
				IndexTuple	itup;
				ItemId		itemid;

				/* Internal page -- downlink gets leftmost on next level */
				itemid = PageGetItemIdCareful(state, state->targetblock,
											  state->target,
											  P_FIRSTDATAKEY(opaque));
				itup = (IndexTuple) PageGetItem(state->target, itemid);
				nextleveldown.leftmost = BTreeInnerTupleGetDownLink(itup);
				nextleveldown.level = opaque->btpo.level - 1;
			}
			else
			{
				/*
				 * Leaf page -- final level caller must process.
				 *
				 * Note that this could also be the root page, if there has
				 * been no root page split yet.
				 */
				nextleveldown.leftmost = P_NONE;
				nextleveldown.level = InvalidBtreeLevel;
			}

			/*
			 * Finished setting up state for this call/level.  Control will
			 * never end up back here in any future loop iteration for this
			 * level.
			 */
		}

		/*
		 * readonly mode can only ever land on live pages and half-dead pages,
		 * so sibling pointers should always be in mutual agreement
		 */
		if (state->readonly && opaque->btpo_prev != leftcurrent)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("left link/right link pair in index \"%s\" not in agreement",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block=%u left block=%u left link from block=%u.",
										current, leftcurrent, opaque->btpo_prev)));

		/* Check level, which must be valid for non-ignorable page */
		if (level.level != opaque->btpo.level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
										current, level.level, opaque->btpo.level)));

		/* Verify invariants for page */
		bt_target_page_check(state);

nextpage:

		/* Try to detect circular links */
		if (current == leftcurrent || current == opaque->btpo_prev)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("circular link chain found in block %u of index \"%s\"",
							current, RelationGetRelationName(state->rel))));

		/*
		 * Record if page that is about to become target is the right half of
		 * an incomplete page split.  This can go stale immediately in
		 * !readonly case.
		 */
		state->rightsplit = P_INCOMPLETE_SPLIT(opaque);

		leftcurrent = current;
		current = opaque->btpo_next;

		/* Free page and associated memory for this iteration */
		MemoryContextReset(state->targetcontext);
	}
	while (current != P_NONE);

	/* Don't change context for caller */
	MemoryContextSwitchTo(oldcontext);

	return nextleveldown;
}
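
/*
 * Illustrative call pattern for the function above (a condensed sketch of
 * what bt_check_every_level() already does, shown here for orientation):
 *
 *		current.level = metad->btm_level;
 *		current.leftmost = metad->btm_root;
 *		current.istruerootlevel = true;
 *		while (current.leftmost != P_NONE)
 *			current = bt_check_level_from_leftmost(state, current);
 */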

/*
 * Function performs the following checks on target page, or pages ancillary to
 * target page:
 *
 * - That every "real" data item is less than or equal to the high key, which
 *	 is an upper bound on the items on the page.  Data items should be
 *	 strictly less than the high key when the page is an internal page.
 *
 * - That within the page, every data item is strictly less than the item
 *	 immediately to its right, if any (i.e., that the items are in order
 *	 within the page, so that the binary searches performed by index scans are
 *	 sane).
 *
 * - That the last data item stored on the page is strictly less than the
 *	 first data item on the page to the right (when such a first item is
 *	 available).
 *
 * - Various checks on the structure of tuples themselves.  For example, check
 *	 that non-pivot tuples have no truncated attributes.
 *
 * Furthermore, when state passed shows ShareLock held, function also checks:
 *
 * - That all child pages respect strict lower bound from parent's pivot
 *	 tuple.
 *
 * - That downlink to block was encountered in parent where that's expected.
 *	 (Limited to heapallindexed readonly callers.)
 *
 * This is also where heapallindexed callers use their Bloom filter to
 * fingerprint IndexTuples for later table_index_build_scan() verification.
 *
 * Note:  Memory allocated in this routine is expected to be released by caller
 * resetting state->targetcontext.
 */
static void
bt_target_page_check(BtreeCheckState *state)
{
	OffsetNumber offset;
	OffsetNumber max;
	BTPageOpaque topaque;

	topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
	max = PageGetMaxOffsetNumber(state->target);

	elog(DEBUG2, "verifying %u items on %s block %u", max,
		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);

	/*
	 * Check the number of attributes in high key.  Note, rightmost page
	 * doesn't contain a high key, so nothing to check
	 */
	if (!P_RIGHTMOST(topaque))
	{
		ItemId		itemid;
		IndexTuple	itup;

		/* Verify line pointer before checking tuple */
		itemid = PageGetItemIdCareful(state, state->targetblock,
									  state->target, P_HIKEY);
		if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
							 P_HIKEY))
		{
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("wrong number of high key index tuple attributes in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.",
										state->targetblock,
										BTreeTupleGetNAtts(itup, state->rel),
										P_ISLEAF(topaque) ? "heap" : "index",
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}
	}

	/*
	 * Loop over page items, starting from first non-highkey item, not high
	 * key (if any).  Most tests are not performed for the "negative infinity"
	 * real item (if any).
	 */
	for (offset = P_FIRSTDATAKEY(topaque);
		 offset <= max;
		 offset = OffsetNumberNext(offset))
	{
		ItemId		itemid;
		IndexTuple	itup;
		size_t		tupsize;
		BTScanInsert skey;
		bool		lowersizelimit;

		CHECK_FOR_INTERRUPTS();

		itemid = PageGetItemIdCareful(state, state->targetblock,
									  state->target, offset);
		itup = (IndexTuple) PageGetItem(state->target, itemid);
		tupsize = IndexTupleSize(itup);

		/*
		 * lp_len should match the IndexTuple reported length exactly, since
		 * lp_len is completely redundant in indexes, and both sources of
		 * tuple length are MAXALIGN()'d.  nbtree does not use lp_len all that
		 * frequently, and is surprisingly tolerant of corrupt lp_len fields.
		 */
		if (tupsize != ItemIdGetLength(itemid))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index tuple size does not equal lp_len in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.",
										state->targetblock, offset,
										tupsize, ItemIdGetLength(itemid),
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn),
					 errhint("This could be a torn page problem.")));

		/* Check the number of index tuple attributes */
		if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
							 offset))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("wrong number of index tuple attributes in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
										itid,
										BTreeTupleGetNAtts(itup, state->rel),
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/* Fingerprint downlink blocks in heapallindexed + readonly case */
		if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
		{
			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);

			bloom_add_element(state->downlinkfilter,
							  (unsigned char *) &childblock,
							  sizeof(BlockNumber));
		}

		/*
		 * Don't try to generate scankey using "negative infinity" item on
		 * internal pages.  They are always truncated to zero attributes.
		 */
		if (offset_is_negative_infinity(topaque, offset))
			continue;

		/*
		 * Readonly callers may optionally verify that non-pivot tuples can
		 * each be found by an independent search that starts from the root
		 */
		if (state->rootdescend && P_ISLEAF(topaque) &&
			!bt_rootdescend(state, itup))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("could not find tuple using search from root page in index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
										itid, htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/* Build insertion scankey for current page offset */
		skey = bt_mkscankey_pivotsearch(state->rel, itup);

		/*
		 * Make sure tuple size does not exceed the relevant BTREE_VERSION
		 * specific limit.
		 *
		 * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned
		 * a small amount of space from BTMaxItemSize() in order to ensure
		 * that suffix truncation always has enough space to add an explicit
		 * heap TID back to a tuple -- we pessimistically assume that every
		 * newly inserted tuple will eventually need to have a heap TID
		 * appended during a future leaf page split, when the tuple becomes
		 * the basis of the new high key (pivot tuple) for the leaf page.
		 *
		 * Since the reclaimed space is reserved for that purpose, we must not
		 * enforce the slightly lower limit when the extra space has been used
		 * as intended.  In other words, there is only a cross-version
		 * difference in the limit on tuple size within leaf pages.
		 *
		 * Still, we're particular about the details within BTREE_VERSION 4
		 * internal pages.  Pivot tuples may only use the extra space for its
		 * designated purpose.  Enforce the lower limit for pivot tuples when
		 * an explicit heap TID isn't actually present.  (In all other cases
		 * suffix truncation is guaranteed to generate a pivot tuple that's no
		 * larger than the first right tuple provided to it by its caller.)
		 */
		lowersizelimit = skey->heapkeyspace &&
			(P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL);
		if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
					   BTMaxItemSizeNoHeapTid(state->target)))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index row size %zu exceeds maximum for index \"%s\"",
							tupsize, RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/* Fingerprint leaf page tuples (those that point to the heap) */
		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
		{
			IndexTuple	norm;

			norm = bt_normalize_tuple(state, itup);
			bloom_add_element(state->filter, (unsigned char *) norm,
							  IndexTupleSize(norm));
			/* Be tidy */
			if (norm != itup)
				pfree(norm);
		}

		/*
		 * * High key check *
		 *
		 * If there is a high key (if this is not the rightmost page on its
		 * entire level), check that high key actually is upper bound on all
		 * page items.
		 *
		 * We prefer to check all items against high key rather than checking
		 * just the last and trusting that the operator class obeys the
		 * transitive law (which implies that all previous items also
		 * respected the high key invariant if they pass the item order
		 * check).
		 *
		 * Ideally, we'd compare every item in the index against every other
		 * item in the index, and not trust opclass obedience of the
		 * transitive law to bridge the gap between children and their
		 * grandparents (as well as great-grandparents, and so on).  We don't
		 * go to those lengths because that would be prohibitively expensive,
		 * and probably not markedly more effective in practice.
		 *
		 * On the leaf level, we check that the key is <= the highkey.
		 * However, on non-leaf levels we check that the key is < the highkey,
		 * because the high key is "just another separator" rather than a copy
		 * of some existing key item; we expect it to be unique among all keys
		 * on the same level.  (Suffix truncation will sometimes produce a
		 * leaf highkey that is an untruncated copy of the lastleft item, but
		 * never any other item, which necessitates weakening the leaf level
		 * check to <=.)
		 *
		 * Full explanation for why a highkey is never truly a copy of another
		 * item from the same level on internal levels:
		 *
		 * While the new left page's high key is copied from the first offset
		 * on the right page during an internal page split, that's not the
		 * full story.  In effect, internal pages are split in the middle of
		 * the firstright tuple, not between the would-be lastleft and
		 * firstright tuples: the firstright key ends up on the left side as
		 * left's new highkey, and the firstright downlink ends up on the
		 * right side as right's new "negative infinity" item.  The negative
		 * infinity tuple is truncated to zero attributes, so we're only left
		 * with the downlink.  In other words, the copying is just an
		 * implementation detail of splitting in the middle of a (pivot)
		 * tuple.  (See also: "Notes About Data Representation" in the nbtree
		 * README.)
		 */
		if (!P_RIGHTMOST(topaque) &&
			!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
			  invariant_l_offset(state, skey, P_HIKEY)))
		{
			char	   *itid,
					   *htid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("high key invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Item order check *
		 *
		 * Check that items are stored on page in logical order, by checking
		 * current item is strictly less than next item (if any).
		 */
		if (OffsetNumberNext(offset) <= max &&
			!invariant_l_offset(state, skey, OffsetNumberNext(offset)))
		{
			char	   *itid,
					   *htid,
					   *nitid,
					   *nhtid;

			itid = psprintf("(%u,%u)", state->targetblock, offset);
			htid = psprintf("(%u,%u)",
							ItemPointerGetBlockNumber(&(itup->t_tid)),
							ItemPointerGetOffsetNumber(&(itup->t_tid)));
			nitid = psprintf("(%u,%u)", state->targetblock,
							 OffsetNumberNext(offset));

			/* Reuse itup to get pointed-to heap location of second item */
			itemid = PageGetItemIdCareful(state, state->targetblock,
										  state->target,
										  OffsetNumberNext(offset));
			itup = (IndexTuple) PageGetItem(state->target, itemid);
			nhtid = psprintf("(%u,%u)",
							 ItemPointerGetBlockNumber(&(itup->t_tid)),
							 ItemPointerGetOffsetNumber(&(itup->t_tid)));

			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("item order invariant violated for index \"%s\"",
							RelationGetRelationName(state->rel)),
					 errdetail_internal("Lower index tid=%s (points to %s tid=%s) "
										"higher index tid=%s (points to %s tid=%s) "
										"page lsn=%X/%X.",
										itid,
										P_ISLEAF(topaque) ? "heap" : "index",
										htid,
										nitid,
										P_ISLEAF(topaque) ? "heap" : "index",
										nhtid,
										(uint32) (state->targetlsn >> 32),
										(uint32) state->targetlsn)));
		}

		/*
		 * * Last item check *
		 *
		 * Check last item against next/right page's first data item when the
		 * last item on page is reached.  This additional check will detect
		 * transposed pages iff the supposed right sibling page happens to
		 * belong before target in the key space.  (Otherwise, a subsequent
		 * heap verification will probably detect the problem.)
		 *
		 * This check is similar to the item order check that will have
		 * already been performed for every other "real" item on target page
		 * when last item is checked.  The difference is that the next item
		 * (the item that is compared to target's last item) needs to come
		 * from the next/sibling page.  There may not be such an item
		 * available from sibling for various reasons, though (e.g., target is
		 * the rightmost page on level).
		 */
		else if (offset == max)
		{
			BTScanInsert rightkey;

			/* Get item in next/right page */
			rightkey = bt_right_page_check_scankey(state);

			if (rightkey &&
				!invariant_g_offset(state, rightkey, max))
			{
				/*
				 * As explained at length in bt_right_page_check_scankey(),
				 * there is a known !readonly race that could account for
				 * apparent violation of invariant, which we must check for
				 * before actually proceeding with raising error.  Our canary
				 * condition is that target page was deleted.
				 */
				if (!state->readonly)
				{
					/* Get fresh copy of target page */
					state->target = palloc_btree_page(state, state->targetblock);
					/* Note that we deliberately do not update target LSN */
					topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

					/*
					 * All !readonly checks now performed; just return
					 */
					if (P_IGNORE(topaque))
						return;
				}

				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("cross page item order invariant violated for index \"%s\"",
								RelationGetRelationName(state->rel)),
						 errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.",
											state->targetblock, offset,
											(uint32) (state->targetlsn >> 32),
											(uint32) state->targetlsn)));
			}
		}

		/*
		 * * Downlink check *
		 *
		 * Additional check of child items iff this is an internal page and
		 * caller holds a ShareLock.  This happens for every downlink (item)
		 * in target excluding the negative-infinity downlink (again, this is
		 * because it has no useful value to compare).
		 */
		if (!P_ISLEAF(topaque) && state->readonly)
		{
			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);

			bt_downlink_check(state, skey, childblock);
		}
	}

	/*
	 * * Check if page has a downlink in parent *
	 *
	 * This can only be checked in heapallindexed + readonly case.
	 */
	if (state->heapallindexed && state->readonly)
		bt_downlink_missing_check(state);
}
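
/*
 * Worked example of the page-level invariants checked above (illustrative
 * numbers): on a leaf page holding items 10, 20, 40 with high key 50, the
 * checks confirm 10 < 20 < 40 (item order), each item <= 50 (high key), and
 * 40 less than the first data item of the right sibling (cross-page order).
 * On an internal page the high key comparison is strict (< 50), since a high
 * key is never a copy of another item on the same internal level.
 */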

/*
 * Return a scankey for an item on page to right of current target (or the
 * first non-ignorable page), sufficient to check ordering invariant on last
 * item in current target page.  Returned scankey relies on local memory
 * allocated for the child page, which caller cannot pfree().  Caller's memory
 * context should be reset between calls here.
 *
 * This is the first data item, and so all adjacent items are checked against
 * their immediate sibling item (which may be on a sibling page, or even a
 * "cousin" page at parent boundaries where target's rightlink points to page
 * with different parent page).  If no such valid item is available, return
 * NULL instead.
 *
 * Note that !readonly callers must reverify that target page has not
 * been concurrently deleted.
 */
static BTScanInsert
bt_right_page_check_scankey(BtreeCheckState *state)
{
	BTPageOpaque opaque;
	ItemId		rightitem;
	IndexTuple	firstitup;
	BlockNumber targetnext;
	Page		rightpage;
	OffsetNumber nline;

	/* Determine target's next block number */
	opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);

	/* If target is already rightmost, no right sibling; nothing to do here */
	if (P_RIGHTMOST(opaque))
		return NULL;

	/*
	 * General notes on concurrent page splits and page deletion:
	 *
	 * Routines like _bt_search() don't require *any* page split interlock
	 * when descending the tree, including something very light like a buffer
	 * pin.  That's why it's okay that we don't either.  This avoidance of any
	 * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao
	 * algorithm, in fact.
	 *
	 * That leaves deletion.  A deleted page won't actually be recycled by
	 * VACUUM early enough for us to fail to at least follow its right link
	 * (or left link, or downlink) and find its sibling, because recycling
	 * does not occur until no possible index scan could land on the page.
	 * Index scans can follow links with nothing more than their snapshot as
	 * an interlock and be sure of at least that much.  (See page
	 * recycling/RecentGlobalXmin notes in nbtree README.)
	 *
	 * Furthermore, it's okay if we follow a rightlink and find a half-dead or
	 * dead (ignorable) page one or more times.  There will either be a
	 * further right link to follow that leads to a live page before too long
	 * (before passing by parent's rightmost child), or we will find the end
	 * of the entire level instead (possible when parent page is itself the
	 * rightmost on its level).
	 */
	targetnext = opaque->btpo_next;
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		rightpage = palloc_btree_page(state, targetnext);
		opaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

		if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
			break;

		/* We landed on a deleted page, so step right to find a live page */
		targetnext = opaque->btpo_next;
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("level %u leftmost page of index \"%s\" was found deleted or half dead",
						opaque->btpo.level, RelationGetRelationName(state->rel)),
				 errdetail_internal("Deleted page found when building scankey from right sibling.")));

		/* Be slightly more pro-active in freeing this memory, just in case */
		pfree(rightpage);
	}

	/*
	 * No ShareLock held case -- why it's safe to proceed.
	 *
	 * Problem:
	 *
	 * We must avoid false positive reports of corruption when caller treats
	 * item returned here as an upper bound on target's last item.  In
	 * general, false positives are disallowed.  Avoiding them here when
	 * caller is !readonly is subtle.
	 *
	 * A concurrent page deletion by VACUUM of the target page can result in
	 * the insertion of items on to this right sibling page that would
	 * previously have been inserted on our target page.  There might have
	 * been insertions that followed the target's downlink after it was made
	 * to point to right sibling instead of target by page deletion's first
	 * phase.  The inserters insert items that would belong on target page.
	 * This race is very tight, but it's possible.  This is our only problem.
	 *
	 * Non-problems:
	 *
	 * We are not hindered by a concurrent page split of the target; we'll
	 * never land on the second half of the page anyway.  A concurrent split
	 * of the right page will also not matter, because the first data item
	 * remains the same within the left half, which we'll reliably land on.  If
	 * we had to skip over ignorable/deleted pages, it cannot matter because
	 * their key space has already been atomically merged with the first
	 * non-ignorable page we eventually find (doesn't matter whether the page
	 * we eventually find is a true sibling or a cousin of target, which we go
	 * into below).
	 *
	 * Solution:
	 *
	 * Caller knows that it should reverify that target is not ignorable
	 * (half-dead or deleted) when cross-page sibling item comparison appears
	 * to indicate corruption (invariant fails).  This detects the single race
	 * condition that exists for caller.  This is correct because the
	 * continued existence of target block as non-ignorable (not half-dead or
	 * deleted) implies that target page was not merged into from the right by
	 * deletion; the key space at or after target never moved left.  Target's
	 * parent either has the same downlink to target as before, or a <
	 * downlink due to deletion at the left of target.  Target either has the
	 * same highkey as before, or a highkey < before when there is a page
	 * split.  (The rightmost concurrently-split-from-target-page page will
	 * still have the same highkey as target was originally found to have,
	 * which for our purposes is equivalent to target's highkey itself never
	 * changing, since we reliably skip over
	 * concurrently-split-from-target-page pages.)
	 *
	 * In simpler terms, we allow that the key space of the target may expand
	 * left (the key space can move left on the left side of target only), but
	 * the target key space cannot expand right and get ahead of us without
	 * our detecting it.  The key space of the target cannot shrink, unless it
	 * shrinks to zero due to the deletion of the original page, our canary
	 * condition.  (To be very precise, we're a bit stricter than that because
	 * it might just have been that the target page split and only the
	 * original target page was deleted.  We can be more strict, just not more
	 * lax.)
	 *
	 * Top level tree walk caller moves on to next page (makes it the new
	 * target) following recovery from this race.  (cf. The rationale for
	 * child/downlink verification needing a ShareLock within
	 * bt_downlink_check(), where page deletion is also the main source of
	 * trouble.)
	 *
	 * Note that it doesn't matter if right sibling page here is actually a
	 * cousin page, because in order for the key space to be readjusted in a
	 * way that causes us issues in next level up (guiding problematic
	 * concurrent insertions to the cousin from the grandparent rather than to
	 * the sibling from the parent), there'd have to be page deletion of
	 * target's parent page (affecting target's parent's downlink in target's
	 * grandparent page).  Internal page deletion only occurs when there are
	 * no child pages (they were all fully deleted), and caller is checking
	 * that the target's parent has at least one non-deleted (so
	 * non-ignorable) child: the target page.  (Note that the first phase of
	 * deletion atomically marks the page to be deleted half-dead/ignorable at
	 * the same time downlink in its parent is removed, so caller will
	 * definitely not fail to detect that this happened.)
	 *
	 * This trick is inspired by the method backward scans use for dealing
	 * with concurrent page splits; concurrent page deletion is a problem that
	 * similarly receives special consideration sometimes (it's possible that
	 * the backwards scan will re-read its "original" block after failing to
	 * find a right-link to it, having already moved in the opposite direction
	 * (right/"forwards") a few times to try to locate one).  Just like us,
	 * that happens only to determine if there was a concurrent page deletion
	 * of a reference page, and just like us if there was a page deletion of
	 * that reference page it means we can move on from caring about the
	 * reference page.  See the nbtree README for a full description of how
	 * that works.
	 */
	nline = PageGetMaxOffsetNumber(rightpage);

	/*
	 * Get first data item, if any
	 */
	if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
	{
		/* Return first data item (if any) */
		rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
										 P_FIRSTDATAKEY(opaque));
	}
	else if (!P_ISLEAF(opaque) &&
			 nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
	{
		/*
		 * Return first item after the internal page's "negative infinity"
		 * item
		 */
		rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
										 OffsetNumberNext(P_FIRSTDATAKEY(opaque)));
	}
	else
	{
		/*
		 * No first item.  Page is probably empty leaf page, but it's also
		 * possible that it's an internal page with only a negative infinity
		 * item.
		 */
		ereport(DEBUG1,
				(errcode(ERRCODE_NO_DATA),
				 errmsg("%s block %u of index \"%s\" has no first data item",
						P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
						RelationGetRelationName(state->rel))));
		return NULL;
	}

	/*
	 * Return first real item scankey.  Note that this relies on right page
	 * memory remaining allocated.
	 */
	firstitup = (IndexTuple) PageGetItem(rightpage, rightitem);
	return bt_mkscankey_pivotsearch(state->rel, firstitup);
}

/*
 * Checks one of target's downlink against its child page.
 *
 * Conceptually, the target page continues to be what is checked here.  The
 * target block is still blamed in the event of finding an invariant violation.
 * The downlink insertion into the target is probably where any problem raised
 * here arises, and there is no such thing as a parent link, so doing the
 * verification this way around is much more practical.
 */
static void
bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
				  BlockNumber childblock)
1470 {
1471  OffsetNumber offset;
1472  OffsetNumber maxoffset;
1473  Page child;
1474  BTPageOpaque copaque;
1475 
1476  /*
1477  * Caller must have ShareLock on target relation, because of
1478  * considerations around page deletion by VACUUM.
1479  *
1480  * NB: In general, page deletion deletes the right sibling's downlink, not
1481  * the downlink of the page being deleted; the deleted page's downlink is
1482  * reused for its sibling. The key space is thereby consolidated between
1483  * the deleted page and its right sibling. (We cannot delete a parent
1484  * page's rightmost child unless it is the last child page, and we intend
1485  * to also delete the parent itself.)
1486  *
1487  * If this verification happened without a ShareLock, the following race
1488  * condition could cause false positives:
1489  *
1490  * In general, concurrent page deletion might occur, including deletion of
1491  * the left sibling of the child page that is examined here. If such a
1492  * page deletion were to occur, closely followed by an insertion into the
1493  * newly expanded key space of the child, a window for the false positive
1494  * opens up: the stale parent/target downlink originally followed to get
1495  * to the child legitimately ceases to be a lower bound on all items in
1496  * the page, since the key space was concurrently expanded "left".
1497  * (Insertion followed the "new" downlink for the child, not our now-stale
1498  * downlink, which was concurrently physically removed in target/parent as
1499  * part of deletion's first phase.)
1500  *
1501  * Note that while the cross-page-same-level last item check uses a trick
1502  * that allows it to perform verification for !readonly callers, a similar
1503  * trick seems difficult here. The trick that that other check uses is,
1504  * in essence, to lock down race conditions to those that occur due to
1505  * concurrent page deletion of the target; that's a race that can be
1506  * reliably detected before actually reporting corruption.
1507  *
1508  * On the other hand, we'd need to lock down race conditions involving
1509  * deletion of child's left page, for long enough to read the child page
1510  * into memory (in other words, a scheme with concurrently held buffer
1511  * locks on both child and left-of-child pages). That's unacceptable for
1512  * amcheck functions on general principle, though.
1513  */
1514  Assert(state->readonly);
1515 
1516  /*
1517  * Verify child page has the downlink key from target page (its parent) as
1518  * a lower bound; downlink must be strictly less than all keys on the
1519  * page.
1520  *
1521  * Check all items, rather than checking just the first and trusting that
1522  * the operator class obeys the transitive law.
1523  */
1524  child = palloc_btree_page(state, childblock);
1525  copaque = (BTPageOpaque) PageGetSpecialPointer(child);
1526  maxoffset = PageGetMaxOffsetNumber(child);
1527 
1528  /*
1529  * Since there cannot be a concurrent VACUUM operation in readonly mode,
1530  * and since a page has no links within other pages (siblings and parent)
1531  * once it is marked fully deleted, it should be impossible to land on a
1532  * fully deleted page.
1533  *
1534  * It does not quite make sense to enforce that the page cannot even be
1535  * half-dead, despite the fact that the downlink is modified at the same stage
1536  * that the child leaf page is marked half-dead. That's incorrect because
1537  * there may occasionally be multiple downlinks from a chain of pages
1538  * undergoing deletion, where multiple successive calls are made to
1539  * _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark
1540  * the leaf page as fully dead. While _bt_mark_page_halfdead() usually
1541  * removes the downlink to the leaf page that is marked half-dead, that's
1542  * not guaranteed, so it's possible we'll land on a half-dead page with a
1543  * downlink due to an interrupted multi-level page deletion.
1544  *
1545  * We go ahead with our checks if the child page is half-dead. It's safe
1546  * to do so because we do not test the child's high key, so it does not
1547  * matter that the original high key will have been replaced by a dummy
1548  * truncated high key within _bt_mark_page_halfdead(). All other page
1549  * items are left intact on a half-dead page, so there is still something
1550  * to test.
1551  */
1552  if (P_ISDELETED(copaque))
1553  ereport(ERROR,
1554  (errcode(ERRCODE_INDEX_CORRUPTED),
1555  errmsg("downlink to deleted page found in index \"%s\"",
1556  RelationGetRelationName(state->rel)),
1557  errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.",
1558  state->targetblock, childblock,
1559  (uint32) (state->targetlsn >> 32),
1560  (uint32) state->targetlsn)));
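 /*
  * Note on the "%X/%X" convention used in the error details here and below:
  * the 64-bit LSN is printed as its high and low 32-bit halves, so, for
  * example, LSN 0x10000002C is reported as "1/2C".
  */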
1561 
1562  for (offset = P_FIRSTDATAKEY(copaque);
1563  offset <= maxoffset;
1564  offset = OffsetNumberNext(offset))
1565  {
1566  /*
1567  * Skip comparison of target page key against "negative infinity"
1568  * item, if any. Checking it would indicate that it's not a strict
1569  * lower bound, but that's only because of the hard-coding for
1570  * negative infinity items within _bt_compare().
1571  *
1572  * If nbtree didn't truncate negative infinity tuples during internal
1573  * page splits then we'd expect child's negative infinity key to be
1574  * equal to the scankey/downlink from target/parent (it would be a
1575  * "low key" in this hypothetical scenario, and so it would still need
1576  * to be treated as a special case here).
1577  *
1578  * Negative infinity items can be thought of as a strict lower bound
1579  * that works transitively, with the last non-negative-infinity pivot
1580  * followed during a descent from the root as its "true" strict lower
1581  * bound. Only a small number of negative infinity items are truly
1582  * negative infinity; those that are the first items of leftmost
1583  * internal pages. In more general terms, a negative infinity item is
1584  * only negative infinity with respect to the subtree that the page is
1585  * at the root of.
1586  *
1587  * See also: bt_rootdescend(), which can even detect transitive
1588  * inconsistencies on cousin leaf pages.
1589  */
1590  if (offset_is_negative_infinity(copaque, offset))
1591  continue;
1592 
1593  if (!invariant_l_nontarget_offset(state, targetkey, childblock, child,
1594  offset))
1595  ereport(ERROR,
1596  (errcode(ERRCODE_INDEX_CORRUPTED),
1597  errmsg("down-link lower bound invariant violated for index \"%s\"",
1598  RelationGetRelationName(state->rel)),
1599  errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.",
1600  state->targetblock, childblock, offset,
1601  (uint32) (state->targetlsn >> 32),
1602  (uint32) state->targetlsn)));
1603  }
1604 
1605  pfree(child);
1606 }
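/*
 * Illustrative sketch of why the loop in bt_downlink_check() compares the
 * downlink key against every child item, rather than checking only the first
 * item and trusting the transitive law.  This is a hypothetical standalone
 * toy, not amcheck code: toy_cmp() deliberately violates transitivity, the
 * way a buggy operator class might (it claims 10 < 20 and 20 < 5, so
 * "transitively" 10 < 5, yet it also answers 10 > 5).  Checking only
 * items[0] would "prove" that the downlink is a lower bound; checking every
 * item catches the violation.
 */
#if 0
#include <stdio.h>

/* Broken comparator: inconsistent answers for the pair (20, 5) */
static int
toy_cmp(int a, int b)
{
	if (a == 20 && b == 5)
		return -1;
	if (a == 5 && b == 20)
		return 1;
	return (a > b) - (a < b);
}

int
main(void)
{
	int			downlink = 10;
	int			items[] = {20, 5};	/* simulated child page items */

	for (int i = 0; i < 2; i++)
	{
		/* downlink must be a strict lower bound on every item */
		if (toy_cmp(downlink, items[i]) >= 0)
			printf("lower bound invariant violated at item %d\n", i);
	}
	return 0;
}
#endif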
1607 
1608 /*
1609  * Checks if page is missing a downlink that it should have.
1610  *
1611  * A page that lacks a downlink/parent may indicate corruption. However, we
1612  * must account for the fact that a missing downlink can occasionally be
1613  * encountered in a non-corrupt index. This can be due to an interrupted page
1614  * split, or an interrupted multi-level page deletion (i.e. there was a hard
1615  * crash or an error during a page split, or while VACUUM was deleting a
1616  * multi-level chain of pages).
1617  *
1618  * Note that this can only be called in readonly mode, so there is no need to
1619  * be concerned about concurrent page splits or page deletions.
1620  */
1621 static void
1622 bt_downlink_missing_check(BtreeCheckState *state)
1623 {
1624  BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
1625  ItemId itemid;
1626  IndexTuple itup;
1627  Page child;
1628  BTPageOpaque copaque;
1629  uint32 level;
1630  BlockNumber childblk;
1631 
1632  Assert(state->heapallindexed && state->readonly);
1633  Assert(!P_IGNORE(topaque));
1634 
1635  /* No next level up with downlinks to fingerprint from the true root */
1636  if (P_ISROOT(topaque))
1637  return;
1638 
1639  /*
1640  * Incomplete (interrupted) page splits can account for the lack of a
1641  * downlink. Some inserting transaction should eventually complete the
1642  * page split in passing, when it notices that the left sibling page is
1643  * P_INCOMPLETE_SPLIT().
1644  *
1645  * In general, VACUUM is not prepared for there to be no downlink to a
1646  * page that it deletes. This is the main reason why the lack of a
1647  * downlink can be reported as corruption here. It's not obvious that an
1648  * invalid missing downlink can result in wrong answers to queries,
1649  * though, since index scans that land on the child may end up
1650  * consistently moving right. The handling of concurrent page splits (and
1651  * page deletions) within _bt_moveright() cannot distinguish
1652  * inconsistencies that last for a moment from inconsistencies that are
1653  * permanent and irrecoverable.
1654  *
1655  * VACUUM isn't even prepared to delete pages that have no downlink due to
1656  * an incomplete page split, but it can detect and reason about that case
1657  * by design, so it shouldn't be taken to indicate corruption. See
1658  * _bt_pagedel() for full details.
1659  */
1660  if (state->rightsplit)
1661  {
1662  ereport(DEBUG1,
1663  (errcode(ERRCODE_NO_DATA),
1664  errmsg("harmless interrupted page split detected in index %s",
1665  RelationGetRelationName(state->rel)),
1666  errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
1667  state->targetblock, topaque->btpo.level,
1668  topaque->btpo_prev,
1669  (uint32) (state->targetlsn >> 32),
1670  (uint32) state->targetlsn)));
1671  return;
1672  }
1673 
1674  /* Target's downlink is typically present in parent/fingerprinted */
1675  if (!bloom_lacks_element(state->downlinkfilter,
1676  (unsigned char *) &state->targetblock,
1677  sizeof(BlockNumber)))
1678  return;
1679 
1680  /*
1681  * Target is probably the "top parent" of a multi-level page deletion.
1682  * We'll need to descend the subtree to make sure that descendant pages
1683  * are consistent with that, though.
1684  *
1685  * If the target page (which must be non-ignorable) is a leaf page, then
1686  * clearly it can't be the top parent. The lack of a downlink is probably
1687  * a symptom of a broad problem that could just as easily cause
1688  * inconsistencies anywhere else.
1689  */
1690  if (P_ISLEAF(topaque))
1691  ereport(ERROR,
1692  (errcode(ERRCODE_INDEX_CORRUPTED),
1693  errmsg("leaf index block lacks downlink in index \"%s\"",
1694  RelationGetRelationName(state->rel)),
1695  errdetail_internal("Block=%u page lsn=%X/%X.",
1696  state->targetblock,
1697  (uint32) (state->targetlsn >> 32),
1698  (uint32) state->targetlsn)));
1699 
1700  /* Descend from the target page, which is an internal page */
1701  elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
1702  RelationGetRelationName(state->rel));
1703 
1704  level = topaque->btpo.level;
1705  itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
1706  P_FIRSTDATAKEY(topaque));
1707  itup = (IndexTuple) PageGetItem(state->target, itemid);
1708  childblk = BTreeInnerTupleGetDownLink(itup);
1709  for (;;)
1710  {
1711  CHECK_FOR_INTERRUPTS();
1712 
1713  child = palloc_btree_page(state, childblk);
1714  copaque = (BTPageOpaque) PageGetSpecialPointer(child);
1715 
1716  if (P_ISLEAF(copaque))
1717  break;
1718 
1719  /* Do an extra sanity check in passing on internal pages */
1720  if (copaque->btpo.level != level - 1)
1721  ereport(ERROR,
1722  (errcode(ERRCODE_INDEX_CORRUPTED),
1723  errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
1724  RelationGetRelationName(state->rel)),
1725  errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
1726  state->targetblock, childblk,
1727  level - 1, copaque->btpo.level)));
1728 
1729  level = copaque->btpo.level;
1730  itemid = PageGetItemIdCareful(state, childblk, child,
1731  P_FIRSTDATAKEY(copaque));
1732  itup = (IndexTuple) PageGetItem(child, itemid);
1733  childblk = BTreeInnerTupleGetDownLink(itup);
1734  /* Be slightly more pro-active in freeing this memory, just in case */
1735  pfree(child);
1736  }
1737 
1738  /*
1739  * Since there cannot be a concurrent VACUUM operation in readonly mode,
1740  * and since a page has no links within other pages (siblings and parent)
1741  * once it is marked fully deleted, it should be impossible to land on a
1742  * fully deleted page. See bt_downlink_check() for further details.
1743  *
1744  * The bt_downlink_check() P_ISDELETED() check is repeated here because
1745  * bt_downlink_check() does not visit pages reachable through negative
1746  * infinity items. Besides, bt_downlink_check() is unwilling to descend
1747  * multiple levels. (The similar bt_downlink_check() P_ISDELETED() check
1748  * within bt_check_level_from_leftmost() won't reach the page either,
1749  * since the leaf's live siblings should have their sibling links updated
1750  * to bypass the deletion target page when it is marked fully dead.)
1751  *
1752  * If this error is raised, it might be due to a previous multi-level page
1753  * deletion that failed to realize that it wasn't yet safe to mark the
1754  * leaf page as fully dead. A "dangling downlink" will still remain when
1755  * this happens. The fact that the dangling downlink's page (the leaf's
1756  * parent/ancestor page) lacked a downlink is incidental.
1757  */
1758  if (P_ISDELETED(copaque))
1759  ereport(ERROR,
1760  (errcode(ERRCODE_INDEX_CORRUPTED),
1761  errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
1762  RelationGetRelationName(state->rel)),
1763  errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
1764  state->targetblock, childblk,
1765  (uint32) (state->targetlsn >> 32),
1766  (uint32) state->targetlsn)));
1767 
1768  /*
1769  * Iff leaf page is half-dead, its high key top parent link should point
1770  * to what VACUUM considered to be the top parent page at the instant it
1771  * was interrupted. Provided the high key link actually points to the
1772  * target page, the missing downlink we detected is consistent with there
1773  * having been an interrupted multi-level page deletion. This means that
1774  * the subtree with the target page at its root (a page deletion chain) is
1775  * in a consistent state, enabling VACUUM to resume deleting the entire
1776  * chain the next time it encounters the half-dead leaf page.
1777  */
1778  if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
1779  {
1780  itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
1781  itup = (IndexTuple) PageGetItem(child, itemid);
1782  if (BTreeTupleGetTopParent(itup) == state->targetblock)
1783  return;
1784  }
1785 
1786  ereport(ERROR,
1787  (errcode(ERRCODE_INDEX_CORRUPTED),
1788  errmsg("internal index block lacks downlink in index \"%s\"",
1789  RelationGetRelationName(state->rel)),
1790  errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
1791  state->targetblock, topaque->btpo.level,
1792  (uint32) (state->targetlsn >> 32),
1793  (uint32) state->targetlsn)));
1794 }
1795 
1796 /*
1797  * Per-tuple callback from table_index_build_scan, used to determine if index has
1798  * all the entries that definitely should have been observed in leaf pages of
1799  * the target index (that is, all IndexTuples that were fingerprinted by our
1800  * Bloom filter). All heapallindexed checks occur here.
1801  *
1802  * The redundancy between an index and the table it indexes provides a good
1803  * opportunity to detect corruption, especially corruption within the table.
1804  * The high level principle behind the verification performed here is that any
1805  * IndexTuple that should be in an index following a fresh CREATE INDEX (based
1806  * on the same index definition) should also have been in the original,
1807  * existing index, which should have used exactly the same representation.
1808  *
1809  * Since the overall structure of the index has already been verified, the most
1810  * likely explanation for error here is a corrupt heap page (could be logical
1811  * or physical corruption). Index corruption may still be detected here,
1812  * though. Only readonly callers will have verified that left links and right
1813  * links are in agreement, and so it's possible that a leaf page transposition
1814  * within index is actually the source of corruption detected here (for
1815  * !readonly callers). The checks performed only for readonly callers might
1816  * more accurately frame the problem as a cross-page invariant issue (this
1817  * could even be due to recovery not replaying all WAL records). The !readonly
1818  * ERROR message raised here includes a HINT about retrying with readonly
1819  * verification, just in case it's a cross-page invariant issue, though that
1820  * isn't particularly likely.
1821  *
1822  * table_index_build_scan() expects to be able to find the root tuple when a
1823  * heap-only tuple (the live tuple at the end of some HOT chain) needs to be
1824  * indexed, in order to replace the actual tuple's TID with the root tuple's
1825  * TID (which is what we're actually passed back here). The index build heap
1826  * scan code will raise an error when a tuple that claims to be the root of the
1827  * heap-only tuple's HOT chain cannot be located. This catches cases where the
1828  * original root item offset/root tuple for a HOT chain indicates (for whatever
1829  * reason) that the entire HOT chain is dead, despite the fact that the latest
1830  * heap-only tuple should be indexed. When this happens, sequential scans may
1831  * always give correct answers, and all indexes may be considered structurally
1832  * consistent (i.e. the nbtree structural checks would not detect corruption).
1833  * It may be the case that only index scans give wrong answers, and yet heap or
1834  * SLRU corruption is the real culprit. (While it's true that LP_DEAD bit
1835  * setting will probably also leave the index in a corrupt state before too
1836  * long, the problem is nonetheless that there is heap corruption.)
1837  *
1838  * Heap-only tuple handling within table_index_build_scan() works in a way that
1839  * helps us to detect index tuples that contain the wrong values (values that
1840  * don't match the latest tuple in the HOT chain). This can happen when there
1841  * is no superseding index tuple due to a faulty assessment of HOT safety,
1842  * perhaps during the original CREATE INDEX. Because the latest tuple's
1843  * contents are used with the root TID, an error will be raised when a tuple
1844  * with the same TID but non-matching attribute values is passed back to us.
1845  * Faulty assessment of HOT-safety was behind at least two distinct CREATE
1846  * INDEX CONCURRENTLY bugs that made it into stable releases, one of which was
1847  * undetected for many years. In short, the same principle that allows a
1848  * REINDEX to repair corruption when there was an (undetected) broken HOT chain
1849  * also allows us to detect the corruption in many cases.
1850  */
1851 static void
1852 bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
1853  bool *isnull, bool tupleIsAlive, void *checkstate)
1854 {
1855  BtreeCheckState *state = (BtreeCheckState *) checkstate;
1856  IndexTuple itup,
1857  norm;
1858 
1859  Assert(state->heapallindexed);
1860 
1861  /* Generate a normalized index tuple for fingerprinting */
1862  itup = index_form_tuple(RelationGetDescr(index), values, isnull);
1863  itup->t_tid = htup->t_self;
1864  norm = bt_normalize_tuple(state, itup);
1865 
1866  /* Probe Bloom filter -- tuple should be present */
1867  if (bloom_lacks_element(state->filter, (unsigned char *) norm,
1868  IndexTupleSize(norm)))
1869  ereport(ERROR,
1870  (errcode(ERRCODE_DATA_CORRUPTED),
1871  errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
1872  ItemPointerGetBlockNumber(&(itup->t_tid)),
1873  ItemPointerGetOffsetNumber(&(itup->t_tid)),
1874  RelationGetRelationName(state->heaprel),
1875  RelationGetRelationName(state->rel)),
1876  !state->readonly
1877  ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")
1878  : 0));
1879 
1880  state->heaptuplespresent++;
1881  pfree(itup);
1882  /* Cannot leak memory here */
1883  if (norm != itup)
1884  pfree(norm);
1885 }
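/*
 * Illustrative sketch of the Bloom filter property that the callback above
 * relies on; a hypothetical standalone toy, not the lib/bloomfilter.c
 * implementation.  A Bloom filter may answer "probably present" for an
 * element that was never added (a false positive, which here only means
 * corruption goes undetected), but it never answers "absent" for an element
 * that was actually added, so raising an error on an "absent" answer is
 * always sound.  All toy_* names are hypothetical.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t toy_bits[1024];	/* 8192-bit toy filter */

static uint32_t
toy_hash(const unsigned char *elem, size_t len, uint32_t seed)
{
	uint32_t	h = 2166136261u ^ seed;

	for (size_t i = 0; i < len; i++)
		h = (h ^ elem[i]) * 16777619u;	/* FNV-1a style mixing */
	return h % (sizeof(toy_bits) * 8);
}

static void
toy_add(const unsigned char *elem, size_t len)
{
	for (uint32_t k = 0; k < 3; k++)
	{
		uint32_t	bit = toy_hash(elem, len, k);

		toy_bits[bit / 8] |= 1 << (bit % 8);
	}
}

static bool
toy_lacks(const unsigned char *elem, size_t len)
{
	for (uint32_t k = 0; k < 3; k++)
	{
		uint32_t	bit = toy_hash(elem, len, k);

		if (!(toy_bits[bit / 8] & (1 << (bit % 8))))
			return true;		/* definitely never added */
	}
	return false;				/* probably added; false positives possible */
}

int
main(void)
{
	toy_add((const unsigned char *) "tuple-a", 7);
	/* prints 0 (present) and, almost certainly, 1 (absent) */
	printf("%d %d\n",
		   (int) toy_lacks((const unsigned char *) "tuple-a", 7),
		   (int) toy_lacks((const unsigned char *) "tuple-b", 7));
	return 0;
}
#endif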
1886 
1887 /*
1888  * Normalize an index tuple for fingerprinting.
1889  *
1890  * In general, index tuple formation is assumed to be deterministic by
1891  * heapallindexed verification, and IndexTuples are assumed immutable. While
1892  * the LP_DEAD bit is mutable in leaf pages, that's ItemId metadata, which is
1893  * not fingerprinted. Normalization is required to compensate for corner
1894  * cases where the determinism assumption doesn't quite work.
1895  *
1896  * There is currently one such case: index_form_tuple() does not try to hide
1897  * the source TOAST state of input datums. The executor applies TOAST
1898  * compression for heap tuples based on different criteria to the compression
1899  * applied within btinsert()'s call to index_form_tuple(): it sometimes
1900  * compresses more aggressively, resulting in compressed heap tuple datums but
1901  * uncompressed corresponding index tuple datums. A subsequent heapallindexed
1902  * verification will get a logically equivalent though bitwise unequal tuple
1903  * from index_form_tuple(). False positive heapallindexed corruption reports
1904  * could occur without normalizing away the inconsistency.
1905  *
1906  * Returned tuple is often caller's own original tuple. Otherwise, it is a
1907  * new representation of caller's original index tuple, palloc()'d in caller's
1908  * memory context.
1909  *
1910  * Note: This routine is not concerned with distinctions about the
1911  * representation of tuples beyond those that might break heapallindexed
1912  * verification. In particular, it won't try to normalize opclass-equal
1913  * datums with potentially distinct representations (e.g., btree/numeric_ops
1914  * index datums will not get their display scale normalized-away here).
1915  * Normalization may need to be expanded to handle more cases in the future,
1916  * though. For example, it's possible that non-pivot tuples could in the
1917  * future have alternative logically equivalent representations due to using
1918  * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
1919  */
1920 static IndexTuple
1921 bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
1922 {
1923  TupleDesc tupleDescriptor = RelationGetDescr(state->rel);
1924  Datum normalized[INDEX_MAX_KEYS];
1925  bool isnull[INDEX_MAX_KEYS];
1926  bool toast_free[INDEX_MAX_KEYS];
1927  bool formnewtup = false;
1928  IndexTuple reformed;
1929  int i;
1930 
1931  /* Easy case: It's immediately clear that tuple has no varlena datums */
1932  if (!IndexTupleHasVarwidths(itup))
1933  return itup;
1934 
1935  for (i = 0; i < tupleDescriptor->natts; i++)
1936  {
1937  Form_pg_attribute att;
1938 
1939  att = TupleDescAttr(tupleDescriptor, i);
1940 
1941  /* Assume untoasted/already normalized datum initially */
1942  toast_free[i] = false;
1943  normalized[i] = index_getattr(itup, att->attnum,
1944  tupleDescriptor,
1945  &isnull[i]);
1946  if (att->attbyval || att->attlen != -1 || isnull[i])
1947  continue;
1948 
1949  /*
1950  * Callers always pass a tuple that could safely be inserted into the
1951  * index without further processing, so an external varlena header
1952  * should never be encountered here
1953  */
1954  if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))
1955  ereport(ERROR,
1956  (errcode(ERRCODE_INDEX_CORRUPTED),
1957  errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
1958  ItemPointerGetBlockNumber(&(itup->t_tid)),
1959  ItemPointerGetOffsetNumber(&(itup->t_tid)),
1960  RelationGetRelationName(state->rel))));
1961  else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))
1962  {
1963  formnewtup = true;
1964  normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));
1965  toast_free[i] = true;
1966  }
1967  }
1968 
1969  /* Easier case: Tuple has varlena datums, none of which are compressed */
1970  if (!formnewtup)
1971  return itup;
1972 
1973  /*
1974  * Hard case: Tuple had compressed varlena datums that necessitate
1975  * creating normalized version of the tuple from uncompressed input datums
1976  * (normalized input datums). This is rather naive, but shouldn't be
1977  * necessary too often.
1978  *
1979  * Note that we rely on deterministic index_form_tuple() TOAST compression
1980  * of normalized input.
1981  */
1982  reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
1983  reformed->t_tid = itup->t_tid;
1984 
1985  /* Cannot leak memory here */
1986  for (i = 0; i < tupleDescriptor->natts; i++)
1987  if (toast_free[i])
1988  pfree(DatumGetPointer(normalized[i]));
1989 
1990  return reformed;
1991 }
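/*
 * Illustrative sketch of the problem bt_normalize_tuple() solves; a
 * hypothetical standalone toy, not real varlena handling.  Fingerprinting
 * hashes raw tuple bytes, so two logically equal datums with different
 * physical encodings (say, compressed versus uncompressed) look different to
 * the Bloom filter unless one is first normalized to match the other.
 */
#if 0
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* the same logical value "aaaa", in two contrived encodings */
	const char	plain[] = {4, 'a', 'a', 'a', 'a'};	/* length, raw bytes */
	const char	packed[] = {4, 1, 'a', 4};	/* length, RLE run: 'a' x 4 */

	/* byte-wise comparison (all a fingerprint sees) says "different" */
	printf("%d\n", memcmp(plain, packed, sizeof(packed)) != 0);	/* prints 1 */
	return 0;
}
#endif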
1992 
1993 /*
1994  * Search for itup in index, starting from fast root page. itup must be a
1995  * non-pivot tuple. This is only supported with heapkeyspace indexes, since
1996  * we rely on having fully unique keys to find a match with only a single
1997  * visit to a leaf page, barring an interrupted page split, where we may have
1998  * to move right. (A concurrent page split is impossible because the caller
1999  * must be a readonly caller.)
2000  *
2001  * This routine can detect very subtle transitive consistency issues across
2002  * more than one level of the tree. Leaf pages all have a high key (even the
2003  * rightmost page has a conceptual positive infinity high key), but not a low
2004  * key. Their downlink in parent is a lower bound, which along with the high
2005  * key is almost enough to detect every possible inconsistency. A downlink
2006  * separator key value won't always be available from parent, though, because
2007  * the first items of internal pages are negative infinity items, truncated
2008  * down to zero attributes during internal page splits. While it's true that
2009  * bt_downlink_check() and the high key check can detect most imaginable key
2010  * space problems, there are remaining problems it won't detect with non-pivot
2011  * tuples in cousin leaf pages. Starting a search from the root for every
2012  * existing leaf tuple detects small inconsistencies in upper levels of the
2013  * tree that cannot be detected any other way. (Besides all this, this is
2014  * probably also useful as a direct test of the code used by index scans
2015  * themselves.)
2016  */
2017 static bool
2018 bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
2019 {
2020  BTScanInsert key;
2021  BTStack stack;
2022  Buffer lbuf;
2023  bool exists;
2024 
2025  key = _bt_mkscankey(state->rel, itup);
2026  Assert(key->heapkeyspace && key->scantid != NULL);
2027 
2028  /*
2029  * Search from root.
2030  *
2031  * Ideally, we would arrange to only move right within _bt_search() when
2032  * an interrupted page split is detected (i.e. when the incomplete split
2033  * bit is found to be set), but for now we accept the possibility that
2034  * that could conceal an inconsistency.
2035  */
2036  Assert(state->readonly && state->rootdescend);
2037  exists = false;
2038  stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL);
2039 
2040  if (BufferIsValid(lbuf))
2041  {
2042  BTInsertStateData insertstate;
2043  OffsetNumber offnum;
2044  Page page;
2045 
2046  insertstate.itup = itup;
2047  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
2048  insertstate.itup_key = key;
2049  insertstate.bounds_valid = false;
2050  insertstate.buf = lbuf;
2051 
2052  /* Get matching tuple on leaf page */
2053  offnum = _bt_binsrch_insert(state->rel, &insertstate);
2054  /* Compare first >= matching item on leaf page, if any */
2055  page = BufferGetPage(lbuf);
2056  if (offnum <= PageGetMaxOffsetNumber(page) &&
2057  _bt_compare(state->rel, key, page, offnum) == 0)
2058  exists = true;
2059  _bt_relbuf(state->rel, lbuf);
2060  }
2061 
2062  _bt_freestack(stack);
2063  pfree(key);
2064 
2065  return exists;
2066 }
2067 
2068 /*
2069  * Is particular offset within page (whose special state is passed by caller)
2070  * the page negative-infinity item?
2071  *
2072  * As noted in comments above _bt_compare(), there is special handling of the
2073  * first data item as a "negative infinity" item. The hard-coding within
2074  * _bt_compare() makes comparing this item for the purposes of verification
2075  * pointless at best, since the IndexTuple only contains a valid TID (a
2076  * reference TID to child page).
2077  */
2078 static inline bool
2079 offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
2080 {
2081  /*
2082  * For internal pages only, the first item after high key, if any, is
2083  * negative infinity item. Internal pages always have a negative infinity
2084  * item, whereas leaf pages never have one. This implies that negative
2085  * infinity item is either first or second line item, or there is none
2086  * within page.
2087  *
2088  * Negative infinity items are a special case among pivot tuples. They
2089  * always have zero attributes, while all other pivot tuples always have
2090  * nkeyatts attributes.
2091  *
2092  * Right-most pages don't have a high key, but could be said to
2093  * conceptually have a "positive infinity" high key. Thus, there is a
2094  * symmetry between down link items in parent pages, and high keys in
2095  * children. Together, they represent the part of the key space that
2096  * belongs to each page in the index. For example, all children of the
2097  * root page will have negative infinity as a lower bound from root
2098  * negative infinity downlink, and positive infinity as an upper bound
2099  * (implicitly, from "imaginary" positive infinity high key in root).
2100  */
2101  return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
2102 }
2103 
2104 /*
2105  * Does the invariant hold that the key is strictly less than a given upper
2106  * bound offset item?
2107  *
2108  * Verifies line pointer on behalf of caller.
2109  *
2110  * If this function returns false, convention is that caller throws error due
2111  * to corruption.
2112  */
2113 static inline bool
2114 invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
2115  OffsetNumber upperbound)
2116 {
2117  ItemId itemid;
2118  int32 cmp;
2119 
2120  Assert(key->pivotsearch);
2121 
2122  /* Verify line pointer before checking tuple */
2123  itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
2124  upperbound);
2125  /* pg_upgrade'd indexes may legally have equal sibling tuples */
2126  if (!key->heapkeyspace)
2127  return invariant_leq_offset(state, key, upperbound);
2128 
2129  cmp = _bt_compare(state->rel, key, state->target, upperbound);
2130 
2131  /*
2132  * _bt_compare() is capable of determining that a scankey with a
2133  * filled-out attribute is greater than pivot tuples where the comparison
2134  * is resolved at a truncated attribute (value of attribute in pivot is
2135  * minus infinity). However, it is not capable of determining that a
2136  * scankey is _less than_ a tuple on the basis of a comparison resolved at
2137  * _scankey_ minus infinity attribute. Complete an extra step to simulate
2138  * having minus infinity values for omitted scankey attribute(s).
2139  */
2140  if (cmp == 0)
2141  {
2142  BTPageOpaque topaque;
2143  IndexTuple ritup;
2144  int uppnkeyatts;
2145  ItemPointer rheaptid;
2146  bool nonpivot;
2147 
2148  ritup = (IndexTuple) PageGetItem(state->target, itemid);
2149  topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
2150  nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque);
2151 
2152  /* Get number of keys + heap TID for item to the right */
2153  uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel);
2154  rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot);
2155 
2156  /* Heap TID is tiebreaker key attribute */
2157  if (key->keysz == uppnkeyatts)
2158  return key->scantid == NULL && rheaptid != NULL;
2159 
2160  return key->keysz < uppnkeyatts;
2161  }
2162 
2163  return cmp < 0;
2164 }
2165 
2166 /*
2167  * Does the invariant hold that the key is less than or equal to a given upper
2168  * bound offset item?
2169  *
2170  * Caller should have verified that upperbound's line pointer is consistent
2171  * using PageGetItemIdCareful() call.
2172  *
2173  * If this function returns false, convention is that caller throws error due
2174  * to corruption.
2175  */
2176 static inline bool
2177 invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
2178  OffsetNumber upperbound)
2179 {
2180  int32 cmp;
2181 
2182  Assert(key->pivotsearch);
2183 
2184  cmp = _bt_compare(state->rel, key, state->target, upperbound);
2185 
2186  return cmp <= 0;
2187 }
2188 
2189 /*
2190  * Does the invariant hold that the key is strictly greater than a given lower
2191  * bound offset item?
2192  *
2193  * Caller should have verified that lowerbound's line pointer is consistent
2194  * using PageGetItemIdCareful() call.
2195  *
2196  * If this function returns false, convention is that caller throws error due
2197  * to corruption.
2198  */
2199 static inline bool
2200 invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
2201  OffsetNumber lowerbound)
2202 {
2203  int32 cmp;
2204 
2205  Assert(key->pivotsearch);
2206 
2207  cmp = _bt_compare(state->rel, key, state->target, lowerbound);
2208 
2209  /* pg_upgrade'd indexes may legally have equal sibling tuples */
2210  if (!key->heapkeyspace)
2211  return cmp >= 0;
2212 
2213  /*
2214  * No need to consider the possibility that scankey has attributes that we
2215  * need to force to be interpreted as negative infinity. _bt_compare() is
2216  * able to determine that scankey is greater than negative infinity. The
2217  * distinction between "==" and "<" isn't interesting here, since
2218  * corruption is indicated either way.
2219  */
2220  return cmp > 0;
2221 }
2222 
2223 /*
2224  * Does the invariant hold that the key is strictly less than a given upper
2225  * bound offset item, with the offset relating to a caller-supplied page that
2226  * is not the current target page?
2227  *
2228  * Caller's non-target page is a child page of the target, checked as part of
2229  * checking a property of the target page (i.e. the key comes from the
2230  * target). Verifies line pointer on behalf of caller.
2231  *
2232  * If this function returns false, convention is that caller throws error due
2233  * to corruption.
2234  */
2235 static inline bool
2236 invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
2237  BlockNumber nontargetblock, Page nontarget,
2238  OffsetNumber upperbound)
2239 {
2240  ItemId itemid;
2241  int32 cmp;
2242 
2243  Assert(key->pivotsearch);
2244 
2245  /* Verify line pointer before checking tuple */
2246  itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
2247  upperbound);
2248  cmp = _bt_compare(state->rel, key, nontarget, upperbound);
2249 
2250  /* pg_upgrade'd indexes may legally have equal sibling tuples */
2251  if (!key->heapkeyspace)
2252  return cmp <= 0;
2253 
2254  /* See invariant_l_offset() for an explanation of this extra step */
2255  if (cmp == 0)
2256  {
2257  IndexTuple child;
2258  int uppnkeyatts;
2259  ItemPointer childheaptid;
2260  BTPageOpaque copaque;
2261  bool nonpivot;
2262 
2263  child = (IndexTuple) PageGetItem(nontarget, itemid);
2264  copaque = (BTPageOpaque) PageGetSpecialPointer(nontarget);
2265  nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque);
2266 
2267  /* Get number of keys + heap TID for child/non-target item */
2268  uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel);
2269  childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot);
2270 
2271  /* Heap TID is tiebreaker key attribute */
2272  if (key->keysz == uppnkeyatts)
2273  return key->scantid == NULL && childheaptid != NULL;
2274 
2275  return key->keysz < uppnkeyatts;
2276  }
2277 
2278  return cmp < 0;
2279 }
2280 
2281 /*
2282  * Given a block number of a B-Tree page, return page in palloc()'d memory.
2283  * While at it, perform some basic checks of the page.
2284  *
2285  * There is never an attempt to get a consistent view of multiple pages using
2286  * multiple concurrent buffer locks; in general, we only acquire a single pin
2287  * and buffer lock at a time, which is often all that the nbtree code requires.
2288  *
2289  * Operating on a copy of the page is useful because it prevents control
2290  * getting stuck in an uninterruptible state when an underlying operator class
2291  * misbehaves.
2292  */
2293 static Page
2294 palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
2295 {
2296  Buffer buffer;
2297  Page page;
2298  BTPageOpaque opaque;
2299  OffsetNumber maxoffset;
2300 
2301  page = palloc(BLCKSZ);
2302 
2303  /*
2304  * We copy the page into local storage to avoid holding pin on the buffer
2305  * longer than we must.
2306  */
2307  buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL,
2308  state->checkstrategy);
2309  LockBuffer(buffer, BT_READ);
2310 
2311  /*
2312  * Perform the same basic sanity checking that nbtree itself performs for
2313  * every page:
2314  */
2315  _bt_checkpage(state->rel, buffer);
2316 
2317  /* Only use copy of page in palloc()'d memory */
2318  memcpy(page, BufferGetPage(buffer), BLCKSZ);
2319  UnlockReleaseBuffer(buffer);
2320 
2321  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2322 
2323  if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE)
2324  ereport(ERROR,
2325  (errcode(ERRCODE_INDEX_CORRUPTED),
2326  errmsg("invalid meta page found at block %u in index \"%s\"",
2327  blocknum, RelationGetRelationName(state->rel))));
2328 
2329  /* Check page from block that ought to be meta page */
2330  if (blocknum == BTREE_METAPAGE)
2331  {
2332  BTMetaPageData *metad = BTPageGetMeta(page);
2333 
2334  if (!P_ISMETA(opaque) ||
2335  metad->btm_magic != BTREE_MAGIC)
2336  ereport(ERROR,
2337  (errcode(ERRCODE_INDEX_CORRUPTED),
2338  errmsg("index \"%s\" meta page is corrupt",
2339  RelationGetRelationName(state->rel))));
2340 
2341  if (metad->btm_version < BTREE_MIN_VERSION ||
2342  metad->btm_version > BTREE_VERSION)
2343  ereport(ERROR,
2344  (errcode(ERRCODE_INDEX_CORRUPTED),
2345  errmsg("version mismatch in index \"%s\": file version %d, "
2346  "current version %d, minimum supported version %d",
2347  RelationGetRelationName(state->rel),
2348  metad->btm_version, BTREE_VERSION,
2349  BTREE_MIN_VERSION)));
2350 
2351  /* Finished with metapage checks */
2352  return page;
2353  }
2354 
2355  /*
2356  * Deleted pages have no sane "level" field, so can only check non-deleted
2357  * page level
2358  */
2359  if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
2360  ereport(ERROR,
2361  (errcode(ERRCODE_INDEX_CORRUPTED),
2362  errmsg("invalid leaf page level %u for block %u in index \"%s\"",
2363  opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
2364 
2365  if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
2366  opaque->btpo.level == 0)
2367  ereport(ERROR,
2368  (errcode(ERRCODE_INDEX_CORRUPTED),
2369  errmsg("invalid internal page level 0 for block %u in index \"%s\"",
2370  blocknum, RelationGetRelationName(state->rel))));
2371 
2372  /*
2373  * Sanity checks for number of items on page.
2374  *
2375  * As noted at the beginning of _bt_binsrch(), an internal page must have
2376  * children, since there must always be a negative infinity downlink
2377  * (there may also be a highkey). In the case of non-rightmost leaf
2378  * pages, there must be at least a highkey.
2379  *
2380  * This is correct when pages are half-dead, since internal pages are
2381  * never half-dead, and leaf pages must have a high key when half-dead
2382  * (the rightmost page can never be deleted). It's also correct with
2383  * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything
2384  * about the target page other than setting the page as fully dead, and
2385  * setting its xact field. In particular, it doesn't change the sibling
2386  * links in the deletion target itself, since they're required when index
2387  * scans land on the deletion target, and then need to move right (or need
2388  * to move left, in the case of backward index scans).
2389  */
2390  maxoffset = PageGetMaxOffsetNumber(page);
2391  if (maxoffset > MaxIndexTuplesPerPage)
2392  ereport(ERROR,
2393  (errcode(ERRCODE_INDEX_CORRUPTED),
2394  errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
2395  blocknum, RelationGetRelationName(state->rel),
2396  MaxIndexTuplesPerPage)));
2397 
2398  if (!P_ISLEAF(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
2399  ereport(ERROR,
2400  (errcode(ERRCODE_INDEX_CORRUPTED),
2401  errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
2402  blocknum, RelationGetRelationName(state->rel))));
2403 
2404  if (P_ISLEAF(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
2405  ereport(ERROR,
2406  (errcode(ERRCODE_INDEX_CORRUPTED),
2407  errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
2408  blocknum, RelationGetRelationName(state->rel))));
2409 
2410  /*
2411  * In general, internal pages are never marked half-dead, except on
2412  * versions of Postgres prior to 9.4, where it can be valid transient
2413  * state. This state is nonetheless treated as corruption by VACUUM
2414  * from version 9.4 on, so do the same here. See _bt_pagedel() for full
2415  * details.
2416  *
2417  * Internal pages should never have garbage items, either.
2418  */
2419  if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
2420  ereport(ERROR,
2421  (errcode(ERRCODE_INDEX_CORRUPTED),
2422  errmsg("internal page block %u in index \"%s\" is half-dead",
2423  blocknum, RelationGetRelationName(state->rel)),
2424  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
2425 
2426  if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
2427  ereport(ERROR,
2428  (errcode(ERRCODE_INDEX_CORRUPTED),
2429  errmsg("internal page block %u in index \"%s\" has garbage items",
2430  blocknum, RelationGetRelationName(state->rel))));
2431 
2432  return page;
2433 }
2434 
2435 /*
2436  * _bt_mkscankey() wrapper that automatically prevents insertion scankey from
2437  * being considered greater than the pivot tuple that its values originated
2438  * from (or some other identical pivot tuple) in the common case where there
2439  * are truncated/minus infinity attributes. Without this extra step, there
2440  * are forms of corruption that amcheck could theoretically fail to report.
2441  *
2442  * For example, invariant_g_offset() might miss a cross-page invariant failure
2443  * on an internal level if the scankey built from the first item on the
2444  * target's right sibling page happened to be equal to (not greater than) the
2445  * last item on target page. The !pivotsearch tiebreaker in _bt_compare()
2446  * might otherwise cause amcheck to assume (rather than actually verify) that
2447  * the scankey is greater.
2448  */
2449 static inline BTScanInsert
2450 bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)
2451 {
2452  BTScanInsert skey;
2453 
2454  skey = _bt_mkscankey(rel, itup);
2455  skey->pivotsearch = true;
2456 
2457  return skey;
2458 }
2459 
2460 /*
2461  * PageGetItemId() wrapper that validates returned line pointer.
2462  *
2463  * Buffer page/page item access macros generally trust that line pointers are
2464  * not corrupt, which might cause problems for verification itself. For
2465  * example, there is no bounds checking in PageGetItem(). Passing it a
2466  * corrupt line pointer can cause it to return a tuple/pointer that is unsafe
2467  * to dereference.
2468  *
2469  * Validating line pointers before tuples avoids undefined behavior and
2470  * assertion failures with corrupt indexes, making the verification process
2471  * more robust and predictable.
2472  */
2473 static ItemId
2474 PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page,
2475  OffsetNumber offset)
2476 {
2477  ItemId itemid = PageGetItemId(page, offset);
2478 
2479  if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) >
2480  BLCKSZ - sizeof(BTPageOpaqueData))
2481  ereport(ERROR,
2482  (errcode(ERRCODE_INDEX_CORRUPTED),
2483  errmsg("line pointer points past end of tuple space in index \"%s\"",
2484  RelationGetRelationName(state->rel)),
2485  errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
2486  block, offset, ItemIdGetOffset(itemid),
2487  ItemIdGetLength(itemid),
2488  ItemIdGetFlags(itemid))));
2489 
2490  /*
2491  * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree
2492  * never uses either. Verify that line pointer has storage, too, since
2493  * even LP_DEAD items should have storage within nbtree.
2494  */
2495  if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) ||
2496  ItemIdGetLength(itemid) == 0)
2497  ereport(ERROR,
2498  (errcode(ERRCODE_INDEX_CORRUPTED),
2499  errmsg("invalid line pointer storage in index \"%s\"",
2500  RelationGetRelationName(state->rel)),
2501  errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
2502  block, offset, ItemIdGetOffset(itemid),
2503  ItemIdGetLength(itemid),
2504  ItemIdGetFlags(itemid))));
2505 
2506  return itemid;
2507 }
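/*
 * Illustrative sketch of the bounds check performed above; a hypothetical
 * standalone toy, not bufpage code.  TOY_BLCKSZ and TOY_SPECIAL stand in for
 * BLCKSZ and sizeof(BTPageOpaqueData): a record pointer is rejected before
 * any dereference if its (offset, length) pair runs past the usable area of
 * the page.
 */
#if 0
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_BLCKSZ	8192
#define TOY_SPECIAL 16

static bool
toy_pointer_is_sane(size_t off, size_t len)
{
	/* individual bounds first, so the sum below cannot wrap */
	return off <= TOY_BLCKSZ && len <= TOY_BLCKSZ &&
		off + len <= TOY_BLCKSZ - TOY_SPECIAL;
}

int
main(void)
{
	printf("%d %d\n",
		   (int) toy_pointer_is_sane(8000, 100),	/* 1: fits */
		   (int) toy_pointer_is_sane(8100, 100));	/* 0: past usable area */
	return 0;
}
#endif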
2508 
2509 /*
2510  * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must
2511  * be present in cases where that is mandatory.
2512  *
2513  * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK
2514  * bit is effectively a proxy for whether or not the tuple is a pivot tuple.
2515  * It may become more useful in the future, when non-pivot tuples support their
2516  * own alternative INDEX_ALT_TID_MASK representation.
2517  */
2518 static inline ItemPointer
2519 BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
2520  bool nonpivot)
2521 {
2522  ItemPointer result = BTreeTupleGetHeapTID(itup);
2523  BlockNumber targetblock = state->targetblock;
2524 
2525  if (result == NULL && nonpivot)
2526  ereport(ERROR,
2527  (errcode(ERRCODE_INDEX_CORRUPTED),
2528  errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
2529  targetblock, RelationGetRelationName(state->rel))));
2530 
2531  return result;
2532 }