PostgreSQL Source Code  git master
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf, Snapshot snapshot)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 343 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, BTScanInsertData::nextkey, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTScanInsertData::scantid, and unlikely.

Referenced by _bt_first(), and _bt_search().

346 {
347  Page page;
348  BTPageOpaque opaque;
349  OffsetNumber low,
350  high;
351  int32 result,
352  cmpval;
353 
354  page = BufferGetPage(buf);
355  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
356 
357  /* Requesting nextkey semantics while using scantid seems nonsensical */
358  Assert(!key->nextkey || key->scantid == NULL);
359  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
360  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
361 
362  low = P_FIRSTDATAKEY(opaque);
363  high = PageGetMaxOffsetNumber(page);
364 
365  /*
366  * If there are no keys on the page, return the first available slot. Note
367  * this covers two cases: the page is really empty (no keys), or it
368  * contains only a high key. The latter case is possible after vacuuming.
369  * This can never happen on an internal page, however, since they are
370  * never empty (an internal page must have children).
371  */
372  if (unlikely(high < low))
373  return low;
374 
375  /*
376  * Binary search to find the first key on the page >= scan key, or first
377  * key > scankey when nextkey is true.
378  *
379  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
380  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
381  *
382  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
383  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
384  *
385  * We can fall out when high == low.
386  */
387  high++; /* establish the loop invariant for high */
388 
389  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
390 
391  while (high > low)
392  {
393  OffsetNumber mid = low + ((high - low) / 2);
394 
395  /* We have low <= mid < high, so mid points at a real slot */
396 
397  result = _bt_compare(rel, key, page, mid);
398 
399  if (result >= cmpval)
400  low = mid + 1;
401  else
402  high = mid;
403  }
404 
405  /*
406  * At this point we have high == low, but be careful: they could point
407  * past the last slot on the page.
408  *
409  * On a leaf page, we always return the first key >= scan key (resp. >
410  * scan key), which could be the last slot + 1.
411  */
412  if (P_ISLEAF(opaque))
413  return low;
414 
415  /*
416  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
417  * There must be one if _bt_compare() is playing by the rules.
418  */
419  Assert(low > P_FIRSTDATAKEY(opaque));
420 
421  return OffsetNumberPrev(low);
422 }
ItemPointer scantid
Definition: nbtree.h:664
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:417
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:68
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:644
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define Assert(condition)
Definition: c.h:800
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define unlikely(x)
Definition: c.h:261
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 447 of file nbtsearch.c.

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::postingoff, BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

448 {
449  BTScanInsert key = insertstate->itup_key;
450  Page page;
451  BTPageOpaque opaque;
452  OffsetNumber low,
453  high,
454  stricthigh;
455  int32 result,
456  cmpval;
457 
458  page = BufferGetPage(insertstate->buf);
459  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
460 
461  Assert(P_ISLEAF(opaque));
462  Assert(!key->nextkey);
463  Assert(insertstate->postingoff == 0);
464 
465  if (!insertstate->bounds_valid)
466  {
467  /* Start new binary search */
468  low = P_FIRSTDATAKEY(opaque);
469  high = PageGetMaxOffsetNumber(page);
470  }
471  else
472  {
473  /* Restore result of previous binary search against same page */
474  low = insertstate->low;
475  high = insertstate->stricthigh;
476  }
477 
478  /* If there are no keys on the page, return the first available slot */
479  if (unlikely(high < low))
480  {
481  /* Caller can't reuse bounds */
482  insertstate->low = InvalidOffsetNumber;
483  insertstate->stricthigh = InvalidOffsetNumber;
484  insertstate->bounds_valid = false;
485  return low;
486  }
487 
488  /*
489  * Binary search to find the first key on the page >= scan key. (nextkey
490  * is always false when inserting).
491  *
492  * The loop invariant is: all slots before 'low' are < scan key, all slots
493  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
494  * maintained to save additional search effort for caller.
495  *
496  * We can fall out when high == low.
497  */
498  if (!insertstate->bounds_valid)
499  high++; /* establish the loop invariant for high */
500  stricthigh = high; /* high initially strictly higher */
501 
502  cmpval = 1; /* !nextkey comparison value */
503 
504  while (high > low)
505  {
506  OffsetNumber mid = low + ((high - low) / 2);
507 
508  /* We have low <= mid < high, so mid points at a real slot */
509 
510  result = _bt_compare(rel, key, page, mid);
511 
512  if (result >= cmpval)
513  low = mid + 1;
514  else
515  {
516  high = mid;
517  if (result != 0)
518  stricthigh = high;
519  }
520 
521  /*
522  * If tuple at offset located by binary search is a posting list whose
523  * TID range overlaps with caller's scantid, perform posting list
524  * binary search to set postingoff for caller. Caller must split the
525  * posting list when postingoff is set. This should happen
526  * infrequently.
527  */
528  if (unlikely(result == 0 && key->scantid != NULL))
529  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
530  }
531 
532  /*
533  * On a leaf page, a binary search always returns the first key >= scan
534  * key (at least in !nextkey case), which could be the last slot + 1. This
535  * is also the lower bound of cached search.
536  *
537  * stricthigh may also be the last slot + 1, which prevents caller from
538  * using bounds directly, but is still useful to us if we're called a
539  * second time with cached bounds (cached low will be < stricthigh when
540  * that happens).
541  */
542  insertstate->low = low;
543  insertstate->stricthigh = stricthigh;
544  insertstate->bounds_valid = true;
545 
546  return low;
547 }
bool bounds_valid
Definition: nbtree.h:696
ItemPointer scantid
Definition: nbtree.h:664
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
OffsetNumber stricthigh
Definition: nbtree.h:698
OffsetNumber low
Definition: nbtree.h:697
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:417
uint16 OffsetNumber
Definition: off.h:24
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:558
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:644
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:800
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define unlikely(x)
Definition: c.h:261
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 558 of file nbtsearch.c.

References BTScanInsertData::allequalimage, Assert, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanInsertData::heapkeyspace, ItemIdIsDead, ItemPointerCompare(), PageGetItem, PageGetItemId, and BTScanInsertData::scantid.

Referenced by _bt_binsrch_insert().

559 {
560  IndexTuple itup;
561  ItemId itemid;
562  int low,
563  high,
564  mid,
565  res;
566 
567  /*
568  * If this isn't a posting tuple, then the index must be corrupt (if it is
569  * an ordinary non-pivot tuple then there must be an existing tuple with a
570  * heap TID that equals inserter's new heap TID/scantid). Defensively
571  * check that tuple is a posting list tuple whose posting list range
572  * includes caller's scantid.
573  *
574  * (This is also needed because contrib/amcheck's rootdescend option needs
575  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
576  */
577  itemid = PageGetItemId(page, offnum);
578  itup = (IndexTuple) PageGetItem(page, itemid);
579  if (!BTreeTupleIsPosting(itup))
580  return 0;
581 
582  Assert(key->heapkeyspace && key->allequalimage);
583 
584  /*
585  * In the event that posting list tuple has LP_DEAD bit set, indicate this
586  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
587  * second call to _bt_binsrch_insert() can take place when its caller has
588  * removed the dead item.
589  */
590  if (ItemIdIsDead(itemid))
591  return -1;
592 
593  /* "high" is past end of posting list for loop invariant */
594  low = 0;
595  high = BTreeTupleGetNPosting(itup);
596  Assert(high >= 2);
597 
598  while (high > low)
599  {
600  mid = low + ((high - low) / 2);
601  res = ItemPointerCompare(key->scantid,
602  BTreeTupleGetPostingN(itup, mid));
603 
604  if (res > 0)
605  low = mid + 1;
606  else if (res < 0)
607  high = mid;
608  else
609  return mid;
610  }
611 
612  /* Exact match not found */
613  return low;
614 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
ItemPointer scantid
Definition: nbtree.h:664
bool allequalimage
Definition: nbtree.h:660
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:412
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
#define Assert(condition)
Definition: c.h:800
bool heapkeyspace
Definition: nbtree.h:659
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:386
#define PageGetItem(page, itemId)
Definition: bufpage.h:340

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 644 of file nbtsearch.c.

References _bt_check_natts(), BTScanInsertData::allequalimage, Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

648 {
649  TupleDesc itupdesc = RelationGetDescr(rel);
651  IndexTuple itup;
652  ItemPointer heapTid;
653  ScanKey scankey;
654  int ncmpkey;
655  int ntupatts;
656  int32 result;
657 
658  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
660  Assert(key->heapkeyspace || key->scantid == NULL);
661 
662  /*
663  * Force result ">" if target item is first data item on an internal page
664  * --- see NOTE above.
665  */
666  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
667  return 1;
668 
669  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
670  ntupatts = BTreeTupleGetNAtts(itup, rel);
671 
672  /*
673  * The scan key is set up with the attribute number associated with each
674  * term in the key. It is important that, if the index is multi-key, the
675  * scan contain the first k key attributes, and that they be in order. If
676  * you think about how multi-key ordering works, you'll understand why
677  * this is.
678  *
679  * We don't test for violation of this condition here, however. The
680  * initial setup for the index scan had better have gotten it right (see
681  * _bt_first).
682  */
683 
684  ncmpkey = Min(ntupatts, key->keysz);
685  Assert(key->heapkeyspace || ncmpkey == key->keysz);
686  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
687  scankey = key->scankeys;
688  for (int i = 1; i <= ncmpkey; i++)
689  {
690  Datum datum;
691  bool isNull;
692 
693  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
694 
695  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
696  {
697  if (isNull)
698  result = 0; /* NULL "=" NULL */
699  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
700  result = -1; /* NULL "<" NOT_NULL */
701  else
702  result = 1; /* NULL ">" NOT_NULL */
703  }
704  else if (isNull) /* key is NOT_NULL and item is NULL */
705  {
706  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
707  result = 1; /* NOT_NULL ">" NULL */
708  else
709  result = -1; /* NOT_NULL "<" NULL */
710  }
711  else
712  {
713  /*
714  * The sk_func needs to be passed the index value as left arg and
715  * the sk_argument as right arg (they might be of different
716  * types). Since it is convenient for callers to think of
717  * _bt_compare as comparing the scankey to the index item, we have
718  * to flip the sign of the comparison result. (Unless it's a DESC
719  * column, in which case we *don't* flip the sign.)
720  */
721  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
722  scankey->sk_collation,
723  datum,
724  scankey->sk_argument));
725 
726  if (!(scankey->sk_flags & SK_BT_DESC))
727  INVERT_COMPARE_RESULT(result);
728  }
729 
730  /* if the keys are unequal, return the difference */
731  if (result != 0)
732  return result;
733 
734  scankey++;
735  }
736 
737  /*
738  * All non-truncated attributes (other than heap TID) were found to be
739  * equal. Treat truncated attributes as minus infinity when scankey has a
740  * key attribute value that would otherwise be compared directly.
741  *
742  * Note: it doesn't matter if ntupatts includes non-key attributes;
743  * scankey won't, so explicitly excluding non-key attributes isn't
744  * necessary.
745  */
746  if (key->keysz > ntupatts)
747  return 1;
748 
749  /*
750  * Use the heap TID attribute and scantid to try to break the tie. The
751  * rules are the same as any other key attribute -- only the
752  * representation differs.
753  */
754  heapTid = BTreeTupleGetHeapTID(itup);
755  if (key->scantid == NULL)
756  {
757  /*
758  * Most searches have a scankey that is considered greater than a
759  * truncated pivot tuple if and when the scankey has equal values for
760  * attributes up to and including the least significant untruncated
761  * attribute in tuple.
762  *
763  * For example, if an index has the minimum two attributes (single
764  * user key attribute, plus heap TID attribute), and a page's high key
765  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
766  * will not descend to the page to the left. The search will descend
767  * right instead. The truncated attribute in pivot tuple means that
768  * all non-pivot tuples on the page to the left are strictly < 'foo',
769  * so it isn't necessary to descend left. In other words, search
770  * doesn't have to descend left because it isn't interested in a match
771  * that has a heap TID value of -inf.
772  *
773  * However, some searches (pivotsearch searches) actually require that
774  * we descend left when this happens. -inf is treated as a possible
775  * match for omitted scankey attribute(s). This is needed by page
776  * deletion, which must re-find leaf pages that are targets for
777  * deletion using their high keys.
778  *
779  * Note: the heap TID part of the test ensures that scankey is being
780  * compared to a pivot tuple with one or more truncated key
781  * attributes.
782  *
783  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
784  * left here, since they have no heap TID attribute (and cannot have
785  * any -inf key values in any case, since truncation can only remove
786  * non-key attributes). !heapkeyspace searches must always be
787  * prepared to deal with matches on both sides of the pivot once the
788  * leaf level is reached.
789  */
790  if (key->heapkeyspace && !key->pivotsearch &&
791  key->keysz == ntupatts && heapTid == NULL)
792  return 1;
793 
794  /* All provided scankey arguments found to be equal */
795  return 0;
796  }
797 
798  /*
799  * Treat truncated heap TID as minus infinity, since scankey has a key
800  * attribute value (scantid) that would otherwise be compared directly
801  */
803  if (heapTid == NULL)
804  return 1;
805 
806  /*
807  * Scankey must be treated as equal to a posting list tuple if its scantid
808  * value falls within the range of the posting list. In all other cases
809  * there can only be a single heap TID value, which is compared directly
810  * with scantid.
811  */
813  result = ItemPointerCompare(key->scantid, heapTid);
814  if (result <= 0 || !BTreeTupleIsPosting(itup))
815  return result;
816  else
817  {
818  result = ItemPointerCompare(key->scantid,
820  if (result > 0)
821  return 1;
822  }
823 
824  return 0;
825 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:506
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:483
ItemPointer scantid
Definition: nbtree.h:664
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define Min(x, y)
Definition: c.h:982
bool allequalimage
Definition: nbtree.h:660
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1152
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:417
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:476
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:663
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:957
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:532
uintptr_t Datum
Definition: postgres.h:367
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2466
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:800
#define SK_BT_DESC
Definition: nbtree.h:956
bool heapkeyspace
Definition: nbtree.h:659
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:666
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1121
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 65 of file nbtsearch.c.

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

66 {
67  _bt_unlockbuf(scan->indexRelation, sp->buf);
68 
69  if (IsMVCCSnapshot(scan->xs_snapshot) &&
71  !scan->xs_want_itup)
72  {
73  ReleaseBuffer(sp->buf);
74  sp->buf = InvalidBuffer;
75  }
76 }
#define InvalidBuffer
Definition: buf.h:25
struct SnapshotData * xs_snapshot
Definition: relscan.h:116
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3513
Relation indexRelation
Definition: relscan.h:115
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:97
#define RelationNeedsWAL(relation)
Definition: rel.h:563
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1006
Buffer buf
Definition: nbtree.h:825

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2379 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert, BTScanPosInvalidate, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

2380 {
2381  Relation rel = scan->indexRelation;
2382  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2383  Buffer buf;
2384  Page page;
2385  BTPageOpaque opaque;
2386  OffsetNumber start;
2387  BTScanPosItem *currItem;
2388 
2389  /*
2390  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2391  * version of _bt_search(). We don't maintain a stack since we know we
2392  * won't need it.
2393  */
2395 
2396  if (!BufferIsValid(buf))
2397  {
2398  /*
2399  * Empty index. Lock the whole relation, as nothing finer to lock
2400  * exists.
2401  */
2402  PredicateLockRelation(rel, scan->xs_snapshot);
2404  return false;
2405  }
2406 
2408  page = BufferGetPage(buf);
2409  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2410  Assert(P_ISLEAF(opaque));
2411 
2412  if (ScanDirectionIsForward(dir))
2413  {
2414  /* There could be dead pages to the left, so not this: */
2415  /* Assert(P_LEFTMOST(opaque)); */
2416 
2417  start = P_FIRSTDATAKEY(opaque);
2418  }
2419  else if (ScanDirectionIsBackward(dir))
2420  {
2421  Assert(P_RIGHTMOST(opaque));
2422 
2423  start = PageGetMaxOffsetNumber(page);
2424  }
2425  else
2426  {
2427  elog(ERROR, "invalid scan direction: %d", (int) dir);
2428  start = 0; /* keep compiler quiet */
2429  }
2430 
2431  /* remember which buffer we have pinned */
2432  so->currPos.buf = buf;
2433 
2434  _bt_initialize_more_data(so, dir);
2435 
2436  /*
2437  * Now load data from the first page of the scan.
2438  */
2439  if (!_bt_readpage(scan, dir, start))
2440  {
2441  /*
2442  * There's no actually-matching data on this page. Try to advance to
2443  * the next page. Return false if there's no matching data at all.
2444  */
2445  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2446  if (!_bt_steppage(scan, dir))
2447  return false;
2448  }
2449  else
2450  {
2451  /* Drop the lock, and maybe the pin, on the current page */
2453  }
2454 
2455  /* OK, itemIndex says what to return */
2456  currItem = &so->currPos.items[so->currPos.itemIndex];
2457  scan->xs_heaptid = currItem->heapTid;
2458  if (scan->xs_want_itup)
2459  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2460 
2461  return true;
2462 }
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2521
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:139
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:65
int itemIndex
Definition: nbtree.h:855
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2498
char * currTuples
Definition: nbtree.h:929
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
struct SnapshotData * xs_snapshot
Definition: relscan.h:116
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:2295
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Relation indexRelation
Definition: relscan.h:115
uint16 OffsetNumber
Definition: off.h:24
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
ItemPointerData xs_heaptid
Definition: relscan.h:144
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2469
#define Assert(condition)
Definition: c.h:800
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:885
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1006
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
BTScanPosData currPos
Definition: nbtree.h:942
#define elog(elevel,...)
Definition: elog.h:228
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1861
Buffer buf
Definition: nbtree.h:825
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1509
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 848 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), _bt_unlockbuf(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

849 {
850  Relation rel = scan->indexRelation;
851  BTScanOpaque so = (BTScanOpaque) scan->opaque;
852  Buffer buf;
853  BTStack stack;
854  OffsetNumber offnum;
855  StrategyNumber strat;
856  bool nextkey;
857  bool goback;
858  BTScanInsertData inskey;
859  ScanKey startKeys[INDEX_MAX_KEYS];
860  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
861  int keysCount = 0;
862  int i;
863  bool status;
864  StrategyNumber strat_total;
865  BTScanPosItem *currItem;
866  BlockNumber blkno;
867 
869 
871 
872  /*
873  * Examine the scan keys and eliminate any redundant keys; also mark the
874  * keys that must be matched to continue the scan.
875  */
876  _bt_preprocess_keys(scan);
877 
878  /*
879  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
880  * never be satisfied (eg, x == 1 AND x > 2).
881  */
882  if (!so->qual_ok)
883  {
884  /* Notify any other workers that we're done with this scan key. */
885  _bt_parallel_done(scan);
886  return false;
887  }
888 
889  /*
890  * For parallel scans, get the starting page from shared state. If the
891  * scan has not started, proceed to find out first leaf page in the usual
892  * way while keeping other participating processes waiting. If the scan
893  * has already begun, use the page number from the shared structure.
894  */
895  if (scan->parallel_scan != NULL)
896  {
897  status = _bt_parallel_seize(scan, &blkno);
898  if (!status)
899  return false;
900  else if (blkno == P_NONE)
901  {
902  _bt_parallel_done(scan);
903  return false;
904  }
905  else if (blkno != InvalidBlockNumber)
906  {
907  if (!_bt_parallel_readpage(scan, blkno, dir))
908  return false;
909  goto readcomplete;
910  }
911  }
912 
913  /*----------
914  * Examine the scan keys to discover where we need to start the scan.
915  *
916  * We want to identify the keys that can be used as starting boundaries;
917  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
918  * a backwards scan. We can use keys for multiple attributes so long as
919  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
920  * a > or < boundary or find an attribute with no boundary (which can be
921  * thought of as the same as "> -infinity"), we can't use keys for any
922  * attributes to its right, because it would break our simplistic notion
923  * of what initial positioning strategy to use.
924  *
925  * When the scan keys include cross-type operators, _bt_preprocess_keys
926  * may not be able to eliminate redundant keys; in such cases we will
927  * arbitrarily pick a usable one for each attribute. This is correct
928  * but possibly not optimal behavior. (For example, with keys like
929  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
930  * x=5 would be more efficient.) Since the situation only arises given
931  * a poorly-worded query plus an incomplete opfamily, live with it.
932  *
933  * When both equality and inequality keys appear for a single attribute
934  * (again, only possible when cross-type operators appear), we *must*
935  * select one of the equality keys for the starting point, because
936  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
937  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
938  * start at x=4, we will fail and stop before reaching x=10. If multiple
939  * equality quals survive preprocessing, however, it doesn't matter which
940  * one we use --- by definition, they are either redundant or
941  * contradictory.
942  *
943  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
944  * If the index stores nulls at the end of the index we'll be starting
945  * from, and we have no boundary key for the column (which means the key
946  * we deduced NOT NULL from is an inequality key that constrains the other
947  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
948  * use as a boundary key. If we didn't do this, we might find ourselves
949  * traversing a lot of null entries at the start of the scan.
950  *
951  * In this loop, row-comparison keys are treated the same as keys on their
952  * first (leftmost) columns. We'll add on lower-order columns of the row
953  * comparison below, if possible.
954  *
955  * The selected scan keys (at most one per index column) are remembered by
956  * storing their addresses into the local startKeys[] array.
957  *----------
958  */
959  strat_total = BTEqualStrategyNumber;
960  if (so->numberOfKeys > 0)
961  {
962  AttrNumber curattr;
963  ScanKey chosen;
964  ScanKey impliesNN;
965  ScanKey cur;
966 
967  /*
968  * chosen is the so-far-chosen key for the current attribute, if any.
969  * We don't cast the decision in stone until we reach keys for the
970  * next attribute.
971  */
972  curattr = 1;
973  chosen = NULL;
974  /* Also remember any scankey that implies a NOT NULL constraint */
975  impliesNN = NULL;
976 
977  /*
978  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
979  * pass to handle after-last-key processing. Actual exit from the
980  * loop is at one of the "break" statements below.
981  */
982  for (cur = so->keyData, i = 0;; cur++, i++)
983  {
984  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
985  {
986  /*
987  * Done looking at keys for curattr. If we didn't find a
988  * usable boundary key, see if we can deduce a NOT NULL key.
989  */
990  if (chosen == NULL && impliesNN != NULL &&
991  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
994  {
995  /* Yes, so build the key in notnullkeys[keysCount] */
996  chosen = &notnullkeys[keysCount];
997  ScanKeyEntryInitialize(chosen,
999  (impliesNN->sk_flags &
1001  curattr,
1002  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1005  InvalidOid,
1006  InvalidOid,
1007  InvalidOid,
1008  (Datum) 0);
1009  }
1010 
1011  /*
1012  * If we still didn't find a usable boundary key, quit; else
1013  * save the boundary key pointer in startKeys.
1014  */
1015  if (chosen == NULL)
1016  break;
1017  startKeys[keysCount++] = chosen;
1018 
1019  /*
1020  * Adjust strat_total, and quit if we have stored a > or <
1021  * key.
1022  */
1023  strat = chosen->sk_strategy;
1024  if (strat != BTEqualStrategyNumber)
1025  {
1026  strat_total = strat;
1027  if (strat == BTGreaterStrategyNumber ||
1028  strat == BTLessStrategyNumber)
1029  break;
1030  }
1031 
1032  /*
1033  * Done if that was the last attribute, or if next key is not
1034  * in sequence (implying no boundary key is available for the
1035  * next attribute).
1036  */
1037  if (i >= so->numberOfKeys ||
1038  cur->sk_attno != curattr + 1)
1039  break;
1040 
1041  /*
1042  * Reset for next attr.
1043  */
1044  curattr = cur->sk_attno;
1045  chosen = NULL;
1046  impliesNN = NULL;
1047  }
1048 
1049  /*
1050  * Can we use this key as a starting boundary for this attr?
1051  *
1052  * If not, does it imply a NOT NULL constraint? (Because
1053  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1054  * *any* inequality key works for that; we need not test.)
1055  */
1056  switch (cur->sk_strategy)
1057  {
1058  case BTLessStrategyNumber:
1060  if (chosen == NULL)
1061  {
1062  if (ScanDirectionIsBackward(dir))
1063  chosen = cur;
1064  else
1065  impliesNN = cur;
1066  }
1067  break;
1068  case BTEqualStrategyNumber:
1069  /* override any non-equality choice */
1070  chosen = cur;
1071  break;
1074  if (chosen == NULL)
1075  {
1076  if (ScanDirectionIsForward(dir))
1077  chosen = cur;
1078  else
1079  impliesNN = cur;
1080  }
1081  break;
1082  }
1083  }
1084  }
1085 
1086  /*
1087  * If we found no usable boundary keys, we have to start from one end of
1088  * the tree. Walk down that edge to the first or last key, and scan from
1089  * there.
1090  */
1091  if (keysCount == 0)
1092  {
1093  bool match;
1094 
1095  match = _bt_endpoint(scan, dir);
1096 
1097  if (!match)
1098  {
1099  /* No match, so mark (parallel) scan finished */
1100  _bt_parallel_done(scan);
1101  }
1102 
1103  return match;
1104  }
1105 
1106  /*
1107  * We want to start the scan somewhere within the index. Set up an
1108  * insertion scankey we can use to search for the boundary point we
1109  * identified above. The insertion scankey is built using the keys
1110  * identified by startKeys[]. (Remaining insertion scankey fields are
1111  * initialized after initial-positioning strategy is finalized.)
1112  */
1113  Assert(keysCount <= INDEX_MAX_KEYS);
1114  for (i = 0; i < keysCount; i++)
1115  {
1116  ScanKey cur = startKeys[i];
1117 
1118  Assert(cur->sk_attno == i + 1);
1119 
1120  if (cur->sk_flags & SK_ROW_HEADER)
1121  {
1122  /*
1123  * Row comparison header: look to the first row member instead.
1124  *
1125  * The member scankeys are already in insertion format (ie, they
1126  * have sk_func = 3-way-comparison function), but we have to watch
1127  * out for nulls, which _bt_preprocess_keys didn't check. A null
1128  * in the first row member makes the condition unmatchable, just
1129  * like qual_ok = false.
1130  */
1131  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1132 
1133  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1134  if (subkey->sk_flags & SK_ISNULL)
1135  {
1136  _bt_parallel_done(scan);
1137  return false;
1138  }
1139  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1140 
1141  /*
1142  * If the row comparison is the last positioning key we accepted,
1143  * try to add additional keys from the lower-order row members.
1144  * (If we accepted independent conditions on additional index
1145  * columns, we use those instead --- doesn't seem worth trying to
1146  * determine which is more restrictive.) Note that this is OK
1147  * even if the row comparison is of ">" or "<" type, because the
1148  * condition applied to all but the last row member is effectively
1149  * ">=" or "<=", and so the extra keys don't break the positioning
1150  * scheme. But, by the same token, if we aren't able to use all
1151  * the row members, then the part of the row comparison that we
1152  * did use has to be treated as just a ">=" or "<=" condition, and
1153  * so we'd better adjust strat_total accordingly.
1154  */
1155  if (i == keysCount - 1)
1156  {
1157  bool used_all_subkeys = false;
1158 
1159  Assert(!(subkey->sk_flags & SK_ROW_END));
1160  for (;;)
1161  {
1162  subkey++;
1163  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1164  if (subkey->sk_attno != keysCount + 1)
1165  break; /* out-of-sequence, can't use it */
1166  if (subkey->sk_strategy != cur->sk_strategy)
1167  break; /* wrong direction, can't use it */
1168  if (subkey->sk_flags & SK_ISNULL)
1169  break; /* can't use null keys */
1170  Assert(keysCount < INDEX_MAX_KEYS);
1171  memcpy(inskey.scankeys + keysCount, subkey,
1172  sizeof(ScanKeyData));
1173  keysCount++;
1174  if (subkey->sk_flags & SK_ROW_END)
1175  {
1176  used_all_subkeys = true;
1177  break;
1178  }
1179  }
1180  if (!used_all_subkeys)
1181  {
1182  switch (strat_total)
1183  {
1184  case BTLessStrategyNumber:
1185  strat_total = BTLessEqualStrategyNumber;
1186  break;
1188  strat_total = BTGreaterEqualStrategyNumber;
1189  break;
1190  }
1191  }
1192  break; /* done with outer loop */
1193  }
1194  }
1195  else
1196  {
1197  /*
1198  * Ordinary comparison key. Transform the search-style scan key
1199  * to an insertion scan key by replacing the sk_func with the
1200  * appropriate btree comparison function.
1201  *
1202  * If scankey operator is not a cross-type comparison, we can use
1203  * the cached comparison function; otherwise gotta look it up in
1204  * the catalogs. (That can't lead to infinite recursion, since no
1205  * indexscan initiated by syscache lookup will use cross-data-type
1206  * operators.)
1207  *
1208  * We support the convention that sk_subtype == InvalidOid means
1209  * the opclass input type; this is a hack to simplify life for
1210  * ScanKeyInit().
1211  */
1212  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1213  cur->sk_subtype == InvalidOid)
1214  {
1215  FmgrInfo *procinfo;
1216 
1217  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1218  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1219  cur->sk_flags,
1220  cur->sk_attno,
1222  cur->sk_subtype,
1223  cur->sk_collation,
1224  procinfo,
1225  cur->sk_argument);
1226  }
1227  else
1228  {
1229  RegProcedure cmp_proc;
1230 
1231  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1232  rel->rd_opcintype[i],
1233  cur->sk_subtype,
1234  BTORDER_PROC);
1235  if (!RegProcedureIsValid(cmp_proc))
1236  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1237  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1238  cur->sk_attno, RelationGetRelationName(rel));
1239  ScanKeyEntryInitialize(inskey.scankeys + i,
1240  cur->sk_flags,
1241  cur->sk_attno,
1243  cur->sk_subtype,
1244  cur->sk_collation,
1245  cmp_proc,
1246  cur->sk_argument);
1247  }
1248  }
1249  }
1250 
1251  /*----------
1252  * Examine the selected initial-positioning strategy to determine exactly
1253  * where we need to start the scan, and set flag variables to control the
1254  * code below.
1255  *
1256  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1257  * item >= scan key. If nextkey = true, they will locate the first
1258  * item > scan key.
1259  *
1260  * If goback = true, we will then step back one item, while if
1261  * goback = false, we will start the scan on the located item.
1262  *----------
1263  */
1264  switch (strat_total)
1265  {
1266  case BTLessStrategyNumber:
1267 
1268  /*
1269  * Find first item >= scankey, then back up one to arrive at last
1270  * item < scankey. (Note: this positioning strategy is only used
1271  * for a backward scan, so that is always the correct starting
1272  * position.)
1273  */
1274  nextkey = false;
1275  goback = true;
1276  break;
1277 
1279 
1280  /*
1281  * Find first item > scankey, then back up one to arrive at last
1282  * item <= scankey. (Note: this positioning strategy is only used
1283  * for a backward scan, so that is always the correct starting
1284  * position.)
1285  */
1286  nextkey = true;
1287  goback = true;
1288  break;
1289 
1290  case BTEqualStrategyNumber:
1291 
1292  /*
1293  * If a backward scan was specified, need to start with last equal
1294  * item not first one.
1295  */
1296  if (ScanDirectionIsBackward(dir))
1297  {
1298  /*
1299  * This is the same as the <= strategy. We will check at the
1300  * end whether the found item is actually =.
1301  */
1302  nextkey = true;
1303  goback = true;
1304  }
1305  else
1306  {
1307  /*
1308  * This is the same as the >= strategy. We will check at the
1309  * end whether the found item is actually =.
1310  */
1311  nextkey = false;
1312  goback = false;
1313  }
1314  break;
1315 
1317 
1318  /*
1319  * Find first item >= scankey. (This is only used for forward
1320  * scans.)
1321  */
1322  nextkey = false;
1323  goback = false;
1324  break;
1325 
1327 
1328  /*
1329  * Find first item > scankey. (This is only used for forward
1330  * scans.)
1331  */
1332  nextkey = true;
1333  goback = false;
1334  break;
1335 
1336  default:
1337  /* can't get here, but keep compiler quiet */
1338  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1339  return false;
1340  }
1341 
1342  /* Initialize remaining insertion scan key fields */
1343  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1344  inskey.anynullkeys = false; /* unused */
1345  inskey.nextkey = nextkey;
1346  inskey.pivotsearch = false;
1347  inskey.scantid = NULL;
1348  inskey.keysz = keysCount;
1349 
1350  /*
1351  * Use the manufactured insertion scan key to descend the tree and
1352  * position ourselves on the target leaf page.
1353  */
1354  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1355 
1356  /* don't need to keep the stack around... */
1357  _bt_freestack(stack);
1358 
1359  if (!BufferIsValid(buf))
1360  {
1361  /*
1362  * We only get here if the index is completely empty. Lock relation
1363  * because nothing finer to lock exists.
1364  */
1365  PredicateLockRelation(rel, scan->xs_snapshot);
1366 
1367  /*
1368  * mark parallel scan as done, so that all the workers can finish
1369  * their scan
1370  */
1371  _bt_parallel_done(scan);
1373 
1374  return false;
1375  }
1376  else
1378  scan->xs_snapshot);
1379 
1380  _bt_initialize_more_data(so, dir);
1381 
1382  /* position to the precise item on the page */
1383  offnum = _bt_binsrch(rel, &inskey, buf);
1384 
1385  /*
1386  * If nextkey = false, we are positioned at the first item >= scan key, or
1387  * possibly at the end of a page on which all the existing items are less
1388  * than the scan key and we know that everything on later pages is greater
1389  * than or equal to scan key.
1390  *
1391  * If nextkey = true, we are positioned at the first item > scan key, or
1392  * possibly at the end of a page on which all the existing items are less
1393  * than or equal to the scan key and we know that everything on later
1394  * pages is greater than scan key.
1395  *
1396  * The actually desired starting point is either this item or the prior
1397  * one, or in the end-of-page case it's the first item on the next page or
1398  * the last item on this page. Adjust the starting offset if needed. (If
1399  * this results in an offset before the first item or after the last one,
1400  * _bt_readpage will report no items found, and then we'll step to the
1401  * next page as needed.)
1402  */
1403  if (goback)
1404  offnum = OffsetNumberPrev(offnum);
1405 
1406  /* remember which buffer we have pinned, if any */
1408  so->currPos.buf = buf;
1409 
1410  /*
1411  * Now load data from the first page of the scan.
1412  */
1413  if (!_bt_readpage(scan, dir, offnum))
1414  {
1415  /*
1416  * There's no actually-matching data on this page. Try to advance to
1417  * the next page. Return false if there's no matching data at all.
1418  */
1419  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1420  if (!_bt_steppage(scan, dir))
1421  return false;
1422  }
1423  else
1424  {
1425  /* Drop the lock, and maybe the pin, on the current page */
1427  }
1428 
1429 readcomplete:
1430  /* OK, itemIndex says what to return */
1431  currItem = &so->currPos.items[so->currPos.itemIndex];
1432  scan->xs_heaptid = currItem->heapTid;
1433  if (scan->xs_want_itup)
1434  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1435 
1436  return true;
1437 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2521
#define BTORDER_PROC
Definition: nbtree.h:575
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:801
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:139
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:728
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2146
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:65
int itemIndex
Definition: nbtree.h:855
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2498
char * currTuples
Definition: nbtree.h:929
regproc RegProcedure
Definition: c.h:573
#define P_NONE
Definition: nbtree.h:206
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:879
struct SnapshotData * xs_snapshot
Definition: relscan.h:116
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:115
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:587
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:144
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2379
#define RegProcedureIsValid(p)
Definition: c.h:708
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:491
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1497
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:647
Oid * rd_opfamily
Definition: rel.h:190
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:668
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:742
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:957
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2469
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:163
uintptr_t Datum
Definition: postgres.h:367
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:800
#define SK_BT_DESC
Definition: nbtree.h:956
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:794
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:885
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
int numberOfKeys
Definition: nbtree.h:908
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
#define DatumGetPointer(X)
Definition: postgres.h:549
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1006
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
BTScanPosData currPos
Definition: nbtree.h:942
ScanKey keyData
Definition: nbtree.h:909
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:228
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1861
Buffer buf
Definition: nbtree.h:825
Oid * rd_opcintype
Definition: rel.h:191
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1509
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:101
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 2295 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage, BufferIsValid, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

2297 {
2298  Buffer buf;
2299  Page page;
2300  BTPageOpaque opaque;
2301  OffsetNumber offnum;
2302  BlockNumber blkno;
2303  IndexTuple itup;
2304 
2305  /*
2306  * If we are looking for a leaf page, okay to descend from fast root;
2307  * otherwise better descend from true root. (There is no point in being
2308  * smarter about intermediate levels.)
2309  */
2310  if (level == 0)
2311  buf = _bt_getroot(rel, BT_READ);
2312  else
2313  buf = _bt_gettrueroot(rel);
2314 
2315  if (!BufferIsValid(buf))
2316  return InvalidBuffer;
2317 
2318  page = BufferGetPage(buf);
2319  TestForOldSnapshot(snapshot, rel, page);
2320  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2321 
2322  for (;;)
2323  {
2324  /*
2325  * If we landed on a deleted page, step right to find a live page
2326  * (there must be one). Also, if we want the rightmost page, step
2327  * right if needed to get to it (this could happen if the page split
2328  * since we obtained a pointer to it).
2329  */
2330  while (P_IGNORE(opaque) ||
2331  (rightmost && !P_RIGHTMOST(opaque)))
2332  {
2333  blkno = opaque->btpo_next;
2334  if (blkno == P_NONE)
2335  elog(ERROR, "fell off the end of index \"%s\"",
2337  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2338  page = BufferGetPage(buf);
2339  TestForOldSnapshot(snapshot, rel, page);
2340  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2341  }
2342 
2343  /* Done? */
2344  if (opaque->btpo.level == level)
2345  break;
2346  if (opaque->btpo.level < level)
2347  ereport(ERROR,
2348  (errcode(ERRCODE_INDEX_CORRUPTED),
2349  errmsg_internal("btree level %u not found in index \"%s\"",
2350  level, RelationGetRelationName(rel))));
2351 
2352  /* Descend to leftmost or rightmost child page */
2353  if (rightmost)
2354  offnum = PageGetMaxOffsetNumber(page);
2355  else
2356  offnum = P_FIRSTDATAKEY(opaque);
2357 
2358  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2359  blkno = BTreeTupleGetDownLink(itup);
2360 
2361  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2362  page = BufferGetPage(buf);
2363  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2364  }
2365 
2366  return buf;
2367 }
BlockNumber btpo_next
Definition: nbtree.h:59
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:277
#define P_IGNORE(opaque)
Definition: nbtree.h:219
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:939
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
union BTPageOpaqueData::@45 btpo
#define P_NONE
Definition: nbtree.h:206
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:424
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:691
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:587
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:491
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:272
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:62
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:508
#define ereport(elevel,...)
Definition: elog.h:155
int errmsg_internal(const char *fmt,...)
Definition: elog.c:989
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define elog(elevel,...)
Definition: elog.h:228
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inlinestatic

Definition at line 2469 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

2470 {
2471  /* initialize moreLeft/moreRight appropriately for scan direction */
2472  if (ScanDirectionIsForward(dir))
2473  {
2474  so->currPos.moreLeft = false;
2475  so->currPos.moreRight = true;
2476  }
2477  else
2478  {
2479  so->currPos.moreLeft = true;
2480  so->currPos.moreRight = false;
2481  }
2482  so->numKilled = 0; /* just paranoia */
2483  so->markItemIndex = -1; /* ditto */
2484 }
bool moreRight
Definition: nbtree.h:838
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
bool moreLeft
Definition: nbtree.h:837
int markItemIndex
Definition: nbtree.h:939
BTScanPosData currPos
Definition: nbtree.h:942

◆ _bt_moveright()

Buffer _bt_moveright ( Relation  rel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access,
Snapshot  snapshot 
)

Definition at line 241 of file nbtsearch.c.

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage, elog, ERROR, BTScanInsertData::nextkey, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_search().

248 {
249  Page page;
250  BTPageOpaque opaque;
251  int32 cmpval;
252 
253  /*
254  * When nextkey = false (normal case): if the scan key that brought us to
255  * this page is > the high key stored on the page, then the page has split
256  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
257  * have some duplicates to the right as well as the left, but that's
258  * something that's only ever dealt with on the leaf level, after
259  * _bt_search has found an initial leaf page.)
260  *
261  * When nextkey = true: move right if the scan key is >= page's high key.
262  * (Note that key.scantid cannot be set in this case.)
263  *
264  * The page could even have split more than once, so scan as far as
265  * needed.
266  *
267  * We also have to move right if we followed a link that brought us to a
268  * dead page.
269  */
270  cmpval = key->nextkey ? 0 : 1;
271 
272  for (;;)
273  {
274  page = BufferGetPage(buf);
275  TestForOldSnapshot(snapshot, rel, page);
276  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
277 
278  if (P_RIGHTMOST(opaque))
279  break;
280 
281  /*
282  * Finish any incomplete splits we encounter along the way.
283  */
284  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
285  {
287 
288  /* upgrade our lock if necessary */
289  if (access == BT_READ)
290  {
291  _bt_unlockbuf(rel, buf);
292  _bt_lockbuf(rel, buf, BT_WRITE);
293  }
294 
295  if (P_INCOMPLETE_SPLIT(opaque))
296  _bt_finish_split(rel, buf, stack);
297  else
298  _bt_relbuf(rel, buf);
299 
300  /* re-acquire the lock in the right mode, and re-check */
301  buf = _bt_getbuf(rel, blkno, access);
302  continue;
303  }
304 
305  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
306  {
307  /* step right one page */
308  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
309  continue;
310  }
311  else
312  break;
313  }
314 
315  if (P_IGNORE(opaque))
316  elog(ERROR, "fell off the end of index \"%s\"",
318 
319  return buf;
320 }
BlockNumber btpo_next
Definition: nbtree.h:59
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:277
#define P_IGNORE(opaque)
Definition: nbtree.h:219
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:939
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:803
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:417
#define BT_READ
Definition: nbtree.h:587
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2196
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:491
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:644
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:975
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:959
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1006
#define P_HIKEY
Definition: nbtree.h:242
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
#define elog(elevel,...)
Definition: elog.h:228
#define BT_WRITE
Definition: nbtree.h:588
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
Pointer Page
Definition: bufpage.h:78

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1454 of file nbtsearch.c.

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

1455 {
1456  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1457  BTScanPosItem *currItem;
1458 
1459  /*
1460  * Advance to next tuple on current page; or if there's no more, try to
1461  * step to the next page with data.
1462  */
1463  if (ScanDirectionIsForward(dir))
1464  {
1465  if (++so->currPos.itemIndex > so->currPos.lastItem)
1466  {
1467  if (!_bt_steppage(scan, dir))
1468  return false;
1469  }
1470  }
1471  else
1472  {
1473  if (--so->currPos.itemIndex < so->currPos.firstItem)
1474  {
1475  if (!_bt_steppage(scan, dir))
1476  return false;
1477  }
1478  }
1479 
1480  /* OK, itemIndex says what to return */
1481  currItem = &so->currPos.items[so->currPos.itemIndex];
1482  scan->xs_heaptid = currItem->heapTid;
1483  if (scan->xs_want_itup)
1484  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1485 
1486  return true;
1487 }
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:139
int itemIndex
Definition: nbtree.h:855
char * currTuples
Definition: nbtree.h:929
int lastItem
Definition: nbtree.h:854
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
ItemPointerData xs_heaptid
Definition: relscan.h:144
IndexTupleData * IndexTuple
Definition: itup.h:53
int firstItem
Definition: nbtree.h:853
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
BTScanPosData currPos
Definition: nbtree.h:942
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1861

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2146 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), BTScanOpaqueData::currPos, and IndexScanDescData::opaque.

Referenced by _bt_first().

2147 {
2148  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2149 
2150  _bt_initialize_more_data(so, dir);
2151 
2152  if (!_bt_readnextpage(scan, blkno, dir))
2153  return false;
2154 
2155  /* Drop the lock, and maybe the pin, on the current page */
2157 
2158  return true;
2159 }
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:65
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2469
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1968
BTScanPosData currPos
Definition: nbtree.h:942

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 1968 of file nbtsearch.c.

References _bt_getbuf(), _bt_lockbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber, PageGetSpecialPointer, IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, status(), TestForOldSnapshot(), and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().

1969 {
1970  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1971  Relation rel;
1972  Page page;
1973  BTPageOpaque opaque;
1974  bool status;
1975 
1976  rel = scan->indexRelation;
1977 
1978  if (ScanDirectionIsForward(dir))
1979  {
1980  for (;;)
1981  {
1982  /*
1983  * if we're at end of scan, give up and mark parallel scan as
1984  * done, so that all the workers can finish their scan
1985  */
1986  if (blkno == P_NONE || !so->currPos.moreRight)
1987  {
1988  _bt_parallel_done(scan);
1990  return false;
1991  }
1992  /* check for interrupts while we're not holding any buffer lock */
1994  /* step right one page */
1995  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1996  page = BufferGetPage(so->currPos.buf);
1997  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1998  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1999  /* check for deleted page */
2000  if (!P_IGNORE(opaque))
2001  {
2002  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2003  /* see if there are any matches on this page */
2004  /* note that this will clear moreRight if we can stop */
2005  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
2006  break;
2007  }
2008  else if (scan->parallel_scan != NULL)
2009  {
2010  /* allow next page be processed by parallel worker */
2011  _bt_parallel_release(scan, opaque->btpo_next);
2012  }
2013 
2014  /* nope, keep going */
2015  if (scan->parallel_scan != NULL)
2016  {
2017  _bt_relbuf(rel, so->currPos.buf);
2018  status = _bt_parallel_seize(scan, &blkno);
2019  if (!status)
2020  {
2022  return false;
2023  }
2024  }
2025  else
2026  {
2027  blkno = opaque->btpo_next;
2028  _bt_relbuf(rel, so->currPos.buf);
2029  }
2030  }
2031  }
2032  else
2033  {
2034  /*
2035  * Should only happen in parallel cases, when some other backend
2036  * advanced the scan.
2037  */
2038  if (so->currPos.currPage != blkno)
2039  {
2041  so->currPos.currPage = blkno;
2042  }
2043 
2044  /*
2045  * Walk left to the next page with data. This is much more complex
2046  * than the walk-right case because of the possibility that the page
2047  * to our left splits while we are in flight to it, plus the
2048  * possibility that the page we were on gets deleted after we leave
2049  * it. See nbtree/README for details.
2050  *
2051  * It might be possible to rearrange this code to have less overhead
2052  * in pinning and locking, but that would require capturing the left
2053  * pointer when the page is initially read, and using it here, along
2054  * with big changes to _bt_walk_left() and the code below. It is not
2055  * clear whether this would be a win, since if the page immediately to
2056  * the left splits after we read this page and before we step left, we
2057  * would need to visit more pages than with the current code.
2058  *
2059  * Note that if we change the code so that we drop the pin for a scan
2060  * which uses a non-MVCC snapshot, we will need to modify the code for
2061  * walking left, to allow for the possibility that a referenced page
2062  * has been deleted. As long as the buffer is pinned or the snapshot
2063  * is MVCC the page cannot move past the half-dead state to fully
2064  * deleted.
2065  */
2066  if (BTScanPosIsPinned(so->currPos))
2067  _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2068  else
2069  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2070 
2071  for (;;)
2072  {
2073  /* Done if we know there are no matching keys to the left */
2074  if (!so->currPos.moreLeft)
2075  {
2076  _bt_relbuf(rel, so->currPos.buf);
2077  _bt_parallel_done(scan);
2079  return false;
2080  }
2081 
2082  /* Step to next physical page */
2083  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
2084  scan->xs_snapshot);
2085 
2086  /* if we're physically at end of index, return failure */
2087  if (so->currPos.buf == InvalidBuffer)
2088  {
2089  _bt_parallel_done(scan);
2091  return false;
2092  }
2093 
2094  /*
2095  * Okay, we managed to move left to a non-deleted page. Done if
2096  * it's not half-dead and contains matching tuples. Else loop back
2097  * and do it all again.
2098  */
2099  page = BufferGetPage(so->currPos.buf);
2100  TestForOldSnapshot(scan->xs_snapshot, rel, page);
2101  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2102  if (!P_IGNORE(opaque))
2103  {
2105  /* see if there are any matches on this page */
2106  /* note that this will clear moreLeft if we can stop */
2107  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
2108  break;
2109  }
2110  else if (scan->parallel_scan != NULL)
2111  {
2112  /* allow next page be processed by parallel worker */
2114  }
2115 
2116  /*
2117  * For parallel scans, get the last page scanned as it is quite
2118  * possible that by the time we try to seize the scan, some other
2119  * worker has already advanced the scan to a different page. We
2120  * must continue based on the latest page scanned by any worker.
2121  */
2122  if (scan->parallel_scan != NULL)
2123  {
2124  _bt_relbuf(rel, so->currPos.buf);
2125  status = _bt_parallel_seize(scan, &blkno);
2126  if (!status)
2127  {
2129  return false;
2130  }
2131  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2132  }
2133  }
2134  }
2135 
2136  return true;
2137 }
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2521
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:277
#define P_IGNORE(opaque)
Definition: nbtree.h:219
bool moreRight
Definition: nbtree.h:838
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:728
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:803
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define P_NONE
Definition: nbtree.h:206
#define InvalidBuffer
Definition: buf.h:25
BlockNumber currPage
Definition: nbtree.h:828
struct SnapshotData * xs_snapshot
Definition: relscan.h:116
bool moreLeft
Definition: nbtree.h:837
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Relation indexRelation
Definition: relscan.h:115
#define BT_READ
Definition: nbtree.h:587
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:862
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:647
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:163
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:975
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:959
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:885
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:705
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
BTScanPosData currPos
Definition: nbtree.h:942
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
Buffer buf
Definition: nbtree.h:825
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1509
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:873
Pointer Page
Definition: bufpage.h:78
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
Definition: nbtsearch.c:2176

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum 
)
static

Definition at line 1509 of file nbtsearch.c.

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_setuppostingitems(), Assert, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage, BufferIsValid, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().

1510 {
1511  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1512  Page page;
1513  BTPageOpaque opaque;
1514  OffsetNumber minoff;
1515  OffsetNumber maxoff;
1516  int itemIndex;
1517  bool continuescan;
1518  int indnatts;
1519 
1520  /*
1521  * We must have the buffer pinned and locked, but the usual macro can't be
1522  * used here; this function is what makes it good for currPos.
1523  */
1525 
1526  page = BufferGetPage(so->currPos.buf);
1527  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1528 
1529  /* allow next page be processed by parallel worker */
1530  if (scan->parallel_scan)
1531  {
1532  if (ScanDirectionIsForward(dir))
1533  _bt_parallel_release(scan, opaque->btpo_next);
1534  else
1536  }
1537 
1538  continuescan = true; /* default assumption */
1540  minoff = P_FIRSTDATAKEY(opaque);
1541  maxoff = PageGetMaxOffsetNumber(page);
1542 
1543  /*
1544  * We note the buffer's block number so that we can release the pin later.
1545  * This allows us to re-read the buffer if it is needed again for hinting.
1546  */
1548 
1549  /*
1550  * We save the LSN of the page as we read it, so that we know whether it
1551  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1552  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1553  */
1555 
1556  /*
1557  * we must save the page's right-link while scanning it; this tells us
1558  * where to step right to after we're done with these items. There is no
1559  * corresponding need for the left-link, since splits always go right.
1560  */
1561  so->currPos.nextPage = opaque->btpo_next;
1562 
1563  /* initialize tuple workspace to empty */
1564  so->currPos.nextTupleOffset = 0;
1565 
1566  /*
1567  * Now that the current page has been made consistent, the macro should be
1568  * good.
1569  */
1571 
1572  if (ScanDirectionIsForward(dir))
1573  {
1574  /* load items[] in ascending order */
1575  itemIndex = 0;
1576 
1577  offnum = Max(offnum, minoff);
1578 
1579  while (offnum <= maxoff)
1580  {
1581  ItemId iid = PageGetItemId(page, offnum);
1582  IndexTuple itup;
1583 
1584  /*
1585  * If the scan specifies not to return killed tuples, then we
1586  * treat a killed tuple as not passing the qual
1587  */
1588  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1589  {
1590  offnum = OffsetNumberNext(offnum);
1591  continue;
1592  }
1593 
1594  itup = (IndexTuple) PageGetItem(page, iid);
1595 
1596  if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1597  {
1598  /* tuple passes all scan key conditions */
1599  if (!BTreeTupleIsPosting(itup))
1600  {
1601  /* Remember it */
1602  _bt_saveitem(so, itemIndex, offnum, itup);
1603  itemIndex++;
1604  }
1605  else
1606  {
1607  int tupleOffset;
1608 
1609  /*
1610  * Set up state to return posting list, and remember first
1611  * TID
1612  */
1613  tupleOffset =
1614  _bt_setuppostingitems(so, itemIndex, offnum,
1615  BTreeTupleGetPostingN(itup, 0),
1616  itup);
1617  itemIndex++;
1618  /* Remember additional TIDs */
1619  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1620  {
1621  _bt_savepostingitem(so, itemIndex, offnum,
1622  BTreeTupleGetPostingN(itup, i),
1623  tupleOffset);
1624  itemIndex++;
1625  }
1626  }
1627  }
1628  /* When !continuescan, there can't be any more matches, so stop */
1629  if (!continuescan)
1630  break;
1631 
1632  offnum = OffsetNumberNext(offnum);
1633  }
1634 
1635  /*
1636  * We don't need to visit page to the right when the high key
1637  * indicates that no more matches will be found there.
1638  *
1639  * Checking the high key like this works out more often than you might
1640  * think. Leaf page splits pick a split point between the two most
1641  * dissimilar tuples (this is weighed against the need to evenly share
1642  * free space). Leaf pages with high key attribute values that can
1643  * only appear on non-pivot tuples on the right sibling page are
1644  * common.
1645  */
1646  if (continuescan && !P_RIGHTMOST(opaque))
1647  {
1648  ItemId iid = PageGetItemId(page, P_HIKEY);
1649  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1650  int truncatt;
1651 
1652  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1653  _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1654  }
1655 
1656  if (!continuescan)
1657  so->currPos.moreRight = false;
1658 
1659  Assert(itemIndex <= MaxTIDsPerBTreePage);
1660  so->currPos.firstItem = 0;
1661  so->currPos.lastItem = itemIndex - 1;
1662  so->currPos.itemIndex = 0;
1663  }
1664  else
1665  {
1666  /* load items[] in descending order */
1667  itemIndex = MaxTIDsPerBTreePage;
1668 
1669  offnum = Min(offnum, maxoff);
1670 
1671  while (offnum >= minoff)
1672  {
1673  ItemId iid = PageGetItemId(page, offnum);
1674  IndexTuple itup;
1675  bool tuple_alive;
1676  bool passes_quals;
1677 
1678  /*
1679  * If the scan specifies not to return killed tuples, then we
1680  * treat a killed tuple as not passing the qual. Most of the
1681  * time, it's a win to not bother examining the tuple's index
1682  * keys, but just skip to the next tuple (previous, actually,
1683  * since we're scanning backwards). However, if this is the first
1684  * tuple on the page, we do check the index keys, to prevent
1685  * uselessly advancing to the page to the left. This is similar
1686  * to the high key optimization used by forward scans.
1687  */
1688  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1689  {
1690  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1691  if (offnum > P_FIRSTDATAKEY(opaque))
1692  {
1693  offnum = OffsetNumberPrev(offnum);
1694  continue;
1695  }
1696 
1697  tuple_alive = false;
1698  }
1699  else
1700  tuple_alive = true;
1701 
1702  itup = (IndexTuple) PageGetItem(page, iid);
1703 
1704  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1705  &continuescan);
1706  if (passes_quals && tuple_alive)
1707  {
1708  /* tuple passes all scan key conditions */
1709  if (!BTreeTupleIsPosting(itup))
1710  {
1711  /* Remember it */
1712  itemIndex--;
1713  _bt_saveitem(so, itemIndex, offnum, itup);
1714  }
1715  else
1716  {
1717  int tupleOffset;
1718 
1719  /*
1720  * Set up state to return posting list, and remember first
1721  * TID.
1722  *
1723  * Note that we deliberately save/return items from
1724  * posting lists in ascending heap TID order for backwards
1725  * scans. This allows _bt_killitems() to make a
1726  * consistent assumption about the order of items
1727  * associated with the same posting list tuple.
1728  */
1729  itemIndex--;
1730  tupleOffset =
1731  _bt_setuppostingitems(so, itemIndex, offnum,
1732  BTreeTupleGetPostingN(itup, 0),
1733  itup);
1734  /* Remember additional TIDs */
1735  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1736  {
1737  itemIndex--;
1738  _bt_savepostingitem(so, itemIndex, offnum,
1739  BTreeTupleGetPostingN(itup, i),
1740  tupleOffset);
1741  }
1742  }
1743  }
1744  if (!continuescan)
1745  {
1746  /* there can't be any more matches, so stop */
1747  so->currPos.moreLeft = false;
1748  break;
1749  }
1750 
1751  offnum = OffsetNumberPrev(offnum);
1752  }
1753 
1754  Assert(itemIndex >= 0);
1755  so->currPos.firstItem = itemIndex;
1758  }
1759 
1760  return (so->currPos.firstItem <= so->currPos.lastItem);
1761 }
bool moreRight
Definition: nbtree.h:838
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
int itemIndex
Definition: nbtree.h:855
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define Min(x, y)
Definition: c.h:982
BlockNumber currPage
Definition: nbtree.h:828
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1795
#define MaxTIDsPerBTreePage
Definition: nbtree.h:179
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
bool moreLeft
Definition: nbtree.h:837
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:1833
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
bool ignore_killed_tuples
Definition: relscan.h:126
Relation indexRelation
Definition: relscan.h:115
uint16 OffsetNumber
Definition: off.h:24
int nextTupleOffset
Definition: nbtree.h:844
int lastItem
Definition: nbtree.h:854
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:2926
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:862
IndexTupleData * IndexTuple
Definition: itup.h:53
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1765
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:469
int firstItem
Definition: nbtree.h:853
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:163
bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1355
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:412
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
#define Max(x, y)
Definition: c.h:976
#define Assert(condition)
Definition: c.h:800
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:705
#define P_HIKEY
Definition: nbtree.h:242
BlockNumber nextPage
Definition: nbtree.h:829
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
BTScanPosData currPos
Definition: nbtree.h:942
int i
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:386
XLogRecPtr lsn
Definition: nbtree.h:827
Buffer buf
Definition: nbtree.h:825
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1765 of file nbtsearch.c.

References Assert, BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

1767 {
1768  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1769 
1770  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1771 
1772  currItem->heapTid = itup->t_tid;
1773  currItem->indexOffset = offnum;
1774  if (so->currTuples)
1775  {
1776  Size itupsz = IndexTupleSize(itup);
1777 
1778  currItem->tupleOffset = so->currPos.nextTupleOffset;
1779  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1780  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1781  }
1782 }
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
char * currTuples
Definition: nbtree.h:929
ItemPointerData t_tid
Definition: itup.h:37
OffsetNumber indexOffset
Definition: nbtree.h:819
int nextTupleOffset
Definition: nbtree.h:844
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
#define Assert(condition)
Definition: c.h:800
size_t Size
Definition: c.h:528
#define MAXALIGN(LEN)
Definition: c.h:753
ItemPointerData heapTid
Definition: nbtree.h:818
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
BTScanPosData currPos
Definition: nbtree.h:942
LocationIndex tupleOffset
Definition: nbtree.h:820
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_savepostingitem()

static void _bt_savepostingitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
int  tupleOffset 
)
inlinestatic

Definition at line 1833 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

1835 {
1836  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1837 
1838  currItem->heapTid = *heapTid;
1839  currItem->indexOffset = offnum;
1840 
1841  /*
1842  * Have index-only scans return the same base IndexTuple for every TID
1843  * that originates from the same posting list
1844  */
1845  if (so->currTuples)
1846  currItem->tupleOffset = tupleOffset;
1847 }
char * currTuples
Definition: nbtree.h:929
OffsetNumber indexOffset
Definition: nbtree.h:819
ItemPointerData heapTid
Definition: nbtree.h:818
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
BTScanPosData currPos
Definition: nbtree.h:942
LocationIndex tupleOffset
Definition: nbtree.h:820

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
BTScanInsert  key,
Buffer bufP,
int  access,
Snapshot  snapshot 
)

Definition at line 101 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageOpaqueData::btpo, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanInsertData::heapkeyspace, BTPageOpaqueData::level, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

103 {
104  BTStack stack_in = NULL;
105  int page_access = BT_READ;
106 
107  /* Get the root page to start with */
108  *bufP = _bt_getroot(rel, access);
109 
110  /* If index is empty and access = BT_READ, no root page is created. */
111  if (!BufferIsValid(*bufP))
112  return (BTStack) NULL;
113 
114  /* Loop iterates once per level descended in the tree */
115  for (;;)
116  {
117  Page page;
118  BTPageOpaque opaque;
119  OffsetNumber offnum;
120  ItemId itemid;
121  IndexTuple itup;
122  BlockNumber child;
123  BTStack new_stack;
124 
125  /*
126  * Race -- the page we just grabbed may have split since we read its
127  * downlink in its parent page (or the metapage). If it has, we may
128  * need to move right to its new sibling. Do that.
129  *
130  * In write-mode, allow _bt_moveright to finish any incomplete splits
131  * along the way. Strictly speaking, we'd only need to finish an
132  * incomplete split on the leaf page we're about to insert to, not on
133  * any of the upper levels (internal pages with incomplete splits are
134  * also taken care of in _bt_getstackbuf). But this is a good
135  * opportunity to finish splits of internal pages too.
136  */
137  *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
138  page_access, snapshot);
139 
140  /* if this is a leaf page, we're done */
141  page = BufferGetPage(*bufP);
142  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
143  if (P_ISLEAF(opaque))
144  break;
145 
146  /*
147  * Find the appropriate pivot tuple on this page. Its downlink points
148  * to the child page that we're about to descend to.
149  */
150  offnum = _bt_binsrch(rel, key, *bufP);
151  itemid = PageGetItemId(page, offnum);
152  itup = (IndexTuple) PageGetItem(page, itemid);
153  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
154  child = BTreeTupleGetDownLink(itup);
155 
156  /*
157  * We need to save the location of the pivot tuple we chose in a new
158  * stack entry for this page/level. If caller ends up splitting a
159  * page one level down, it usually ends up inserting a new pivot
160  * tuple/downlink immediately after the location recorded here.
161  */
162  new_stack = (BTStack) palloc(sizeof(BTStackData));
163  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
164  new_stack->bts_offset = offnum;
165  new_stack->bts_parent = stack_in;
166 
167  /*
168  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
169  * we're on the level 1 and asked to lock leaf page in write mode,
170  * then lock next page in write mode, because it must be a leaf.
171  */
172  if (opaque->btpo.level == 1 && access == BT_WRITE)
173  page_access = BT_WRITE;
174 
175  /* drop the read lock on the page, then acquire one on its child */
176  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
177 
178  /* okay, all set to move down a level */
179  stack_in = new_stack;
180  }
181 
182  /*
183  * If we're asked to lock leaf in write mode, but didn't manage to, then
184  * relock. This should only happen when the root page is a leaf page (and
185  * the only page in the index other than the metapage).
186  */
187  if (access == BT_WRITE && page_access == BT_READ)
188  {
189  /* trade in our read lock for a write lock */
190  _bt_unlockbuf(rel, *bufP);
191  _bt_lockbuf(rel, *bufP, BT_WRITE);
192 
193  /*
194  * Race -- the leaf page may have split after we dropped the read lock
195  * but before we acquired a write lock. If it has, we may need to
196  * move right to its new sibling. Do that.
197  */
198  *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
199  snapshot);
200  }
201 
202  return stack_in;
203 }
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:939
union BTPageOpaqueData::@45 btpo
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:424
BTStackData * BTStack
Definition: nbtree.h:607
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:587
OffsetNumber bts_offset
Definition: nbtree.h:603
IndexTupleData * IndexTuple
Definition: itup.h:53
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:272
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:62
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:975
BlockNumber bts_blkno
Definition: nbtree.h:602
Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:241
#define Assert(condition)
Definition: c.h:800
bool heapkeyspace
Definition: nbtree.h:659
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
struct BTStackData * bts_parent
Definition: nbtree.h:604
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1006
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
void * palloc(Size size)
Definition: mcxt.c:950
#define BT_WRITE
Definition: nbtree.h:588
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 1795 of file nbtsearch.c.

References Assert, BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, INDEX_SIZE_MASK, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

1797 {
1798  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1799 
1800  Assert(BTreeTupleIsPosting(itup));
1801 
1802  currItem->heapTid = *heapTid;
1803  currItem->indexOffset = offnum;
1804  if (so->currTuples)
1805  {
1806  /* Save base IndexTuple (truncate posting list) */
1807  IndexTuple base;
1808  Size itupsz = BTreeTupleGetPostingOffset(itup);
1809 
1810  itupsz = MAXALIGN(itupsz);
1811  currItem->tupleOffset = so->currPos.nextTupleOffset;
1812  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1813  memcpy(base, itup, itupsz);
1814  /* Defensively reduce work area index tuple header size */
1815  base->t_info &= ~INDEX_SIZE_MASK;
1816  base->t_info |= itupsz;
1817  so->currPos.nextTupleOffset += itupsz;
1818 
1819  return currItem->tupleOffset;
1820  }
1821 
1822  return 0;
1823 }
char * currTuples
Definition: nbtree.h:929
#define INDEX_SIZE_MASK
Definition: itup.h:65
OffsetNumber indexOffset
Definition: nbtree.h:819
int nextTupleOffset
Definition: nbtree.h:844
IndexTupleData * IndexTuple
Definition: itup.h:53
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:397
#define Assert(condition)
Definition: c.h:800
size_t Size
Definition: c.h:528
#define MAXALIGN(LEN)
Definition: c.h:753
ItemPointerData heapTid
Definition: nbtree.h:818
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
BTScanPosData currPos
Definition: nbtree.h:942
unsigned short t_info
Definition: itup.h:49
LocationIndex tupleOffset
Definition: nbtree.h:820

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1861 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, ScanDirectionIsForward, and status().

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

1862 {
1863  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1865  bool status;
1866 
1868 
1869  /* Before leaving current page, deal with any killed items */
1870  if (so->numKilled > 0)
1871  _bt_killitems(scan);
1872 
1873  /*
1874  * Before we modify currPos, make a copy of the page data if there was a
1875  * mark position that needs it.
1876  */
1877  if (so->markItemIndex >= 0)
1878  {
1879  /* bump pin on current buffer for assignment to mark buffer */
1880  if (BTScanPosIsPinned(so->currPos))
1882  memcpy(&so->markPos, &so->currPos,
1883  offsetof(BTScanPosData, items[1]) +
1884  so->currPos.lastItem * sizeof(BTScanPosItem));
1885  if (so->markTuples)
1886  memcpy(so->markTuples, so->currTuples,
1887  so->currPos.nextTupleOffset);
1888  so->markPos.itemIndex = so->markItemIndex;
1889  so->markItemIndex = -1;
1890  }
1891 
1892  if (ScanDirectionIsForward(dir))
1893  {
1894  /* Walk right to the next page with data */
1895  if (scan->parallel_scan != NULL)
1896  {
1897  /*
1898  * Seize the scan to get the next block number; if the scan has
1899  * ended already, bail out.
1900  */
1901  status = _bt_parallel_seize(scan, &blkno);
1902  if (!status)
1903  {
1904  /* release the previous buffer, if pinned */
1907  return false;
1908  }
1909  }
1910  else
1911  {
1912  /* Not parallel, so use the previously-saved nextPage link. */
1913  blkno = so->currPos.nextPage;
1914  }
1915 
1916  /* Remember we left a page with data */
1917  so->currPos.moreLeft = true;
1918 
1919  /* release the previous buffer, if pinned */
1921  }
1922  else
1923  {
1924  /* Remember we left a page with data */
1925  so->currPos.moreRight = true;
1926 
1927  if (scan->parallel_scan != NULL)
1928  {
1929  /*
1930  * Seize the scan to get the current block number; if the scan has
1931  * ended already, bail out.
1932  */
1933  status = _bt_parallel_seize(scan, &blkno);
1935  if (!status)
1936  {
1938  return false;
1939  }
1940  }
1941  else
1942  {
1943  /* Not parallel, so just use our own notion of the current page */
1944  blkno = so->currPos.currPage;
1945  }
1946  }
1947 
1948  if (!_bt_readnextpage(scan, blkno, dir))
1949  return false;
1950 
1951  /* Drop the lock, and maybe the pin, on the current page */
1953 
1954  return true;
1955 }
bool moreRight
Definition: nbtree.h:838
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:65
int itemIndex
Definition: nbtree.h:855
char * currTuples
Definition: nbtree.h:929
BlockNumber currPage
Definition: nbtree.h:828
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:879
uint32 BlockNumber
Definition: block.h:31
bool moreLeft
Definition: nbtree.h:837
int nextTupleOffset
Definition: nbtree.h:844
int lastItem
Definition: nbtree.h:854
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1718
BTScanPosData markPos
Definition: nbtree.h:943
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:862
char * markTuples
Definition: nbtree.h:930
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:647
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:163
int markItemIndex
Definition: nbtree.h:939
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1968
#define Assert(condition)
Definition: c.h:800
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:885
#define InvalidBlockNumber
Definition: block.h:33
BlockNumber nextPage
Definition: nbtree.h:829
BTScanPosData currPos
Definition: nbtree.h:942
Buffer buf
Definition: nbtree.h:825
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3551
#define offsetof(type, field)
Definition: c.h:723
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:873

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf,
Snapshot  snapshot 
)
static

Definition at line 2176 of file nbtsearch.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_readnextpage().

2177 {
2178  Page page;
2179  BTPageOpaque opaque;
2180 
2181  page = BufferGetPage(buf);
2182  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2183 
2184  for (;;)
2185  {
2186  BlockNumber obknum;
2187  BlockNumber lblkno;
2188  BlockNumber blkno;
2189  int tries;
2190 
2191  /* if we're at end of tree, release buf and return failure */
2192  if (P_LEFTMOST(opaque))
2193  {
2194  _bt_relbuf(rel, buf);
2195  break;
2196  }
2197  /* remember original page we are stepping left from */
2198  obknum = BufferGetBlockNumber(buf);
2199  /* step left */
2200  blkno = lblkno = opaque->btpo_prev;
2201  _bt_relbuf(rel, buf);
2202  /* check for interrupts while we're not holding any buffer lock */
2204  buf = _bt_getbuf(rel, blkno, BT_READ);
2205  page = BufferGetPage(buf);
2206  TestForOldSnapshot(snapshot, rel, page);
2207  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2208 
2209  /*
2210  * If this isn't the page we want, walk right till we find what we
2211  * want --- but go no more than four hops (an arbitrary limit). If we
2212  * don't find the correct page by then, the most likely bet is that
2213  * the original page got deleted and isn't in the sibling chain at all
2214  * anymore, not that its left sibling got split more than four times.
2215  *
2216  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2217  * because half-dead pages are still in the sibling chain. Caller
2218  * must reject half-dead pages if wanted.
2219  */
2220  tries = 0;
2221  for (;;)
2222  {
2223  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2224  {
2225  /* Found desired page, return it */
2226  return buf;
2227  }
2228  if (P_RIGHTMOST(opaque) || ++tries > 4)
2229  break;
2230  blkno = opaque->btpo_next;
2231  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2232  page = BufferGetPage(buf);
2233  TestForOldSnapshot(snapshot, rel, page);
2234  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2235  }
2236 
2237  /* Return to the original page to see what's up */
2238  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2239  page = BufferGetPage(buf);
2240  TestForOldSnapshot(snapshot, rel, page);
2241  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2242  if (P_ISDELETED(opaque))
2243  {
2244  /*
2245  * It was deleted. Move right to first nondeleted page (there
2246  * must be one); that is the page that has acquired the deleted
2247  * one's keyspace, so stepping left from it will take us where we
2248  * want to be.
2249  */
2250  for (;;)
2251  {
2252  if (P_RIGHTMOST(opaque))
2253  elog(ERROR, "fell off the end of index \"%s\"",
2255  blkno = opaque->btpo_next;
2256  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2257  page = BufferGetPage(buf);
2258  TestForOldSnapshot(snapshot, rel, page);
2259  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2260  if (!P_ISDELETED(opaque))
2261  break;
2262  }
2263 
2264  /*
2265  * Now return to top of loop, resetting obknum to point to this
2266  * nondeleted page, and try again.
2267  */
2268  }
2269  else
2270  {
2271  /*
2272  * It wasn't deleted; the explanation had better be that the page
2273  * to the left got split or deleted. Without this check, we'd go
2274  * into an infinite loop if there's anything wrong.
2275  */
2276  if (opaque->btpo_prev == lblkno)
2277  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2278  obknum, RelationGetRelationName(rel));
2279  /* Okay to try again with new lblkno value */
2280  }
2281  }
2282 
2283  return InvalidBuffer;
2284 }
BlockNumber btpo_next
Definition: nbtree.h:59
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:277
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:939
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:803
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
#define BT_READ
Definition: nbtree.h:587
#define ERROR
Definition: elog.h:43
BlockNumber btpo_prev
Definition: nbtree.h:58
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:491
#define P_LEFTMOST(opaque)
Definition: nbtree.h:212
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define P_ISDELETED(opaque)
Definition: nbtree.h:216
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:959
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
#define elog(elevel,...)
Definition: elog.h:228
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
Pointer Page
Definition: bufpage.h:78