PostgreSQL Source Code  git master
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 337 of file nbtsearch.c.

340 {
341  Page page;
342  BTPageOpaque opaque;
343  OffsetNumber low,
344  high;
345  int32 result,
346  cmpval;
347 
348  page = BufferGetPage(buf);
349  opaque = BTPageGetOpaque(page);
350 
351  /* Requesting nextkey semantics while using scantid seems nonsensical */
352  Assert(!key->nextkey || key->scantid == NULL);
353  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
354  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
355 
356  low = P_FIRSTDATAKEY(opaque);
357  high = PageGetMaxOffsetNumber(page);
358 
359  /*
360  * If there are no keys on the page, return the first available slot. Note
361  * this covers two cases: the page is really empty (no keys), or it
362  * contains only a high key. The latter case is possible after vacuuming.
363  * This can never happen on an internal page, however, since they are
364  * never empty (an internal page must have at least one child).
365  */
366  if (unlikely(high < low))
367  return low;
368 
369  /*
370  * Binary search to find the first key on the page >= scan key, or first
371  * key > scankey when nextkey is true.
372  *
373  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
374  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
375  *
376  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
377  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
378  *
379  * We can fall out when high == low.
380  */
381  high++; /* establish the loop invariant for high */
382 
383  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
384 
385  while (high > low)
386  {
387  OffsetNumber mid = low + ((high - low) / 2);
388 
389  /* We have low <= mid < high, so mid points at a real slot */
390 
391  result = _bt_compare(rel, key, page, mid);
392 
393  if (result >= cmpval)
394  low = mid + 1;
395  else
396  high = mid;
397  }
398 
399  /*
400  * At this point we have high == low.
401  *
402  * On a leaf page we always return the first non-pivot tuple >= scan key
403  * (resp. > scan key) for forward scan callers. For backward scans, it's
404  * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
405  */
406  if (P_ISLEAF(opaque))
407  {
408  /*
409  * In the backward scan case we're supposed to locate the last
410  * matching tuple on the leaf level -- not the first matching tuple
411  * (the last tuple will be the first one returned by the scan).
412  *
413  * At this point we've located the first non-pivot tuple immediately
414  * after the last matching tuple (which might just be maxoff + 1).
415  * Compensate by stepping back.
416  */
417  if (key->backward)
418  return OffsetNumberPrev(low);
419 
420  return low;
421  }
422 
423  /*
424  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
425  * There must be one if _bt_compare() is playing by the rules.
426  *
427  * _bt_compare() will seldom see any exactly-matching pivot tuples, since
428  * a truncated -inf heap TID is usually enough to prevent it altogether.
429  * Even omitted scan key entries are treated as > truncated attributes.
430  *
431  * However, during backward scans _bt_compare() interprets omitted scan
432  * key attributes as == corresponding truncated -inf attributes instead.
433  * This works just like < would work here. Under this scheme, < strategy
434  * backward scans will always directly descend to the correct leaf page.
435  * In particular, they will never incur an "extra" leaf page access with a
436  * scan key that happens to contain the same prefix of values as some
437  * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
438  * it uses a leaf page high key to "re-find" a page undergoing deletion.
439  */
440  Assert(low > P_FIRSTDATAKEY(opaque));
441 
442  return OffsetNumberPrev(low);
443 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:404
Pointer Page
Definition: bufpage.h:78
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
signed int int32
Definition: c.h:494
#define Assert(condition)
Definition: c.h:858
#define unlikely(x)
Definition: c.h:311
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:682
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:73

References _bt_compare(), Assert, BTPageGetOpaque, buf, BufferGetPage(), key, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 468 of file nbtsearch.c.

469 {
470  BTScanInsert key = insertstate->itup_key;
471  Page page;
472  BTPageOpaque opaque;
473  OffsetNumber low,
474  high,
475  stricthigh;
476  int32 result,
477  cmpval;
478 
479  page = BufferGetPage(insertstate->buf);
480  opaque = BTPageGetOpaque(page);
481 
482  Assert(P_ISLEAF(opaque));
483  Assert(!key->nextkey);
484  Assert(insertstate->postingoff == 0);
485 
486  if (!insertstate->bounds_valid)
487  {
488  /* Start new binary search */
489  low = P_FIRSTDATAKEY(opaque);
490  high = PageGetMaxOffsetNumber(page);
491  }
492  else
493  {
494  /* Restore result of previous binary search against same page */
495  low = insertstate->low;
496  high = insertstate->stricthigh;
497  }
498 
499  /* If there are no keys on the page, return the first available slot */
500  if (unlikely(high < low))
501  {
502  /* Caller can't reuse bounds */
503  insertstate->low = InvalidOffsetNumber;
504  insertstate->stricthigh = InvalidOffsetNumber;
505  insertstate->bounds_valid = false;
506  return low;
507  }
508 
509  /*
510  * Binary search to find the first key on the page >= scan key. (nextkey
511  * is always false when inserting).
512  *
513  * The loop invariant is: all slots before 'low' are < scan key, all slots
514  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
515  * maintained to save additional search effort for caller.
516  *
517  * We can fall out when high == low.
518  */
519  if (!insertstate->bounds_valid)
520  high++; /* establish the loop invariant for high */
521  stricthigh = high; /* high initially strictly higher */
522 
523  cmpval = 1; /* !nextkey comparison value */
524 
525  while (high > low)
526  {
527  OffsetNumber mid = low + ((high - low) / 2);
528 
529  /* We have low <= mid < high, so mid points at a real slot */
530 
531  result = _bt_compare(rel, key, page, mid);
532 
533  if (result >= cmpval)
534  low = mid + 1;
535  else
536  {
537  high = mid;
538  if (result != 0)
539  stricthigh = high;
540  }
541 
542  /*
543  * If tuple at offset located by binary search is a posting list whose
544  * TID range overlaps with caller's scantid, perform posting list
545  * binary search to set postingoff for caller. Caller must split the
546  * posting list when postingoff is set. This should happen
547  * infrequently.
548  */
549  if (unlikely(result == 0 && key->scantid != NULL))
550  {
551  /*
552  * postingoff should never be set more than once per leaf page
553  * binary search. That would mean that there are duplicate table
554  * TIDs in the index, which is never okay. Check for that here.
555  */
556  if (insertstate->postingoff != 0)
557  ereport(ERROR,
558  (errcode(ERRCODE_INDEX_CORRUPTED),
559  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
560  ItemPointerGetBlockNumber(key->scantid),
562  low, stricthigh,
563  BufferGetBlockNumber(insertstate->buf),
564  RelationGetRelationName(rel))));
565 
566  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
567  }
568  }
569 
570  /*
571  * On a leaf page, a binary search always returns the first key >= scan
572  * key (at least in !nextkey case), which could be the last slot + 1. This
573  * is also the lower bound of cached search.
574  *
575  * stricthigh may also be the last slot + 1, which prevents caller from
576  * using bounds directly, but is still useful to us if we're called a
577  * second time with cached bounds (cached low will be < stricthigh when
578  * that happens).
579  */
580  insertstate->low = low;
581  insertstate->stricthigh = stricthigh;
582  insertstate->bounds_valid = true;
583 
584  return low;
585 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3713
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode(int sqlerrcode)
Definition: elog.c:857
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:596
#define InvalidOffsetNumber
Definition: off.h:26
#define RelationGetRelationName(relation)
Definition: rel.h:539
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 596 of file nbtsearch.c.

597 {
598  IndexTuple itup;
599  ItemId itemid;
600  int low,
601  high,
602  mid,
603  res;
604 
605  /*
606  * If this isn't a posting tuple, then the index must be corrupt (if it is
607  * an ordinary non-pivot tuple then there must be an existing tuple with a
608  * heap TID that equals inserter's new heap TID/scantid). Defensively
609  * check that tuple is a posting list tuple whose posting list range
610  * includes caller's scantid.
611  *
612  * (This is also needed because contrib/amcheck's rootdescend option needs
613  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
614  */
615  itemid = PageGetItemId(page, offnum);
616  itup = (IndexTuple) PageGetItem(page, itemid);
617  if (!BTreeTupleIsPosting(itup))
618  return 0;
619 
620  Assert(key->heapkeyspace && key->allequalimage);
621 
622  /*
623  * In the event that posting list tuple has LP_DEAD bit set, indicate this
624  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
625  * second call to _bt_binsrch_insert() can take place when its caller has
626  * removed the dead item.
627  */
628  if (ItemIdIsDead(itemid))
629  return -1;
630 
631  /* "high" is past end of posting list for loop invariant */
632  low = 0;
633  high = BTreeTupleGetNPosting(itup);
634  Assert(high >= 2);
635 
636  while (high > low)
637  {
638  mid = low + ((high - low) / 2);
639  res = ItemPointerCompare(key->scantid,
640  BTreeTupleGetPostingN(itup, mid));
641 
642  if (res > 0)
643  low = mid + 1;
644  else if (res < 0)
645  high = mid;
646  else
647  return mid;
648  }
649 
650  /* Exact match not found */
651  return low;
652 }
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), sort-test::key, PageGetItem(), PageGetItemId(), and res.

Referenced by _bt_binsrch_insert().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 682 of file nbtsearch.c.

686 {
687  TupleDesc itupdesc = RelationGetDescr(rel);
688  BTPageOpaque opaque = BTPageGetOpaque(page);
689  IndexTuple itup;
690  ItemPointer heapTid;
691  ScanKey scankey;
692  int ncmpkey;
693  int ntupatts;
694  int32 result;
695 
696  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
698  Assert(key->heapkeyspace || key->scantid == NULL);
699 
700  /*
701  * Force result ">" if target item is first data item on an internal page
702  * --- see NOTE above.
703  */
704  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
705  return 1;
706 
707  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
708  ntupatts = BTreeTupleGetNAtts(itup, rel);
709 
710  /*
711  * The scan key is set up with the attribute number associated with each
712  * term in the key. It is important that, if the index is multi-key, the
713  * scan contain the first k key attributes, and that they be in order. If
714  * you think about how multi-key ordering works, you'll understand why
715  * this is.
716  *
717  * We don't test for violation of this condition here, however. The
718  * initial setup for the index scan had better have gotten it right (see
719  * _bt_first).
720  */
721 
722  ncmpkey = Min(ntupatts, key->keysz);
723  Assert(key->heapkeyspace || ncmpkey == key->keysz);
724  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
725  scankey = key->scankeys;
726  for (int i = 1; i <= ncmpkey; i++)
727  {
728  Datum datum;
729  bool isNull;
730 
731  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
732 
733  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
734  {
735  if (isNull)
736  result = 0; /* NULL "=" NULL */
737  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
738  result = -1; /* NULL "<" NOT_NULL */
739  else
740  result = 1; /* NULL ">" NOT_NULL */
741  }
742  else if (isNull) /* key is NOT_NULL and item is NULL */
743  {
744  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
745  result = 1; /* NOT_NULL ">" NULL */
746  else
747  result = -1; /* NOT_NULL "<" NULL */
748  }
749  else
750  {
751  /*
752  * The sk_func needs to be passed the index value as left arg and
753  * the sk_argument as right arg (they might be of different
754  * types). Since it is convenient for callers to think of
755  * _bt_compare as comparing the scankey to the index item, we have
756  * to flip the sign of the comparison result. (Unless it's a DESC
757  * column, in which case we *don't* flip the sign.)
758  */
759  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
760  scankey->sk_collation,
761  datum,
762  scankey->sk_argument));
763 
764  if (!(scankey->sk_flags & SK_BT_DESC))
765  INVERT_COMPARE_RESULT(result);
766  }
767 
768  /* if the keys are unequal, return the difference */
769  if (result != 0)
770  return result;
771 
772  scankey++;
773  }
774 
775  /*
776  * All non-truncated attributes (other than heap TID) were found to be
777  * equal. Treat truncated attributes as minus infinity when scankey has a
778  * key attribute value that would otherwise be compared directly.
779  *
780  * Note: it doesn't matter if ntupatts includes non-key attributes;
781  * scankey won't, so explicitly excluding non-key attributes isn't
782  * necessary.
783  */
784  if (key->keysz > ntupatts)
785  return 1;
786 
787  /*
788  * Use the heap TID attribute and scantid to try to break the tie. The
789  * rules are the same as any other key attribute -- only the
790  * representation differs.
791  */
792  heapTid = BTreeTupleGetHeapTID(itup);
793  if (key->scantid == NULL)
794  {
795  /*
796  * Forward scans have a scankey that is considered greater than a
797  * truncated pivot tuple if and when the scankey has equal values for
798  * attributes up to and including the least significant untruncated
799  * attribute in tuple. Even attributes that were omitted from the
800  * scan key are considered greater than -inf truncated attributes.
801  * (See _bt_binsrch for an explanation of our backward scan behavior.)
802  *
803  * For example, if an index has the minimum two attributes (single
804  * user key attribute, plus heap TID attribute), and a page's high key
805  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
806  * will not descend to the page to the left. The search will descend
807  * right instead. The truncated attribute in pivot tuple means that
808  * all non-pivot tuples on the page to the left are strictly < 'foo',
809  * so it isn't necessary to descend left. In other words, search
810  * doesn't have to descend left because it isn't interested in a match
811  * that has a heap TID value of -inf.
812  *
813  * Note: the heap TID part of the test ensures that scankey is being
814  * compared to a pivot tuple with one or more truncated -inf key
815  * attributes. The heap TID attribute is the last key attribute in
816  * every index, of course, but other than that it isn't special.
817  */
818  if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
819  key->heapkeyspace)
820  return 1;
821 
822  /* All provided scankey arguments found to be equal */
823  return 0;
824  }
825 
826  /*
827  * Treat truncated heap TID as minus infinity, since scankey has a key
828  * attribute value (scantid) that would otherwise be compared directly
829  */
831  if (heapTid == NULL)
832  return 1;
833 
834  /*
835  * Scankey must be treated as equal to a posting list tuple if its scantid
836  * value falls within the range of the posting list. In all other cases
837  * there can only be a single heap TID value, which is compared directly
838  * with scantid.
839  */
841  result = ItemPointerCompare(key->scantid, heapTid);
842  if (result <= 0 || !BTreeTupleIsPosting(itup))
843  return result;
844  else
845  {
846  result = ItemPointerCompare(key->scantid,
848  if (result > 0)
849  return 1;
850  }
851 
852  return 0;
853 }
#define Min(x, y)
Definition: c.h:1004
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1106
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int i
Definition: isn.c:73
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1128
#define SK_BT_DESC
Definition: nbtree.h:1127
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4921
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define RelationGetDescr(relation)
Definition: rel.h:531
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 61 of file nbtsearch.c.

62 {
63  _bt_unlockbuf(scan->indexRelation, sp->buf);
64 
65  if (IsMVCCSnapshot(scan->xs_snapshot) &&
67  !scan->xs_want_itup)
68  {
69  ReleaseBuffer(sp->buf);
70  sp->buf = InvalidBuffer;
71  }
72 }
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4896
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
Buffer buf
Definition: nbtree.h:953
Relation indexRelation
Definition: relscan.h:118
struct SnapshotData * xs_snapshot
Definition: relscan.h:119

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2560 of file nbtsearch.c.

2561 {
2562  Relation rel = scan->indexRelation;
2563  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2564  Buffer buf;
2565  Page page;
2566  BTPageOpaque opaque;
2568  BTScanPosItem *currItem;
2569 
2570  /*
2571  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2572  * version of _bt_search(). We don't maintain a stack since we know we
2573  * won't need it.
2574  */
2576 
2577  if (!BufferIsValid(buf))
2578  {
2579  /*
2580  * Empty index. Lock the whole relation, as nothing finer to lock
2581  * exists.
2582  */
2583  PredicateLockRelation(rel, scan->xs_snapshot);
2585  return false;
2586  }
2587 
2589  page = BufferGetPage(buf);
2590  opaque = BTPageGetOpaque(page);
2591  Assert(P_ISLEAF(opaque));
2592 
2593  if (ScanDirectionIsForward(dir))
2594  {
2595  /* There could be dead pages to the left, so not this: */
2596  /* Assert(P_LEFTMOST(opaque)); */
2597 
2598  start = P_FIRSTDATAKEY(opaque);
2599  }
2600  else if (ScanDirectionIsBackward(dir))
2601  {
2602  Assert(P_RIGHTMOST(opaque));
2603 
2604  start = PageGetMaxOffsetNumber(page);
2605  }
2606  else
2607  {
2608  elog(ERROR, "invalid scan direction: %d", (int) dir);
2609  start = 0; /* keep compiler quiet */
2610  }
2611 
2612  /* remember which buffer we have pinned */
2613  so->currPos.buf = buf;
2614 
2615  _bt_initialize_more_data(so, dir);
2616 
2617  /*
2618  * Now load data from the first page of the scan.
2619  */
2620  if (!_bt_readpage(scan, dir, start, true))
2621  {
2622  /*
2623  * There's no actually-matching data on this page. Try to advance to
2624  * the next page. Return false if there's no matching data at all.
2625  */
2626  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2627  if (!_bt_steppage(scan, dir))
2628  return false;
2629  }
2630  else
2631  {
2632  /* We have at least one item to return as scan's first item */
2634  }
2635 
2636  /* OK, itemIndex says what to return */
2637  currItem = &so->currPos.items[so->currPos.itemIndex];
2638  scan->xs_heaptid = currItem->heapTid;
2639  if (scan->xs_want_itup)
2640  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2641 
2642  return true;
2643 }
int Buffer
Definition: buf.h:23
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:355
#define elog(elevel,...)
Definition: elog.h:224
return str start
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1022
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1081
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2479
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:61
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2036
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
Definition: nbtsearch.c:1556
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2650
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2584
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2561
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
BTScanPosData currPos
Definition: nbtree.h:1077
char * currTuples
Definition: nbtree.h:1064
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:994
int itemIndex
Definition: nbtree.h:992
IndexTuple xs_itup
Definition: relscan.h:142
ItemPointerData xs_heaptid
Definition: relscan.h:147

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert, BTPageGetOpaque, BTScanPosInvalidate, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, start, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 876 of file nbtsearch.c.

877 {
878  Relation rel = scan->indexRelation;
879  BTScanOpaque so = (BTScanOpaque) scan->opaque;
880  Buffer buf;
881  BTStack stack;
882  OffsetNumber offnum;
883  StrategyNumber strat;
884  BTScanInsertData inskey;
885  ScanKey startKeys[INDEX_MAX_KEYS];
886  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
887  int keysz = 0;
888  int i;
889  bool status;
890  StrategyNumber strat_total;
891  BTScanPosItem *currItem;
892  BlockNumber blkno;
893 
894  Assert(!BTScanPosIsValid(so->currPos));
895 
896  pgstat_count_index_scan(rel);
897 
898  /*
899  * Examine the scan keys and eliminate any redundant keys; also mark the
900  * keys that must be matched to continue the scan.
901  */
902  _bt_preprocess_keys(scan);
903 
904  /*
905  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906  * never be satisfied (eg, x == 1 AND x > 2).
907  */
908  if (!so->qual_ok)
909  {
910  _bt_parallel_done(scan);
911  return false;
912  }
913 
914  /*
915  * For parallel scans, get the starting page from shared state. If the
916  * scan has not started, proceed to find out first leaf page in the usual
917  * way while keeping other participating processes waiting. If the scan
918  * has already begun, use the page number from the shared structure.
919  *
920  * When a parallel scan has another primitive index scan scheduled, a
921  * parallel worker will seize the scan for that purpose now. This is
922  * similar to the case where the top-level scan hasn't started.
923  */
924  if (scan->parallel_scan != NULL)
925  {
926  status = _bt_parallel_seize(scan, &blkno, true);
927 
928  /*
929  * Initialize arrays (when _bt_parallel_seize didn't already set up
930  * the next primitive index scan)
931  */
932  if (so->numArrayKeys && !so->needPrimScan)
933  _bt_start_array_keys(scan, dir);
934 
935  if (!status)
936  return false;
937  else if (blkno == P_NONE)
938  {
939  _bt_parallel_done(scan);
940  return false;
941  }
942  else if (blkno != InvalidBlockNumber)
943  {
944  if (!_bt_parallel_readpage(scan, blkno, dir))
945  return false;
946  goto readcomplete;
947  }
948  }
949  else if (so->numArrayKeys && !so->needPrimScan)
950  {
951  /*
952  * First _bt_first call (for current btrescan) without parallelism.
953  *
954  * Initialize arrays, and the corresponding scan keys that were just
955  * output by _bt_preprocess_keys.
956  */
957  _bt_start_array_keys(scan, dir);
958  }
959 
960  /*----------
961  * Examine the scan keys to discover where we need to start the scan.
962  *
963  * We want to identify the keys that can be used as starting boundaries;
964  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
965  * a backwards scan. We can use keys for multiple attributes so long as
966  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
967  * a > or < boundary or find an attribute with no boundary (which can be
968  * thought of as the same as "> -infinity"), we can't use keys for any
969  * attributes to its right, because it would break our simplistic notion
970  * of what initial positioning strategy to use.
971  *
972  * When the scan keys include cross-type operators, _bt_preprocess_keys
973  * may not be able to eliminate redundant keys; in such cases we will
974  * arbitrarily pick a usable one for each attribute. This is correct
975  * but possibly not optimal behavior. (For example, with keys like
976  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
977  * x=5 would be more efficient.) Since the situation only arises given
978  * a poorly-worded query plus an incomplete opfamily, live with it.
979  *
980  * When both equality and inequality keys appear for a single attribute
981  * (again, only possible when cross-type operators appear), we *must*
982  * select one of the equality keys for the starting point, because
983  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
984  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
985  * start at x=4, we will fail and stop before reaching x=10. If multiple
986  * equality quals survive preprocessing, however, it doesn't matter which
987  * one we use --- by definition, they are either redundant or
988  * contradictory.
989  *
990  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
991  * If the index stores nulls at the end of the index we'll be starting
992  * from, and we have no boundary key for the column (which means the key
993  * we deduced NOT NULL from is an inequality key that constrains the other
994  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
995  * use as a boundary key. If we didn't do this, we might find ourselves
996  * traversing a lot of null entries at the start of the scan.
997  *
998  * In this loop, row-comparison keys are treated the same as keys on their
999  * first (leftmost) columns. We'll add on lower-order columns of the row
1000  * comparison below, if possible.
1001  *
1002  * The selected scan keys (at most one per index column) are remembered by
1003  * storing their addresses into the local startKeys[] array.
1004  *
1005  * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1006  * the next primitive index scan (for scans with array keys) based in part
1007  * on an understanding of how it'll enable us to reposition the scan.
1008  * They're directly aware of how we'll sometimes cons up an explicit
1009  * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1010  * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1011  * scans to skip large groups of NULLs through repeated deductions about
1012  * key strictness (for a required inequality key) and whether NULLs in the
1013  * key's index column are stored last or first (relative to non-NULLs).
1014  * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1015  * need to be kept in sync.
1016  *----------
1017  */
1018  strat_total = BTEqualStrategyNumber;
1019  if (so->numberOfKeys > 0)
1020  {
1021  AttrNumber curattr;
1022  ScanKey chosen;
1023  ScanKey impliesNN;
1024  ScanKey cur;
1025 
1026  /*
1027  * chosen is the so-far-chosen key for the current attribute, if any.
1028  * We don't cast the decision in stone until we reach keys for the
1029  * next attribute.
1030  */
1031  curattr = 1;
1032  chosen = NULL;
1033  /* Also remember any scankey that implies a NOT NULL constraint */
1034  impliesNN = NULL;
1035 
1036  /*
1037  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1038  * pass to handle after-last-key processing. Actual exit from the
1039  * loop is at one of the "break" statements below.
1040  */
1041  for (cur = so->keyData, i = 0;; cur++, i++)
1042  {
1043  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1044  {
1045  /*
1046  * Done looking at keys for curattr. If we didn't find a
1047  * usable boundary key, see if we can deduce a NOT NULL key.
1048  */
1049  if (chosen == NULL && impliesNN != NULL &&
1050  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1051  ScanDirectionIsForward(dir) :
1052  ScanDirectionIsBackward(dir)))
1053  {
1054  /* Yes, so build the key in notnullkeys[keysz] */
1055  chosen = &notnullkeys[keysz];
1056  ScanKeyEntryInitialize(chosen,
1057  (SK_SEARCHNOTNULL | SK_ISNULL |
1058  (impliesNN->sk_flags &
1059  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1060  curattr,
1061  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1062  BTGreaterStrategyNumber :
1063  BTLessStrategyNumber),
1064  InvalidOid,
1065  InvalidOid,
1066  InvalidOid,
1067  (Datum) 0);
1068  }
1069 
1070  /*
1071  * If we still didn't find a usable boundary key, quit; else
1072  * save the boundary key pointer in startKeys.
1073  */
1074  if (chosen == NULL)
1075  break;
1076  startKeys[keysz++] = chosen;
1077 
1078  /*
1079  * Adjust strat_total, and quit if we have stored a > or <
1080  * key.
1081  */
1082  strat = chosen->sk_strategy;
1083  if (strat != BTEqualStrategyNumber)
1084  {
1085  strat_total = strat;
1086  if (strat == BTGreaterStrategyNumber ||
1087  strat == BTLessStrategyNumber)
1088  break;
1089  }
1090 
1091  /*
1092  * Done if that was the last attribute, or if next key is not
1093  * in sequence (implying no boundary key is available for the
1094  * next attribute).
1095  */
1096  if (i >= so->numberOfKeys ||
1097  cur->sk_attno != curattr + 1)
1098  break;
1099 
1100  /*
1101  * Reset for next attr.
1102  */
1103  curattr = cur->sk_attno;
1104  chosen = NULL;
1105  impliesNN = NULL;
1106  }
1107 
1108  /*
1109  * Can we use this key as a starting boundary for this attr?
1110  *
1111  * If not, does it imply a NOT NULL constraint? (Because
1112  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1113  * *any* inequality key works for that; we need not test.)
1114  */
1115  switch (cur->sk_strategy)
1116  {
1117  case BTLessStrategyNumber:
1118  case BTLessEqualStrategyNumber:
1119  if (chosen == NULL)
1120  {
1121  if (ScanDirectionIsBackward(dir))
1122  chosen = cur;
1123  else
1124  impliesNN = cur;
1125  }
1126  break;
1127  case BTEqualStrategyNumber:
1128  /* override any non-equality choice */
1129  chosen = cur;
1130  break;
1131  case BTGreaterEqualStrategyNumber:
1132  case BTGreaterStrategyNumber:
1133  if (chosen == NULL)
1134  {
1135  if (ScanDirectionIsForward(dir))
1136  chosen = cur;
1137  else
1138  impliesNN = cur;
1139  }
1140  break;
1141  }
1142  }
1143  }
1144 
1145  /*
1146  * If we found no usable boundary keys, we have to start from one end of
1147  * the tree. Walk down that edge to the first or last key, and scan from
1148  * there.
1149  */
1150  if (keysz == 0)
1151  {
1152  bool match;
1153 
1154  match = _bt_endpoint(scan, dir);
1155 
1156  if (!match)
1157  {
1158  /* No match, so mark (parallel) scan finished */
1159  _bt_parallel_done(scan);
1160  }
1161 
1162  return match;
1163  }
1164 
1165  /*
1166  * We want to start the scan somewhere within the index. Set up an
1167  * insertion scankey we can use to search for the boundary point we
1168  * identified above. The insertion scankey is built using the keys
1169  * identified by startKeys[]. (Remaining insertion scankey fields are
1170  * initialized after initial-positioning strategy is finalized.)
1171  */
1172  Assert(keysz <= INDEX_MAX_KEYS);
1173  for (i = 0; i < keysz; i++)
1174  {
1175  ScanKey cur = startKeys[i];
1176 
1177  Assert(cur->sk_attno == i + 1);
1178 
1179  if (cur->sk_flags & SK_ROW_HEADER)
1180  {
1181  /*
1182  * Row comparison header: look to the first row member instead.
1183  *
1184  * The member scankeys are already in insertion format (ie, they
1185  * have sk_func = 3-way-comparison function), but we have to watch
1186  * out for nulls, which _bt_preprocess_keys didn't check. A null
1187  * in the first row member makes the condition unmatchable, just
1188  * like qual_ok = false.
1189  */
1190  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1191 
1192  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1193  if (subkey->sk_flags & SK_ISNULL)
1194  {
1195  _bt_parallel_done(scan);
1196  return false;
1197  }
1198  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1199 
1200  /*
1201  * If the row comparison is the last positioning key we accepted,
1202  * try to add additional keys from the lower-order row members.
1203  * (If we accepted independent conditions on additional index
1204  * columns, we use those instead --- doesn't seem worth trying to
1205  * determine which is more restrictive.) Note that this is OK
1206  * even if the row comparison is of ">" or "<" type, because the
1207  * condition applied to all but the last row member is effectively
1208  * ">=" or "<=", and so the extra keys don't break the positioning
1209  * scheme. But, by the same token, if we aren't able to use all
1210  * the row members, then the part of the row comparison that we
1211  * did use has to be treated as just a ">=" or "<=" condition, and
1212  * so we'd better adjust strat_total accordingly.
1213  */
1214  if (i == keysz - 1)
1215  {
1216  bool used_all_subkeys = false;
1217 
1218  Assert(!(subkey->sk_flags & SK_ROW_END));
1219  for (;;)
1220  {
1221  subkey++;
1222  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1223  if (subkey->sk_attno != keysz + 1)
1224  break; /* out-of-sequence, can't use it */
1225  if (subkey->sk_strategy != cur->sk_strategy)
1226  break; /* wrong direction, can't use it */
1227  if (subkey->sk_flags & SK_ISNULL)
1228  break; /* can't use null keys */
1229  Assert(keysz < INDEX_MAX_KEYS);
1230  memcpy(inskey.scankeys + keysz, subkey,
1231  sizeof(ScanKeyData));
1232  keysz++;
1233  if (subkey->sk_flags & SK_ROW_END)
1234  {
1235  used_all_subkeys = true;
1236  break;
1237  }
1238  }
1239  if (!used_all_subkeys)
1240  {
1241  switch (strat_total)
1242  {
1243  case BTLessStrategyNumber:
1244  strat_total = BTLessEqualStrategyNumber;
1245  break;
1246  case BTGreaterStrategyNumber:
1247  strat_total = BTGreaterEqualStrategyNumber;
1248  break;
1249  }
1250  }
1251  break; /* done with outer loop */
1252  }
1253  }
1254  else
1255  {
1256  /*
1257  * Ordinary comparison key. Transform the search-style scan key
1258  * to an insertion scan key by replacing the sk_func with the
1259  * appropriate btree comparison function.
1260  *
1261  * If scankey operator is not a cross-type comparison, we can use
1262  * the cached comparison function; otherwise gotta look it up in
1263  * the catalogs. (That can't lead to infinite recursion, since no
1264  * indexscan initiated by syscache lookup will use cross-data-type
1265  * operators.)
1266  *
1267  * We support the convention that sk_subtype == InvalidOid means
1268  * the opclass input type; this is a hack to simplify life for
1269  * ScanKeyInit().
1270  */
1271  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1272  cur->sk_subtype == InvalidOid)
1273  {
1274  FmgrInfo *procinfo;
1275 
1276  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1277  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1278  cur->sk_flags,
1279  cur->sk_attno,
1280  InvalidStrategy,
1281  cur->sk_subtype,
1282  cur->sk_collation,
1283  procinfo,
1284  cur->sk_argument);
1285  }
1286  else
1287  {
1288  RegProcedure cmp_proc;
1289 
1290  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1291  rel->rd_opcintype[i],
1292  cur->sk_subtype,
1293  BTORDER_PROC);
1294  if (!RegProcedureIsValid(cmp_proc))
1295  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1296  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1297  cur->sk_attno, RelationGetRelationName(rel));
1298  ScanKeyEntryInitialize(inskey.scankeys + i,
1299  cur->sk_flags,
1300  cur->sk_attno,
1301  InvalidStrategy,
1302  cur->sk_subtype,
1303  cur->sk_collation,
1304  cmp_proc,
1305  cur->sk_argument);
1306  }
1307  }
1308  }
1309 
1310  /*----------
1311  * Examine the selected initial-positioning strategy to determine exactly
1312  * where we need to start the scan, and set flag variables to control the
1313  * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1314  * page _bt_search returns).
1315  *----------
1316  */
1317  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1318  inskey.anynullkeys = false; /* unused */
1319  inskey.scantid = NULL;
1320  inskey.keysz = keysz;
1321  switch (strat_total)
1322  {
1323  case BTLessStrategyNumber:
1324 
1325  inskey.nextkey = false;
1326  inskey.backward = true;
1327  break;
1328 
1329  case BTLessEqualStrategyNumber:
1330 
1331  inskey.nextkey = true;
1332  inskey.backward = true;
1333  break;
1334 
1335  case BTEqualStrategyNumber:
1336 
1337  /*
1338  * If a backward scan was specified, need to start with last equal
1339  * item not first one.
1340  */
1341  if (ScanDirectionIsBackward(dir))
1342  {
1343  /*
1344  * This is the same as the <= strategy
1345  */
1346  inskey.nextkey = true;
1347  inskey.backward = true;
1348  }
1349  else
1350  {
1351  /*
1352  * This is the same as the >= strategy
1353  */
1354  inskey.nextkey = false;
1355  inskey.backward = false;
1356  }
1357  break;
1358 
1359  case BTGreaterEqualStrategyNumber:
1360 
1361  /*
1362  * Find first item >= scankey
1363  */
1364  inskey.nextkey = false;
1365  inskey.backward = false;
1366  break;
1367 
1368  case BTGreaterStrategyNumber:
1369 
1370  /*
1371  * Find first item > scankey
1372  */
1373  inskey.nextkey = true;
1374  inskey.backward = false;
1375  break;
1376 
1377  default:
1378  /* can't get here, but keep compiler quiet */
1379  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1380  return false;
1381  }
1382 
1383  /*
1384  * Use the manufactured insertion scan key to descend the tree and
1385  * position ourselves on the target leaf page.
1386  */
1387  Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1388  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1389 
1390  /* don't need to keep the stack around... */
1391  _bt_freestack(stack);
1392 
1393  if (!BufferIsValid(buf))
1394  {
1395  /*
1396  * We only get here if the index is completely empty. Lock relation
1397  * because nothing finer to lock exists. Without a buffer lock, it's
1398  * possible for another transaction to insert data between
1399  * _bt_search() and PredicateLockRelation(). We have to try again
1400  * after taking the relation-level predicate lock, to close a narrow
1401  * window where we wouldn't scan concurrently inserted tuples, but the
1402  * writer wouldn't see our predicate lock.
1403  */
1404  if (IsolationIsSerializable())
1405  {
1406  PredicateLockRelation(rel, scan->xs_snapshot);
1407  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1408  _bt_freestack(stack);
1409  }
1410 
1411  if (!BufferIsValid(buf))
1412  {
1413  /*
1414  * Mark parallel scan as done, so that all the workers can finish
1415  * their scan.
1416  */
1417  _bt_parallel_done(scan);
1418  BTScanPosInvalidate(so->currPos);
1419  return false;
1420  }
1421  }
1422 
1422 
1423  PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1424 
1425  _bt_initialize_more_data(so, dir);
1426 
1427  /* position to the precise item on the page */
1428  offnum = _bt_binsrch(rel, &inskey, buf);
1429  Assert(!BTScanPosIsValid(so->currPos));
1430  so->currPos.buf = buf;
1431 
1432  /*
1433  * Now load data from the first page of the scan.
1434  *
1435  * If inskey.nextkey = false and inskey.backward = false, offnum is
1436  * positioned at the first non-pivot tuple >= inskey.scankeys.
1437  *
1438  * If inskey.nextkey = false and inskey.backward = true, offnum is
1439  * positioned at the last non-pivot tuple < inskey.scankeys.
1440  *
1441  * If inskey.nextkey = true and inskey.backward = false, offnum is
1442  * positioned at the first non-pivot tuple > inskey.scankeys.
1443  *
1444  * If inskey.nextkey = true and inskey.backward = true, offnum is
1445  * positioned at the last non-pivot tuple <= inskey.scankeys.
1446  *
1447  * It's possible that _bt_binsrch returned an offnum that is out of bounds
1448  * for the page. For example, when inskey is both < the leaf page's high
1449  * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1450  */
1451  if (!_bt_readpage(scan, dir, offnum, true))
1452  {
1453  /*
1454  * There's no actually-matching data on this page. Try to advance to
1455  * the next page. Return false if there's no matching data at all.
1456  */
1457  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1458  if (!_bt_steppage(scan, dir))
1459  return false;
1460  }
1461  else
1462  {
1463  /* We have at least one item to return as scan's first item */
1464  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1465  }
1466 
1467 readcomplete:
1468  /* OK, itemIndex says what to return */
1469  currItem = &so->currPos.items[so->currPos.itemIndex];
1470  scan->xs_heaptid = currItem->heapTid;
1471  if (scan->xs_want_itup)
1472  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1473 
1474  return true;
1475 }
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define RegProcedureIsValid(p)
Definition: c.h:777
regproc RegProcedure
Definition: c.h:650
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:860
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:796
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:732
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
Definition: nbtree.c:605
#define BTORDER_PROC
Definition: nbtree.h:707
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1016
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2334
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:337
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2560
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:96
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:221
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1343
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:2556
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:625
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
bool needPrimScan
Definition: nbtree.h:1049
ScanKey keyData
Definition: nbtree.h:1045
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_start_array_keys(), _bt_steppage(), _bt_unlockbuf(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid(), cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2479 of file nbtsearch.c.

2480 {
2481  Buffer buf;
2482  Page page;
2483  BTPageOpaque opaque;
2484  OffsetNumber offnum;
2485  BlockNumber blkno;
2486  IndexTuple itup;
2487 
2488  /*
2489  * If we are looking for a leaf page, okay to descend from fast root;
2490  * otherwise better descend from true root. (There is no point in being
2491  * smarter about intermediate levels.)
2492  */
2493  if (level == 0)
2494  buf = _bt_getroot(rel, NULL, BT_READ);
2495  else
2496  buf = _bt_gettrueroot(rel);
2497 
2498  if (!BufferIsValid(buf))
2499  return InvalidBuffer;
2500 
2501  page = BufferGetPage(buf);
2502  opaque = BTPageGetOpaque(page);
2503 
2504  for (;;)
2505  {
2506  /*
2507  * If we landed on a deleted page, step right to find a live page
2508  * (there must be one). Also, if we want the rightmost page, step
2509  * right if needed to get to it (this could happen if the page split
2510  * since we obtained a pointer to it).
2511  */
2512  while (P_IGNORE(opaque) ||
2513  (rightmost && !P_RIGHTMOST(opaque)))
2514  {
2515  blkno = opaque->btpo_next;
2516  if (blkno == P_NONE)
2517  elog(ERROR, "fell off the end of index \"%s\"",
2518  RelationGetRelationName(rel));
2519  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2520  page = BufferGetPage(buf);
2521  opaque = BTPageGetOpaque(page);
2522  }
2523 
2524  /* Done? */
2525  if (opaque->btpo_level == level)
2526  break;
2527  if (opaque->btpo_level < level)
2528  ereport(ERROR,
2529  (errcode(ERRCODE_INDEX_CORRUPTED),
2530  errmsg_internal("btree level %u not found in index \"%s\"",
2531  level, RelationGetRelationName(rel))));
2532 
2533  /* Descend to leftmost or rightmost child page */
2534  if (rightmost)
2535  offnum = PageGetMaxOffsetNumber(page);
2536  else
2537  offnum = P_FIRSTDATAKEY(opaque);
2538 
2539  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2540  blkno = BTreeTupleGetDownLink(itup);
2541 
2542  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2543  page = BufferGetPage(buf);
2544  opaque = BTPageGetOpaque(page);
2545  }
2546 
2547  return buf;
2548 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
BlockNumber btpo_next
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inlinestatic

Definition at line 2650 of file nbtsearch.c.

2651 {
2652  so->currPos.dir = dir;
2653  if (so->needPrimScan)
2654  {
2655  Assert(so->numArrayKeys);
2656 
2657  so->currPos.moreLeft = true;
2658  so->currPos.moreRight = true;
2659  so->needPrimScan = false;
2660  }
2661  else if (ScanDirectionIsForward(dir))
2662  {
2663  so->currPos.moreLeft = false;
2664  so->currPos.moreRight = true;
2665  }
2666  else
2667  {
2668  so->currPos.moreLeft = true;
2669  so->currPos.moreRight = false;
2670  }
2671  so->numKilled = 0; /* just paranoia */
2672  so->markItemIndex = -1; /* ditto */
2673 }
bool moreRight
Definition: nbtree.h:966
bool moreLeft
Definition: nbtree.h:965
ScanDirection dir
Definition: nbtree.h:975

References Assert, BTScanOpaqueData::currPos, BTScanPosData::dir, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

◆ _bt_moveright()

Buffer _bt_moveright ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access 
)

Definition at line 235 of file nbtsearch.c.

242 {
243  Page page;
244  BTPageOpaque opaque;
245  int32 cmpval;
246 
247  Assert(!forupdate || heaprel != NULL);
248 
249  /*
250  * When nextkey = false (normal case): if the scan key that brought us to
251  * this page is > the high key stored on the page, then the page has split
252  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
253  * have some duplicates to the right as well as the left, but that's
254  * something that's only ever dealt with on the leaf level, after
255  * _bt_search has found an initial leaf page.)
256  *
257  * When nextkey = true: move right if the scan key is >= page's high key.
258  * (Note that key.scantid cannot be set in this case.)
259  *
260  * The page could even have split more than once, so scan as far as
261  * needed.
262  *
263  * We also have to move right if we followed a link that brought us to a
264  * dead page.
265  */
266  cmpval = key->nextkey ? 0 : 1;
267 
268  for (;;)
269  {
270  page = BufferGetPage(buf);
271  opaque = BTPageGetOpaque(page);
272 
273  if (P_RIGHTMOST(opaque))
274  break;
275 
276  /*
277  * Finish any incomplete splits we encounter along the way.
278  */
279  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
280  {
281  BlockNumber blkno = BufferGetBlockNumber(buf);
282 
283  /* upgrade our lock if necessary */
284  if (access == BT_READ)
285  {
286  _bt_unlockbuf(rel, buf);
287  _bt_lockbuf(rel, buf, BT_WRITE);
288  }
289 
290  if (P_INCOMPLETE_SPLIT(opaque))
291  _bt_finish_split(rel, heaprel, buf, stack);
292  else
293  _bt_relbuf(rel, buf);
294 
295  /* re-acquire the lock in the right mode, and re-check */
296  buf = _bt_getbuf(rel, blkno, access);
297  continue;
298  }
299 
300  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
301  {
302  /* step right one page */
303  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
304  continue;
305  }
306  else
307  break;
308  }
309 
310  if (P_IGNORE(opaque))
311  elog(ERROR, "fell off the end of index \"%s\"",
312  RelationGetRelationName(rel));
313 
314  return buf;
315 }
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
#define P_HIKEY
Definition: nbtree.h:367
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BT_WRITE
Definition: nbtree.h:720
short access
Definition: preproc-type.c:36

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog, ERROR, sort-test::key, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1492 of file nbtsearch.c.

1493 {
1494  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1495  BTScanPosItem *currItem;
1496 
1497  /*
1498  * Advance to next tuple on current page; or if there's no more, try to
1499  * step to the next page with data.
1500  */
1501  if (ScanDirectionIsForward(dir))
1502  {
1503  if (++so->currPos.itemIndex > so->currPos.lastItem)
1504  {
1505  if (!_bt_steppage(scan, dir))
1506  return false;
1507  }
1508  }
1509  else
1510  {
1511  if (--so->currPos.itemIndex < so->currPos.firstItem)
1512  {
1513  if (!_bt_steppage(scan, dir))
1514  return false;
1515  }
1516  }
1517 
1518  /* OK, itemIndex says what to return */
1519  currItem = &so->currPos.items[so->currPos.itemIndex];
1520  scan->xs_heaptid = currItem->heapTid;
1521  if (scan->xs_want_itup)
1522  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1523 
1524  return true;
1525 }
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
int firstItem
Definition: nbtree.h:990
int lastItem
Definition: nbtree.h:991

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2334 of file nbtsearch.c.

/*
 * Read a page for a parallel scan starting at blkno, loading any matching
 * items into so->currPos.  Returns false when the scan has no more pages.
 */
2335 {
2336  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2337 
      /* Callers never arrive here needing to start a new primitive scan */
2338  Assert(!so->needPrimScan);
2339 
      /* Reset moreLeft/moreRight and related per-scan state for 'dir' */
2340  _bt_initialize_more_data(so, dir);
2341 
2342  if (!_bt_readnextpage(scan, blkno, dir))
2343  return false;
2344 
2345  /* We have at least one item to return as scan's next item */
      /*
       * NOTE(review): original line 2346 was lost in extraction here; per the
       * reference list it is presumably the _bt_drop_lock_and_maybe_pin()
       * call that releases the buffer lock -- confirm against upstream
       * nbtsearch.c.
       */
2347 
2348  return true;
2349 }
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2168

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), Assert, BTScanOpaqueData::currPos, BTScanOpaqueData::needPrimScan, and IndexScanDescData::opaque.

Referenced by _bt_first().

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2168 of file nbtsearch.c.

/*
 * Step to the next page in direction 'dir', and load its matching items
 * into so->currPos via _bt_readpage.  Handles both serial and parallel
 * scans; for parallel scans the next block must be (re)seized because
 * another worker may already have advanced the scan.  Returns false when
 * there are no further matches in this direction.
 *
 * NOTE(review): several original statements were lost in extraction
 * (embedded source lines 2189, 2193, 2220, 2239, 2268, 2279, 2292, 2301
 * and 2316 are missing -- apparently BTScanPosInvalidate /
 * BTScanPosUnpinIfPinned / CHECK_FOR_INTERRUPTS / PredicateLockPage /
 * _bt_parallel_release calls, per the reference list).  Confirm against
 * upstream nbtsearch.c before relying on this listing.
 */
2169 {
2170  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2171  Relation rel;
2172  Page page;
2173  BTPageOpaque opaque;
2174  bool status;
2175 
2176  rel = scan->indexRelation;
2177 
2178  if (ScanDirectionIsForward(dir))
2179  {
2180  for (;;)
2181  {
2182  /*
2183  * if we're at end of scan, give up and mark parallel scan as
2184  * done, so that all the workers can finish their scan
2185  */
2186  if (blkno == P_NONE || !so->currPos.moreRight)
2187  {
2188  _bt_parallel_done(scan);
2190  return false;
2191  }
2192  /* check for interrupts while we're not holding any buffer lock */
2194  /* step right one page */
2195  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2196  page = BufferGetPage(so->currPos.buf);
2197  opaque = BTPageGetOpaque(page);
2198  /* check for deleted page */
2199  if (!P_IGNORE(opaque))
2200  {
2201  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2202  /* see if there are any matches on this page */
2203  /* note that this will clear moreRight if we can stop */
2204  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2205  break;
2206  }
2207  else if (scan->parallel_scan != NULL)
2208  {
2209  /* allow next page be processed by parallel worker */
2210  _bt_parallel_release(scan, opaque->btpo_next);
2211  }
2212 
2213  /* nope, keep going */
2214  if (scan->parallel_scan != NULL)
2215  {
2216  _bt_relbuf(rel, so->currPos.buf);
2217  status = _bt_parallel_seize(scan, &blkno, false);
2218  if (!status)
2219  {
2221  return false;
2222  }
2223  }
2224  else
2225  {
2226  blkno = opaque->btpo_next;
2227  _bt_relbuf(rel, so->currPos.buf);
2228  }
2229  }
2230  }
2231  else
2232  {
2233  /*
2234  * Should only happen in parallel cases, when some other backend
2235  * advanced the scan.
2236  */
2237  if (so->currPos.currPage != blkno)
2238  {
2240  so->currPos.currPage = blkno;
2241  }
2242 
2243  /*
2244  * Walk left to the next page with data. This is much more complex
2245  * than the walk-right case because of the possibility that the page
2246  * to our left splits while we are in flight to it, plus the
2247  * possibility that the page we were on gets deleted after we leave
2248  * it. See nbtree/README for details.
2249  *
2250  * It might be possible to rearrange this code to have less overhead
2251  * in pinning and locking, but that would require capturing the left
2252  * sibling block number when the page is initially read, and then
2253  * optimistically starting there (rather than pinning the page twice).
2254  * It is not clear that this would be worth the complexity.
2255  */
2256  if (BTScanPosIsPinned(so->currPos))
2257  _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2258  else
2259  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2260 
2261  for (;;)
2262  {
2263  /* Done if we know there are no matching keys to the left */
2264  if (!so->currPos.moreLeft)
2265  {
2266  _bt_relbuf(rel, so->currPos.buf);
2267  _bt_parallel_done(scan);
2269  return false;
2270  }
2271 
2272  /* Step to next physical page */
2273  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2274 
2275  /* if we're physically at end of index, return failure */
2276  if (so->currPos.buf == InvalidBuffer)
2277  {
2278  _bt_parallel_done(scan);
2280  return false;
2281  }
2282 
2283  /*
2284  * Okay, we managed to move left to a non-deleted page. Done if
2285  * it's not half-dead and contains matching tuples. Else loop back
2286  * and do it all again.
2287  */
2288  page = BufferGetPage(so->currPos.buf);
2289  opaque = BTPageGetOpaque(page);
2290  if (!P_IGNORE(opaque))
2291  {
2293  /* see if there are any matches on this page */
2294  /* note that this will clear moreLeft if we can stop */
2295  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2296  break;
2297  }
2298  else if (scan->parallel_scan != NULL)
2299  {
2300  /* allow next page be processed by parallel worker */
2302  }
2303 
2304  /*
2305  * For parallel scans, get the last page scanned as it is quite
2306  * possible that by the time we try to seize the scan, some other
2307  * worker has already advanced the scan to a different page. We
2308  * must continue based on the latest page scanned by any worker.
2309  */
2310  if (scan->parallel_scan != NULL)
2311  {
2312  _bt_relbuf(rel, so->currPos.buf);
2313  status = _bt_parallel_seize(scan, &blkno, false);
2314  if (!status)
2315  {
2317  return false;
2318  }
2319  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2320  }
2321  }
2322  }
2323 
2324  return true;
2325 }
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:709
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:999
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1010
static Buffer _bt_walk_left(Relation rel, Buffer buf)
Definition: nbtsearch.c:2365
BlockNumber currPage
Definition: nbtree.h:956

References _bt_getbuf(), _bt_lockbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTPageGetOpaque, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, if(), IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum,
bool  firstPage 
)
static

Definition at line 1556 of file nbtsearch.c.

/*
 * Scan one leaf page (already pinned and read-locked in so->currPos.buf),
 * starting at 'offnum', saving every tuple that satisfies the scan keys
 * into so->currPos.items[].  Returns true if at least one item was saved.
 * Clears so->currPos.moreRight (forward) or moreLeft (backward) when
 * _bt_checkkeys determines no further matches can exist in that direction.
 *
 * NOTE(review): embedded source lines 1573, 1589, 1612, 1619, 1635 and
 * 1931-1932 were lost in extraction (apparently a pin/lock Assert, the
 * indnatts = IndexRelationGetNumberOfAttributes(...) assignment, the
 * currPos.currPage and currPos.lsn captures, a BTScanPosIsPinned Assert,
 * and the backward-scan lastItem/itemIndex initialization).  Confirm
 * against upstream nbtsearch.c.
 */
1558 {
1559  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1560  Page page;
1561  BTPageOpaque opaque;
1562  OffsetNumber minoff;
1563  OffsetNumber maxoff;
1564  BTReadPageState pstate;
1565  bool arrayKeys;
1566  int itemIndex,
1567  indnatts;
1568 
1569  /*
1570  * We must have the buffer pinned and locked, but the usual macro can't be
1571  * used here; this function is what makes it good for currPos.
1572  */
1574 
1575  page = BufferGetPage(so->currPos.buf);
1576  opaque = BTPageGetOpaque(page);
1577 
1578  /* allow next page be processed by parallel worker */
1579  if (scan->parallel_scan)
1580  {
1581  if (ScanDirectionIsForward(dir))
1582  pstate.prev_scan_page = opaque->btpo_next;
1583  else
1584  pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
1585 
1586  _bt_parallel_release(scan, pstate.prev_scan_page);
1587  }
1588 
      /* NOTE(review): indnatts assignment (line 1589) elided by extraction */
1590  arrayKeys = so->numArrayKeys != 0;
1591  minoff = P_FIRSTDATAKEY(opaque);
1592  maxoff = PageGetMaxOffsetNumber(page);
1593 
1594  /* initialize page-level state that we'll pass to _bt_checkkeys */
1595  pstate.dir = dir;
1596  pstate.minoff = minoff;
1597  pstate.maxoff = maxoff;
1598  pstate.finaltup = NULL;
1599  pstate.page = page;
1600  pstate.offnum = InvalidOffsetNumber;
1601  pstate.skip = InvalidOffsetNumber;
1602  pstate.continuescan = true; /* default assumption */
1603  pstate.prechecked = false;
1604  pstate.firstmatch = false;
1605  pstate.rechecks = 0;
1606  pstate.targetdistance = 0;
1607 
1608  /*
1609  * We note the buffer's block number so that we can release the pin later.
1610  * This allows us to re-read the buffer if it is needed again for hinting.
1611  */
1613 
1614  /*
1615  * We save the LSN of the page as we read it, so that we know whether it
1616  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1617  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1618  */
1620 
1621  /*
1622  * we must save the page's right-link while scanning it; this tells us
1623  * where to step right to after we're done with these items. There is no
1624  * corresponding need for the left-link, since splits always go right.
1625  */
1626  so->currPos.nextPage = opaque->btpo_next;
1627 
1628  /* initialize tuple workspace to empty */
1629  so->currPos.nextTupleOffset = 0;
1630 
1631  /*
1632  * Now that the current page has been made consistent, the macro should be
1633  * good.
1634  */
1636 
1637  /*
1638  * Prechecking the value of the continuescan flag for the last item on the
1639  * page (for backwards scan it will be the first item on a page). If we
1640  * observe it to be true, then it should be true for all other items. This
1641  * allows us to do significant optimizations in the _bt_checkkeys()
1642  * function for all the items on the page.
1643  *
1644  * With the forward scan, we do this check for the last item on the page
1645  * instead of the high key. It's relatively likely that the most
1646  * significant column in the high key will be different from the
1647  * corresponding value from the last item on the page. So checking with
1648  * the last item on the page would give a more precise answer.
1649  *
1650  * We skip this for the first page read by each (primitive) scan, to avoid
1651  * slowing down point queries. They typically don't stand to gain much
1652  * when the optimization can be applied, and are more likely to notice the
1653  * overhead of the precheck.
1654  *
1655  * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1656  * just set a low-order required array's key to the best available match
1657  * for a truncated -inf attribute value from the prior page's high key
1658  * (array element 0 is always the best available match in this scenario).
1659  * It's quite likely that matches for array element 0 begin on this page,
1660  * but the start of matches won't necessarily align with page boundaries.
1661  * When the start of matches is somewhere in the middle of this page, it
1662  * would be wrong to treat page's final non-pivot tuple as representative.
1663  * Doing so might lead us to treat some of the page's earlier tuples as
1664  * being part of a group of tuples thought to satisfy the required keys.
1665  *
1666  * Note: Conversely, in the case where the scan's arrays just advanced
1667  * using the prior page's HIKEY _without_ advancement setting scanBehind,
1668  * the start of matches must be aligned with page boundaries, which makes
1669  * it safe to attempt the optimization here now. It's also safe when the
1670  * prior page's HIKEY simply didn't need to advance any required array. In
1671  * both cases we can safely assume that the _first_ tuple from this page
1672  * must be >= the current set of array keys/equality constraints. And so
1673  * if the final tuple is == those same keys (and also satisfies any
1674  * required < or <= strategy scan keys) during the precheck, we can safely
1675  * assume that this must also be true of all earlier tuples from the page.
1676  */
1677  if (!firstPage && !so->scanBehind && minoff < maxoff)
1678  {
1679  ItemId iid;
1680  IndexTuple itup;
1681 
1682  iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1683  itup = (IndexTuple) PageGetItem(page, iid);
1684 
1685  /* Call with arrayKeys=false to avoid undesirable side-effects */
1686  _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1687  pstate.prechecked = pstate.continuescan;
1688  pstate.continuescan = true; /* reset */
1689  }
1690 
1691  if (ScanDirectionIsForward(dir))
1692  {
1693  /* SK_SEARCHARRAY forward scans must provide high key up front */
1694  if (arrayKeys && !P_RIGHTMOST(opaque))
1695  {
1696  ItemId iid = PageGetItemId(page, P_HIKEY);
1697 
1698  pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1699  }
1700 
1701  /* load items[] in ascending order */
1702  itemIndex = 0;
1703 
1704  offnum = Max(offnum, minoff);
1705 
1706  while (offnum <= maxoff)
1707  {
1708  ItemId iid = PageGetItemId(page, offnum);
1709  IndexTuple itup;
1710  bool passes_quals;
1711 
1712  /*
1713  * If the scan specifies not to return killed tuples, then we
1714  * treat a killed tuple as not passing the qual
1715  */
1716  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1717  {
1718  offnum = OffsetNumberNext(offnum);
1719  continue;
1720  }
1721 
1722  itup = (IndexTuple) PageGetItem(page, iid);
1723  Assert(!BTreeTupleIsPivot(itup));
1724 
1725  pstate.offnum = offnum;
1726  passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1727  itup, indnatts);
1728 
1729  /*
1730  * Check if we need to skip ahead to a later tuple (only possible
1731  * when the scan uses array keys)
1732  */
1733  if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1734  {
1735  Assert(!passes_quals && pstate.continuescan);
1736  Assert(offnum < pstate.skip);
1737 
1738  offnum = pstate.skip;
1739  pstate.skip = InvalidOffsetNumber;
1740  continue;
1741  }
1742 
1743  if (passes_quals)
1744  {
1745  /* tuple passes all scan key conditions */
1746  pstate.firstmatch = true;
1747  if (!BTreeTupleIsPosting(itup))
1748  {
1749  /* Remember it */
1750  _bt_saveitem(so, itemIndex, offnum, itup);
1751  itemIndex++;
1752  }
1753  else
1754  {
1755  int tupleOffset;
1756 
1757  /*
1758  * Set up state to return posting list, and remember first
1759  * TID
1760  */
1761  tupleOffset =
1762  _bt_setuppostingitems(so, itemIndex, offnum,
1763  BTreeTupleGetPostingN(itup, 0),
1764  itup);
1765  itemIndex++;
1766  /* Remember additional TIDs */
1767  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1768  {
1769  _bt_savepostingitem(so, itemIndex, offnum,
1770  BTreeTupleGetPostingN(itup, i),
1771  tupleOffset);
1772  itemIndex++;
1773  }
1774  }
1775  }
1776  /* When !continuescan, there can't be any more matches, so stop */
1777  if (!pstate.continuescan)
1778  break;
1779 
1780  offnum = OffsetNumberNext(offnum);
1781  }
1782 
1783  /*
1784  * We don't need to visit page to the right when the high key
1785  * indicates that no more matches will be found there.
1786  *
1787  * Checking the high key like this works out more often than you might
1788  * think. Leaf page splits pick a split point between the two most
1789  * dissimilar tuples (this is weighed against the need to evenly share
1790  * free space). Leaf pages with high key attribute values that can
1791  * only appear on non-pivot tuples on the right sibling page are
1792  * common.
1793  */
1794  if (pstate.continuescan && !P_RIGHTMOST(opaque))
1795  {
1796  ItemId iid = PageGetItemId(page, P_HIKEY);
1797  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1798  int truncatt;
1799 
1800  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1801  pstate.prechecked = false; /* precheck didn't cover HIKEY */
1802  _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1803  }
1804 
1805  if (!pstate.continuescan)
1806  so->currPos.moreRight = false;
1807 
1808  Assert(itemIndex <= MaxTIDsPerBTreePage);
1809  so->currPos.firstItem = 0;
1810  so->currPos.lastItem = itemIndex - 1;
1811  so->currPos.itemIndex = 0;
1812  }
1813  else
1814  {
1815  /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1816  if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1817  {
1818  ItemId iid = PageGetItemId(page, minoff);
1819 
1820  pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1821  }
1822 
1823  /* load items[] in descending order */
1824  itemIndex = MaxTIDsPerBTreePage;
1825 
1826  offnum = Min(offnum, maxoff);
1827 
1828  while (offnum >= minoff)
1829  {
1830  ItemId iid = PageGetItemId(page, offnum);
1831  IndexTuple itup;
1832  bool tuple_alive;
1833  bool passes_quals;
1834 
1835  /*
1836  * If the scan specifies not to return killed tuples, then we
1837  * treat a killed tuple as not passing the qual. Most of the
1838  * time, it's a win to not bother examining the tuple's index
1839  * keys, but just skip to the next tuple (previous, actually,
1840  * since we're scanning backwards). However, if this is the first
1841  * tuple on the page, we do check the index keys, to prevent
1842  * uselessly advancing to the page to the left. This is similar
1843  * to the high key optimization used by forward scans.
1844  */
1845  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1846  {
1847  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1848  if (offnum > P_FIRSTDATAKEY(opaque))
1849  {
1850  offnum = OffsetNumberPrev(offnum);
1851  continue;
1852  }
1853 
1854  tuple_alive = false;
1855  }
1856  else
1857  tuple_alive = true;
1858 
1859  itup = (IndexTuple) PageGetItem(page, iid);
1860  Assert(!BTreeTupleIsPivot(itup));
1861 
1862  pstate.offnum = offnum;
1863  passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1864  itup, indnatts);
1865 
1866  /*
1867  * Check if we need to skip ahead to a later tuple (only possible
1868  * when the scan uses array keys)
1869  */
1870  if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1871  {
1872  Assert(!passes_quals && pstate.continuescan);
1873  Assert(offnum > pstate.skip);
1874 
1875  offnum = pstate.skip;
1876  pstate.skip = InvalidOffsetNumber;
1877  continue;
1878  }
1879 
1880  if (passes_quals && tuple_alive)
1881  {
1882  /* tuple passes all scan key conditions */
1883  pstate.firstmatch = true;
1884  if (!BTreeTupleIsPosting(itup))
1885  {
1886  /* Remember it */
1887  itemIndex--;
1888  _bt_saveitem(so, itemIndex, offnum, itup);
1889  }
1890  else
1891  {
1892  int tupleOffset;
1893 
1894  /*
1895  * Set up state to return posting list, and remember first
1896  * TID.
1897  *
1898  * Note that we deliberately save/return items from
1899  * posting lists in ascending heap TID order for backwards
1900  * scans. This allows _bt_killitems() to make a
1901  * consistent assumption about the order of items
1902  * associated with the same posting list tuple.
1903  */
1904  itemIndex--;
1905  tupleOffset =
1906  _bt_setuppostingitems(so, itemIndex, offnum,
1907  BTreeTupleGetPostingN(itup, 0),
1908  itup);
1909  /* Remember additional TIDs */
1910  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1911  {
1912  itemIndex--;
1913  _bt_savepostingitem(so, itemIndex, offnum,
1914  BTreeTupleGetPostingN(itup, i),
1915  tupleOffset);
1916  }
1917  }
1918  }
1919  if (!pstate.continuescan)
1920  {
1921  /* there can't be any more matches, so stop */
1922  so->currPos.moreLeft = false;
1923  break;
1924  }
1925 
1926  offnum = OffsetNumberPrev(offnum);
1927  }
1928 
1929  Assert(itemIndex >= 0);
1930  so->currPos.firstItem = itemIndex;
      /*
       * NOTE(review): lines 1931-1932 (presumably lastItem = and itemIndex =
       * MaxTIDsPerBTreePage - 1) were lost in extraction; confirm upstream.
       */
1933  }
1934 
1935  return (so->currPos.firstItem <= so->currPos.lastItem);
1936 }
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3974
#define Max(x, y)
Definition: c.h:998
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1940
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1970
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:2008
bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtutils.c:3512
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:517
int nextTupleOffset
Definition: nbtree.h:981
BlockNumber nextPage
Definition: nbtree.h:957
XLogRecPtr lsn
Definition: nbtree.h:955
bool ignore_killed_tuples
Definition: relscan.h:129

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_setuppostingitems(), Assert, BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage(), BufferIsValid(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, InvalidOffsetNumber, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, OffsetNumberIsValid, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_LEFTMOST, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, BTScanOpaqueData::scanBehind, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1940 of file nbtsearch.c.

1942 {
1943  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1944 
1945  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1946 
1947  currItem->heapTid = itup->t_tid;
1948  currItem->indexOffset = offnum;
1949  if (so->currTuples)
1950  {
1951  Size itupsz = IndexTupleSize(itup);
1952 
1953  currItem->tupleOffset = so->currPos.nextTupleOffset;
1954  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1955  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1956  }
1957 }
#define MAXALIGN(LEN)
Definition: c.h:811
size_t Size
Definition: c.h:605
#define IndexTupleSize(itup)
Definition: itup.h:70
ItemPointerData heapTid
Definition: nbtree.h:946
LocationIndex tupleOffset
Definition: nbtree.h:948
OffsetNumber indexOffset
Definition: nbtree.h:947
ItemPointerData t_tid
Definition: itup.h:37

References Assert, BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_savepostingitem()

static void _bt_savepostingitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
int  tupleOffset 
)
inlinestatic

Definition at line 2008 of file nbtsearch.c.

2010 {
2011  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2012 
2013  currItem->heapTid = *heapTid;
2014  currItem->indexOffset = offnum;
2015 
2016  /*
2017  * Have index-only scans return the same base IndexTuple for every TID
2018  * that originates from the same posting list
2019  */
2020  if (so->currTuples)
2021  currItem->tupleOffset = tupleOffset;
2022 }

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 96 of file nbtsearch.c.

/*
 * Descend the btree from the root to the leaf page that the insertion scan
 * key belongs on.  On return, *bufP holds the leaf page, locked per
 * 'access' (BT_READ or BT_WRITE); the return value is a stack of
 * (block, offset) entries recording the pivot tuple chosen at each
 * internal level (NULL when the index is empty under BT_READ, or when the
 * root itself is the leaf).
 */
98 {
99  BTStack stack_in = NULL;
      /* Descend with read locks; upgraded to BT_WRITE for the leaf level */
100  int page_access = BT_READ;
101 
102  /* heaprel must be set whenever _bt_allocbuf is reachable */
103  Assert(access == BT_READ || access == BT_WRITE);
104  Assert(access == BT_READ || heaprel != NULL);
105 
106  /* Get the root page to start with */
107  *bufP = _bt_getroot(rel, heaprel, access);
108 
109  /* If index is empty and access = BT_READ, no root page is created. */
110  if (!BufferIsValid(*bufP))
111  return (BTStack) NULL;
112 
113  /* Loop iterates once per level descended in the tree */
114  for (;;)
115  {
116  Page page;
117  BTPageOpaque opaque;
118  OffsetNumber offnum;
119  ItemId itemid;
120  IndexTuple itup;
121  BlockNumber child;
122  BTStack new_stack;
123 
124  /*
125  * Race -- the page we just grabbed may have split since we read its
126  * downlink in its parent page (or the metapage). If it has, we may
127  * need to move right to its new sibling. Do that.
128  *
129  * In write-mode, allow _bt_moveright to finish any incomplete splits
130  * along the way. Strictly speaking, we'd only need to finish an
131  * incomplete split on the leaf page we're about to insert to, not on
132  * any of the upper levels (internal pages with incomplete splits are
133  * also taken care of in _bt_getstackbuf). But this is a good
134  * opportunity to finish splits of internal pages too.
135  */
136  *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
137  stack_in, page_access);
138 
139  /* if this is a leaf page, we're done */
140  page = BufferGetPage(*bufP);
141  opaque = BTPageGetOpaque(page);
142  if (P_ISLEAF(opaque))
143  break;
144 
145  /*
146  * Find the appropriate pivot tuple on this page. Its downlink points
147  * to the child page that we're about to descend to.
148  */
149  offnum = _bt_binsrch(rel, key, *bufP);
150  itemid = PageGetItemId(page, offnum);
151  itup = (IndexTuple) PageGetItem(page, itemid);
152  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
153  child = BTreeTupleGetDownLink(itup);
154 
155  /*
156  * We need to save the location of the pivot tuple we chose in a new
157  * stack entry for this page/level. If caller ends up splitting a
158  * page one level down, it usually ends up inserting a new pivot
159  * tuple/downlink immediately after the location recorded here.
160  */
161  new_stack = (BTStack) palloc(sizeof(BTStackData));
162  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
163  new_stack->bts_offset = offnum;
164  new_stack->bts_parent = stack_in;
165 
166  /*
167  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
168  * we're on the level 1 and asked to lock leaf page in write mode,
169  * then lock next page in write mode, because it must be a leaf.
170  */
171  if (opaque->btpo_level == 1 && access == BT_WRITE)
172  page_access = BT_WRITE;
173 
174  /* drop the read lock on the page, then acquire one on its child */
175  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
176 
177  /* okay, all set to move down a level */
178  stack_in = new_stack;
179  }
180 
181  /*
182  * If we're asked to lock leaf in write mode, but didn't manage to, then
183  * relock. This should only happen when the root page is a leaf page (and
184  * the only page in the index other than the metapage).
185  */
186  if (access == BT_WRITE && page_access == BT_READ)
187  {
188  /* trade in our read lock for a write lock */
189  _bt_unlockbuf(rel, *bufP);
190  _bt_lockbuf(rel, *bufP, BT_WRITE);
191 
192  /*
193  * Race -- the leaf page may have split after we dropped the read lock
194  * but before we acquired a write lock. If it has, we may need to
195  * move right to its new sibling. Do that.
196  */
197  *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
198  }
199 
200  return stack_in;
201 }
void * palloc(Size size)
Definition: mcxt.c:1316
BTStackData * BTStack
Definition: nbtree.h:739
Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:235
BlockNumber bts_blkno
Definition: nbtree.h:734
struct BTStackData * bts_parent
Definition: nbtree.h:736
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 1970 of file nbtsearch.c.

1972 {
1973  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1974 
1975  Assert(BTreeTupleIsPosting(itup));
1976 
1977  currItem->heapTid = *heapTid;
1978  currItem->indexOffset = offnum;
1979  if (so->currTuples)
1980  {
1981  /* Save base IndexTuple (truncate posting list) */
1982  IndexTuple base;
1983  Size itupsz = BTreeTupleGetPostingOffset(itup);
1984 
1985  itupsz = MAXALIGN(itupsz);
1986  currItem->tupleOffset = so->currPos.nextTupleOffset;
1987  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1988  memcpy(base, itup, itupsz);
1989  /* Defensively reduce work area index tuple header size */
1990  base->t_info &= ~INDEX_SIZE_MASK;
1991  base->t_info |= itupsz;
1992  so->currPos.nextTupleOffset += itupsz;
1993 
1994  return currItem->tupleOffset;
1995  }
1996 
1997  return 0;
1998 }
#define INDEX_SIZE_MASK
Definition: itup.h:65
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
unsigned short t_info
Definition: itup.h:49

References Assert, BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, INDEX_SIZE_MASK, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2036 of file nbtsearch.c.

2037 {
2038  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2039  BlockNumber blkno = InvalidBlockNumber;
2040  bool status;
2041 
2042  Assert(BTScanPosIsValid(so->currPos));
2043 
2044  /* Before leaving current page, deal with any killed items */
2045  if (so->numKilled > 0)
2046  _bt_killitems(scan);
2047 
2048  /*
2049  * Before we modify currPos, make a copy of the page data if there was a
2050  * mark position that needs it.
2051  */
2052  if (so->markItemIndex >= 0)
2053  {
2054  /* bump pin on current buffer for assignment to mark buffer */
2055  if (BTScanPosIsPinned(so->currPos))
2056  IncrBufferRefCount(so->currPos.buf);
2057  memcpy(&so->markPos, &so->currPos,
2058  offsetof(BTScanPosData, items[1]) +
2059  so->currPos.lastItem * sizeof(BTScanPosItem));
2060  if (so->markTuples)
2061  memcpy(so->markTuples, so->currTuples,
2062  so->currPos.nextTupleOffset);
2063  so->markPos.itemIndex = so->markItemIndex;
2064  so->markItemIndex = -1;
2065 
2066  /*
2067  * If we're just about to start the next primitive index scan
2068  * (possible with a scan that has arrays keys, and needs to skip to
2069  * continue in the current scan direction), moreLeft/moreRight only
2070  * indicate the end of the current primitive index scan. They must
2071  * never be taken to indicate that the top-level index scan has ended
2072  * (that would be wrong).
2073  *
2074  * We could handle this case by treating the current array keys as
2075  * markPos state. But depending on the current array state like this
2076  * would add complexity. Instead, we just unset markPos's copy of
2077  * moreRight or moreLeft (whichever might be affected), while making
2078  * btrestpos reset the scan's arrays to their initial scan positions.
2079  * In effect, btrestpos leaves advancing the arrays up to the first
2080  * _bt_readpage call (that takes place after it has restored markPos).
2081  */
2082  Assert(so->markPos.dir == dir);
2083  if (so->needPrimScan)
2084  {
2085  if (ScanDirectionIsForward(dir))
2086  so->markPos.moreRight = true;
2087  else
2088  so->markPos.moreLeft = true;
2089  }
2090  }
2091 
2092  if (ScanDirectionIsForward(dir))
2093  {
2094  /* Walk right to the next page with data */
2095  if (scan->parallel_scan != NULL)
2096  {
2097  /*
2098  * Seize the scan to get the next block number; if the scan has
2099  * ended already, bail out.
2100  */
2101  status = _bt_parallel_seize(scan, &blkno, false);
2102  if (!status)
2103  {
2104  /* release the previous buffer, if pinned */
2105  BTScanPosUnpinIfPinned(so->currPos);
2106  BTScanPosInvalidate(so->currPos);
2107  return false;
2108  }
2109  }
2110  else
2111  {
2112  /* Not parallel, so use the previously-saved nextPage link. */
2113  blkno = so->currPos.nextPage;
2114  }
2115 
2116  /* Remember we left a page with data */
2117  so->currPos.moreLeft = true;
2118 
2119  /* release the previous buffer, if pinned */
2120  BTScanPosUnpinIfPinned(so->currPos);
2121  }
2122  else
2123  {
2124  /* Remember we left a page with data */
2125  so->currPos.moreRight = true;
2126 
2127  if (scan->parallel_scan != NULL)
2128  {
2129  /*
2130  * Seize the scan to get the current block number; if the scan has
2131  * ended already, bail out.
2132  */
2133  status = _bt_parallel_seize(scan, &blkno, false);
2134  BTScanPosUnpinIfPinned(so->currPos);
2135  if (!status)
2136  {
2137  BTScanPosInvalidate(so->currPos);
2138  return false;
2139  }
2140  }
2141  else
2142  {
2143  /* Not parallel, so just use our own notion of the current page */
2144  blkno = so->currPos.currPage;
2145  }
2146  }
2147 
2148  if (!_bt_readnextpage(scan, blkno, dir))
2149  return false;
2150 
2151  /* We have at least one item to return as scan's next item */
2152  Assert(BTScanPosIsValid(so->currPos));
2153 
2154  return true;
2155 }
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4928
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:4175
char * markTuples
Definition: nbtree.h:1065
BTScanPosData markPos
Definition: nbtree.h:1078
static ItemArray items
Definition: test_tidstore.c:49

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf 
)
static

Definition at line 2365 of file nbtsearch.c.

2366 {
2367  Page page;
2368  BTPageOpaque opaque;
2369 
2370  page = BufferGetPage(buf);
2371  opaque = BTPageGetOpaque(page);
2372 
2373  for (;;)
2374  {
2375  BlockNumber obknum;
2376  BlockNumber lblkno;
2377  BlockNumber blkno;
2378  int tries;
2379 
2380  /* if we're at end of tree, release buf and return failure */
2381  if (P_LEFTMOST(opaque))
2382  {
2383  _bt_relbuf(rel, buf);
2384  break;
2385  }
2386  /* remember original page we are stepping left from */
2387  obknum = BufferGetBlockNumber(buf);
2388  /* step left */
2389  blkno = lblkno = opaque->btpo_prev;
2390  _bt_relbuf(rel, buf);
2391  /* check for interrupts while we're not holding any buffer lock */
2392  CHECK_FOR_INTERRUPTS();
2393  buf = _bt_getbuf(rel, blkno, BT_READ);
2394  page = BufferGetPage(buf);
2395  opaque = BTPageGetOpaque(page);
2396 
2397  /*
2398  * If this isn't the page we want, walk right till we find what we
2399  * want --- but go no more than four hops (an arbitrary limit). If we
2400  * don't find the correct page by then, the most likely bet is that
2401  * the original page got deleted and isn't in the sibling chain at all
2402  * anymore, not that its left sibling got split more than four times.
2403  *
2404  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2405  * because half-dead pages are still in the sibling chain.
2406  */
2407  tries = 0;
2408  for (;;)
2409  {
2410  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2411  {
2412  /* Found desired page, return it */
2413  return buf;
2414  }
2415  if (P_RIGHTMOST(opaque) || ++tries > 4)
2416  break;
2417  blkno = opaque->btpo_next;
2418  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2419  page = BufferGetPage(buf);
2420  opaque = BTPageGetOpaque(page);
2421  }
2422 
2423  /* Return to the original page to see what's up */
2424  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2425  page = BufferGetPage(buf);
2426  opaque = BTPageGetOpaque(page);
2427  if (P_ISDELETED(opaque))
2428  {
2429  /*
2430  * It was deleted. Move right to first nondeleted page (there
2431  * must be one); that is the page that has acquired the deleted
2432  * one's keyspace, so stepping left from it will take us where we
2433  * want to be.
2434  */
2435  for (;;)
2436  {
2437  if (P_RIGHTMOST(opaque))
2438  elog(ERROR, "fell off the end of index \"%s\"",
2439  RelationGetRelationName(rel));
2440  blkno = opaque->btpo_next;
2441  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2442  page = BufferGetPage(buf);
2443  opaque = BTPageGetOpaque(page);
2444  if (!P_ISDELETED(opaque))
2445  break;
2446  }
2447 
2448  /*
2449  * Now return to top of loop, resetting obknum to point to this
2450  * nondeleted page, and try again.
2451  */
2452  }
2453  else
2454  {
2455  /*
2456  * It wasn't deleted; the explanation had better be that the page
2457  * to the left got split or deleted. Without this check, we'd go
2458  * into an infinite loop if there's anything wrong.
2459  */
2460  if (opaque->btpo_prev == lblkno)
2461  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2462  obknum, RelationGetRelationName(rel));
2463  /* Okay to try again with new lblkno value */
2464  }
2465  }
2466 
2467  return InvalidBuffer;
2468 }
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BlockNumber btpo_prev
Definition: nbtree.h:64

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().