PostgreSQL Source Code (git master)
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 340 of file nbtsearch.c.

343 {
344  Page page;
345  BTPageOpaque opaque;
346  OffsetNumber low,
347  high;
348  int32 result,
349  cmpval;
350 
351  page = BufferGetPage(buf);
352  opaque = BTPageGetOpaque(page);
353 
354  /* Requesting nextkey semantics while using scantid seems nonsensical */
355  Assert(!key->nextkey || key->scantid == NULL);
356  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
357  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
358 
359  low = P_FIRSTDATAKEY(opaque);
360  high = PageGetMaxOffsetNumber(page);
361 
362  /*
363  * If there are no keys on the page, return the first available slot. Note
364  * this covers two cases: the page is really empty (no keys), or it
365  * contains only a high key. The latter case is possible after vacuuming.
366  * This can never happen on an internal page, however, since they are
367  * never empty (an internal page must have at least one child).
368  */
369  if (unlikely(high < low))
370  return low;
371 
372  /*
373  * Binary search to find the first key on the page >= scan key, or first
374  * key > scankey when nextkey is true.
375  *
376  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
377  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
378  *
379  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
380  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
381  *
382  * We can fall out when high == low.
383  */
384  high++; /* establish the loop invariant for high */
385 
386  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
387 
388  while (high > low)
389  {
390  OffsetNumber mid = low + ((high - low) / 2);
391 
392  /* We have low <= mid < high, so mid points at a real slot */
393 
394  result = _bt_compare(rel, key, page, mid);
395 
396  if (result >= cmpval)
397  low = mid + 1;
398  else
399  high = mid;
400  }
401 
402  /*
403  * At this point we have high == low.
404  *
405  * On a leaf page we always return the first non-pivot tuple >= scan key
406  * (resp. > scan key) for forward scan callers. For backward scans, it's
407  * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
408  */
409  if (P_ISLEAF(opaque))
410  {
411  /*
412  * In the backward scan case we're supposed to locate the last
413  * matching tuple on the leaf level -- not the first matching tuple
414  * (the last tuple will be the first one returned by the scan).
415  *
416  * At this point we've located the first non-pivot tuple immediately
417  * after the last matching tuple (which might just be maxoff + 1).
418  * Compensate by stepping back.
419  */
420  if (key->backward)
421  return OffsetNumberPrev(low);
422 
423  return low;
424  }
425 
426  /*
427  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
428  * There must be one if _bt_compare() is playing by the rules.
429  *
430  * _bt_compare() will seldom see any exactly-matching pivot tuples, since
431  * a truncated -inf heap TID is usually enough to prevent it altogether.
432  * Even omitted scan key entries are treated as > truncated attributes.
433  *
434  * However, during backward scans _bt_compare() interprets omitted scan
435  * key attributes as == corresponding truncated -inf attributes instead.
436  * This works just like < would work here. Under this scheme, < strategy
437  * backward scans will always directly descend to the correct leaf page.
438  * In particular, they will never incur an "extra" leaf page access with a
439  * scan key that happens to contain the same prefix of values as some
440  * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
441  * it uses a leaf page high key to "re-find" a page undergoing deletion.
442  */
443  Assert(low > P_FIRSTDATAKEY(opaque));
444 
445  return OffsetNumberPrev(low);
446 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
Pointer Page
Definition: bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:372
signed int int32
Definition: c.h:497
#define Assert(condition)
Definition: c.h:861
#define unlikely(x)
Definition: c.h:314
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:685
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:73

References _bt_compare(), Assert, BTPageGetOpaque, buf, BufferGetPage(), sort-test::key, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().
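
As an aside, the cmpval trick used above (cmpval = 1 for "first key >= scan key", cmpval = 0 for "first key > scan key") is just a compact way of switching one binary-search loop between lower-bound and upper-bound semantics. The following standalone sketch is illustrative only and not part of nbtsearch.c; the array, function names, and three-way integer comparison are invented for the example, but the loop invariant is the same one described in the comments above.

#include <stdio.h>

static int
binsrch_demo(const int *arr, int nelem, int key, int nextkey)
{
    int low = 0;
    int high = nelem;                   /* one past the last slot */
    int cmpval = nextkey ? 0 : 1;       /* select comparison value */

    while (high > low)
    {
        int mid = low + ((high - low) / 2);
        /* three-way compare: <0, 0, >0 for key vs. arr[mid] */
        int result = (key > arr[mid]) - (key < arr[mid]);

        if (result >= cmpval)
            low = mid + 1;              /* arr[mid] still "before" the target */
        else
            high = mid;                 /* arr[mid] is at or past the target */
    }
    return low;                         /* first slot >= key, or > key if nextkey */
}

int
main(void)
{
    int arr[] = {10, 20, 20, 30};

    printf("%d %d\n",
           binsrch_demo(arr, 4, 20, 0),     /* 1: first slot >= 20 */
           binsrch_demo(arr, 4, 20, 1));    /* 3: first slot > 20 */
    return 0;
}

Run as written, the demo prints "1 3": slot 1 is the first element >= 20, slot 3 the first element > 20.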

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 471 of file nbtsearch.c.

472 {
473  BTScanInsert key = insertstate->itup_key;
474  Page page;
475  BTPageOpaque opaque;
476  OffsetNumber low,
477  high,
478  stricthigh;
479  int32 result,
480  cmpval;
481 
482  page = BufferGetPage(insertstate->buf);
483  opaque = BTPageGetOpaque(page);
484 
485  Assert(P_ISLEAF(opaque));
486  Assert(!key->nextkey);
487  Assert(insertstate->postingoff == 0);
488 
489  if (!insertstate->bounds_valid)
490  {
491  /* Start new binary search */
492  low = P_FIRSTDATAKEY(opaque);
493  high = PageGetMaxOffsetNumber(page);
494  }
495  else
496  {
497  /* Restore result of previous binary search against same page */
498  low = insertstate->low;
499  high = insertstate->stricthigh;
500  }
501 
502  /* If there are no keys on the page, return the first available slot */
503  if (unlikely(high < low))
504  {
505  /* Caller can't reuse bounds */
506  insertstate->low = InvalidOffsetNumber;
507  insertstate->stricthigh = InvalidOffsetNumber;
508  insertstate->bounds_valid = false;
509  return low;
510  }
511 
512  /*
513  * Binary search to find the first key on the page >= scan key. (nextkey
514  * is always false when inserting).
515  *
516  * The loop invariant is: all slots before 'low' are < scan key, all slots
517  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
518  * maintained to save additional search effort for caller.
519  *
520  * We can fall out when high == low.
521  */
522  if (!insertstate->bounds_valid)
523  high++; /* establish the loop invariant for high */
524  stricthigh = high; /* high initially strictly higher */
525 
526  cmpval = 1; /* !nextkey comparison value */
527 
528  while (high > low)
529  {
530  OffsetNumber mid = low + ((high - low) / 2);
531 
532  /* We have low <= mid < high, so mid points at a real slot */
533 
534  result = _bt_compare(rel, key, page, mid);
535 
536  if (result >= cmpval)
537  low = mid + 1;
538  else
539  {
540  high = mid;
541  if (result != 0)
542  stricthigh = high;
543  }
544 
545  /*
546  * If tuple at offset located by binary search is a posting list whose
547  * TID range overlaps with caller's scantid, perform posting list
548  * binary search to set postingoff for caller. Caller must split the
549  * posting list when postingoff is set. This should happen
550  * infrequently.
551  */
552  if (unlikely(result == 0 && key->scantid != NULL))
553  {
554  /*
555  * postingoff should never be set more than once per leaf page
556  * binary search. That would mean that there are duplicate table
557  * TIDs in the index, which is never okay. Check for that here.
558  */
559  if (insertstate->postingoff != 0)
560  ereport(ERROR,
561  (errcode(ERRCODE_INDEX_CORRUPTED),
562  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
563  ItemPointerGetBlockNumber(key->scantid),
564  ItemPointerGetOffsetNumber(key->scantid),
565  low, stricthigh,
566  BufferGetBlockNumber(insertstate->buf),
567  RelationGetRelationName(rel))));
568 
569  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
570  }
571  }
572 
573  /*
574  * On a leaf page, a binary search always returns the first key >= scan
575  * key (at least in !nextkey case), which could be the last slot + 1. This
576  * is also the lower bound of cached search.
577  *
578  * stricthigh may also be the last slot + 1, which prevents caller from
579  * using bounds directly, but is still useful to us if we're called a
580  * second time with cached bounds (cached low will be < stricthigh when
581  * that happens).
582  */
583  insertstate->low = low;
584  insertstate->stricthigh = stricthigh;
585  insertstate->bounds_valid = true;
586 
587  return low;
588 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3724
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode(int sqlerrcode)
Definition: elog.c:853
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:599
#define InvalidOffsetNumber
Definition: off.h:26
#define RelationGetRelationName(relation)
Definition: rel.h:539
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().
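
The low/stricthigh/bounds_valid caching that _bt_binsrch_insert maintains for its callers can be illustrated in isolation. The sketch below is not PostgreSQL code; the SearchState struct and bounded_search() function are invented for the example. It shows how a second search for the same key against an unchanged sorted array can restart from the bounds recorded by the first search instead of from the full range.

#include <stdbool.h>
#include <stdio.h>

typedef struct SearchState
{
    bool bounds_valid;
    int  low;           /* first candidate slot */
    int  stricthigh;    /* all slots at or after this are known to be > key */
} SearchState;

static int
bounded_search(const int *arr, int nelem, int key, SearchState *state)
{
    int low = state->bounds_valid ? state->low : 0;
    int high = state->bounds_valid ? state->stricthigh : nelem;
    int stricthigh = high;              /* high initially strictly higher */

    while (high > low)
    {
        int mid = low + ((high - low) / 2);

        if (arr[mid] < key)
            low = mid + 1;
        else
        {
            high = mid;
            if (arr[mid] != key)
                stricthigh = high;      /* arr[mid] > key: tighten strict bound */
        }
    }
    state->low = low;
    state->stricthigh = stricthigh;
    state->bounds_valid = true;
    return low;                         /* first slot >= key */
}

int
main(void)
{
    int arr[] = {10, 20, 20, 20, 30, 40};
    SearchState st = {false, 0, 0};

    printf("first search: %d\n", bounded_search(arr, 6, 20, &st));
    printf("cached bounds: [%d, %d)\n", st.low, st.stricthigh);
    /* the repeat search only has to examine slots within the cached bounds */
    printf("repeat search: %d\n", bounded_search(arr, 6, 20, &st));
    return 0;
}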

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 599 of file nbtsearch.c.

600 {
601  IndexTuple itup;
602  ItemId itemid;
603  int low,
604  high,
605  mid,
606  res;
607 
608  /*
609  * If this isn't a posting tuple, then the index must be corrupt (if it is
610  * an ordinary non-pivot tuple then there must be an existing tuple with a
611  * heap TID that equals inserter's new heap TID/scantid). Defensively
612  * check that tuple is a posting list tuple whose posting list range
613  * includes caller's scantid.
614  *
615  * (This is also needed because contrib/amcheck's rootdescend option needs
616  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
617  */
618  itemid = PageGetItemId(page, offnum);
619  itup = (IndexTuple) PageGetItem(page, itemid);
620  if (!BTreeTupleIsPosting(itup))
621  return 0;
622 
623  Assert(key->heapkeyspace && key->allequalimage);
624 
625  /*
626  * In the event that posting list tuple has LP_DEAD bit set, indicate this
627  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
628  * second call to _bt_binsrch_insert() can take place when its caller has
629  * removed the dead item.
630  */
631  if (ItemIdIsDead(itemid))
632  return -1;
633 
634  /* "high" is past end of posting list for loop invariant */
635  low = 0;
636  high = BTreeTupleGetNPosting(itup);
637  Assert(high >= 2);
638 
639  while (high > low)
640  {
641  mid = low + ((high - low) / 2);
642  res = ItemPointerCompare(key->scantid,
643  BTreeTupleGetPostingN(itup, mid));
644 
645  if (res > 0)
646  low = mid + 1;
647  else if (res < 0)
648  high = mid;
649  else
650  return mid;
651  }
652 
653  /* Exact match not found */
654  return low;
655 }
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), sort-test::key, PageGetItem(), PageGetItemId(), and res.

Referenced by _bt_binsrch_insert().
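
The posting-list search itself is an ordinary three-way binary search over a sorted array of heap TIDs. A minimal standalone sketch follows; the DemoTid type and helper functions are hypothetical and stand in for the packed TID array of a real posting tuple.

#include <stdint.h>
#include <stdio.h>

typedef struct DemoTid
{
    uint32_t block;
    uint16_t offset;
} DemoTid;

static int
tid_compare(const DemoTid *a, const DemoTid *b)
{
    if (a->block != b->block)
        return (a->block > b->block) ? 1 : -1;
    if (a->offset != b->offset)
        return (a->offset > b->offset) ? 1 : -1;
    return 0;
}

static int
posting_search(const DemoTid *tids, int ntids, const DemoTid *scantid)
{
    int low = 0;
    int high = ntids;                   /* one past the last entry */

    while (high > low)
    {
        int mid = low + ((high - low) / 2);
        int res = tid_compare(scantid, &tids[mid]);

        if (res > 0)
            low = mid + 1;
        else if (res < 0)
            high = mid;
        else
            return mid;                 /* exact match */
    }
    return low;                         /* no match: first slot > scantid */
}

int
main(void)
{
    DemoTid tids[] = {{1, 3}, {1, 7}, {2, 1}, {2, 5}};
    DemoTid probe = {2, 1};

    printf("%d\n", posting_search(tids, 4, &probe));    /* prints 2 */
    return 0;
}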

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 685 of file nbtsearch.c.

689 {
690  TupleDesc itupdesc = RelationGetDescr(rel);
691  BTPageOpaque opaque = BTPageGetOpaque(page);
692  IndexTuple itup;
693  ItemPointer heapTid;
694  ScanKey scankey;
695  int ncmpkey;
696  int ntupatts;
697  int32 result;
698 
699  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
700  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
701  Assert(key->heapkeyspace || key->scantid == NULL);
702 
703  /*
704  * Force result ">" if target item is first data item on an internal page
705  * --- see NOTE above.
706  */
707  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
708  return 1;
709 
710  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
711  ntupatts = BTreeTupleGetNAtts(itup, rel);
712 
713  /*
714  * The scan key is set up with the attribute number associated with each
715  * term in the key. It is important that, if the index is multi-key, the
716  * scan contain the first k key attributes, and that they be in order. If
717  * you think about how multi-key ordering works, you'll understand why
718  * this is.
719  *
720  * We don't test for violation of this condition here, however. The
721  * initial setup for the index scan had better have gotten it right (see
722  * _bt_first).
723  */
724 
725  ncmpkey = Min(ntupatts, key->keysz);
726  Assert(key->heapkeyspace || ncmpkey == key->keysz);
727  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
728  scankey = key->scankeys;
729  for (int i = 1; i <= ncmpkey; i++)
730  {
731  Datum datum;
732  bool isNull;
733 
734  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
735 
736  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
737  {
738  if (isNull)
739  result = 0; /* NULL "=" NULL */
740  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
741  result = -1; /* NULL "<" NOT_NULL */
742  else
743  result = 1; /* NULL ">" NOT_NULL */
744  }
745  else if (isNull) /* key is NOT_NULL and item is NULL */
746  {
747  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
748  result = 1; /* NOT_NULL ">" NULL */
749  else
750  result = -1; /* NOT_NULL "<" NULL */
751  }
752  else
753  {
754  /*
755  * The sk_func needs to be passed the index value as left arg and
756  * the sk_argument as right arg (they might be of different
757  * types). Since it is convenient for callers to think of
758  * _bt_compare as comparing the scankey to the index item, we have
759  * to flip the sign of the comparison result. (Unless it's a DESC
760  * column, in which case we *don't* flip the sign.)
761  */
762  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
763  scankey->sk_collation,
764  datum,
765  scankey->sk_argument));
766 
767  if (!(scankey->sk_flags & SK_BT_DESC))
768  INVERT_COMPARE_RESULT(result);
769  }
770 
771  /* if the keys are unequal, return the difference */
772  if (result != 0)
773  return result;
774 
775  scankey++;
776  }
777 
778  /*
779  * All non-truncated attributes (other than heap TID) were found to be
780  * equal. Treat truncated attributes as minus infinity when scankey has a
781  * key attribute value that would otherwise be compared directly.
782  *
783  * Note: it doesn't matter if ntupatts includes non-key attributes;
784  * scankey won't, so explicitly excluding non-key attributes isn't
785  * necessary.
786  */
787  if (key->keysz > ntupatts)
788  return 1;
789 
790  /*
791  * Use the heap TID attribute and scantid to try to break the tie. The
792  * rules are the same as any other key attribute -- only the
793  * representation differs.
794  */
795  heapTid = BTreeTupleGetHeapTID(itup);
796  if (key->scantid == NULL)
797  {
798  /*
799  * Forward scans have a scankey that is considered greater than a
800  * truncated pivot tuple if and when the scankey has equal values for
801  * attributes up to and including the least significant untruncated
802  * attribute in tuple. Even attributes that were omitted from the
803  * scan key are considered greater than -inf truncated attributes.
804  * (See _bt_binsrch for an explanation of our backward scan behavior.)
805  *
806  * For example, if an index has the minimum two attributes (single
807  * user key attribute, plus heap TID attribute), and a page's high key
808  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
809  * will not descend to the page to the left. The search will descend
810  * right instead. The truncated attribute in pivot tuple means that
811  * all non-pivot tuples on the page to the left are strictly < 'foo',
812  * so it isn't necessary to descend left. In other words, search
813  * doesn't have to descend left because it isn't interested in a match
814  * that has a heap TID value of -inf.
815  *
816  * Note: the heap TID part of the test ensures that scankey is being
817  * compared to a pivot tuple with one or more truncated -inf key
818  * attributes. The heap TID attribute is the last key attribute in
819  * every index, of course, but other than that it isn't special.
820  */
821  if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
822  key->heapkeyspace)
823  return 1;
824 
825  /* All provided scankey arguments found to be equal */
826  return 0;
827  }
828 
829  /*
830  * Treat truncated heap TID as minus infinity, since scankey has a key
831  * attribute value (scantid) that would otherwise be compared directly
832  */
833  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
834  if (heapTid == NULL)
835  return 1;
836 
837  /*
838  * Scankey must be treated as equal to a posting list tuple if its scantid
839  * value falls within the range of the posting list. In all other cases
840  * there can only be a single heap TID value, which is compared directly
841  * with scantid.
842  */
844  result = ItemPointerCompare(key->scantid, heapTid);
845  if (result <= 0 || !BTreeTupleIsPosting(itup))
846  return result;
847  else
848  {
849  result = ItemPointerCompare(key->scantid,
850  BTreeTupleGetMaxHeapTID(itup));
851  if (result > 0)
852  return 1;
853  }
854 
855  return 0;
856 }
#define Min(x, y)
Definition: c.h:1007
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1109
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int i
Definition: isn.c:73
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1128
#define SK_BT_DESC
Definition: nbtree.h:1127
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4900
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define RelationGetDescr(relation)
Definition: rel.h:531
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
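
A single column comparison inside the loop above combines three ingredients: NULL ordering (NULLS FIRST vs. NULLS LAST), the "index value on the left" calling convention of the support function, and the sign flip for ASC versus DESC columns. The following self-contained sketch, with invented names and int keys standing in for Datums, mirrors that logic; it is illustrative only, not the PostgreSQL implementation.

#include <stdbool.h>
#include <stdio.h>

typedef struct DemoCol
{
    bool desc;              /* column sorted descending? */
    bool nulls_first;       /* NULLs sort before non-NULL values? */
} DemoCol;

static int
compare_column(const DemoCol *col,
               bool key_isnull, int key_value,
               bool tup_isnull, int tup_value)
{
    int result;

    if (key_isnull)
        result = tup_isnull ? 0 : (col->nulls_first ? -1 : 1);
    else if (tup_isnull)
        result = col->nulls_first ? 1 : -1;
    else
    {
        /* "sk_func" style: index value on the left, key argument on the right */
        result = (tup_value > key_value) - (tup_value < key_value);
        if (!col->desc)
            result = -result;   /* flip so result means key vs. tuple */
    }
    return result;              /* <0: key sorts before tuple, 0: equal, >0: after */
}

int
main(void)
{
    DemoCol asc_nulls_last = {false, false};

    /* key 5 vs tuple 7 on an ASC column: key sorts first, so < 0 */
    printf("%d\n", compare_column(&asc_nulls_last, false, 5, false, 7));
    /* non-NULL key vs NULL tuple with NULLS LAST: key sorts first, so < 0 */
    printf("%d\n", compare_column(&asc_nulls_last, false, 5, true, 0));
    return 0;
}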

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 64 of file nbtsearch.c.

65 {
66  _bt_unlockbuf(scan->indexRelation, sp->buf);
67 
68  if (IsMVCCSnapshot(scan->xs_snapshot) &&
69  RelationNeedsWAL(scan->indexRelation) &&
70  !scan->xs_want_itup)
71  {
72  ReleaseBuffer(sp->buf);
73  sp->buf = InvalidBuffer;
74  }
75 }
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4924
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
Buffer buf
Definition: nbtree.h:953
Relation indexRelation
Definition: relscan.h:119
struct SnapshotData * xs_snapshot
Definition: relscan.h:120

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2580 of file nbtsearch.c.

2581 {
2582  Relation rel = scan->indexRelation;
2583  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2584  Buffer buf;
2585  Page page;
2586  BTPageOpaque opaque;
2587  OffsetNumber start;
2588  BTScanPosItem *currItem;
2589 
2590  /*
2591  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2592  * version of _bt_search(). We don't maintain a stack since we know we
2593  * won't need it.
2594  */
2595  buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2596 
2597  if (!BufferIsValid(buf))
2598  {
2599  /*
2600  * Empty index. Lock the whole relation, as nothing finer to lock
2601  * exists.
2602  */
2603  PredicateLockRelation(rel, scan->xs_snapshot);
2604  BTScanPosInvalidate(so->currPos);
2605  return false;
2606  }
2607 
2608  PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2609  page = BufferGetPage(buf);
2610  opaque = BTPageGetOpaque(page);
2611  Assert(P_ISLEAF(opaque));
2612 
2613  if (ScanDirectionIsForward(dir))
2614  {
2615  /* There could be dead pages to the left, so not this: */
2616  /* Assert(P_LEFTMOST(opaque)); */
2617 
2618  start = P_FIRSTDATAKEY(opaque);
2619  }
2620  else if (ScanDirectionIsBackward(dir))
2621  {
2622  Assert(P_RIGHTMOST(opaque));
2623 
2624  start = PageGetMaxOffsetNumber(page);
2625  }
2626  else
2627  {
2628  elog(ERROR, "invalid scan direction: %d", (int) dir);
2629  start = 0; /* keep compiler quiet */
2630  }
2631 
2632  /* remember which buffer we have pinned */
2633  so->currPos.buf = buf;
2634 
2635  _bt_initialize_more_data(so, dir);
2636 
2637  /*
2638  * Now load data from the first page of the scan.
2639  */
2640  if (!_bt_readpage(scan, dir, start, true))
2641  {
2642  /*
2643  * There's no actually-matching data on this page. Try to advance to
2644  * the next page. Return false if there's no matching data at all.
2645  */
2646  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2647  if (!_bt_steppage(scan, dir))
2648  return false;
2649  }
2650  else
2651  {
2652  /* We have at least one item to return as scan's first item */
2653  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2654  }
2655 
2656  /* OK, itemIndex says what to return */
2657  currItem = &so->currPos.items[so->currPos.itemIndex];
2658  scan->xs_heaptid = currItem->heapTid;
2659  if (scan->xs_want_itup)
2660  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2661 
2662  return true;
2663 }
int Buffer
Definition: buf.h:23
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
#define elog(elevel,...)
Definition: elog.h:225
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1022
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1081
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2499
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:64
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2047
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
Definition: nbtsearch.c:1563
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2670
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2584
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2561
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
BTScanPosData currPos
Definition: nbtree.h:1077
char * currTuples
Definition: nbtree.h:1064
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:994
int itemIndex
Definition: nbtree.h:992
IndexTuple xs_itup
Definition: relscan.h:143
ItemPointerData xs_heaptid
Definition: relscan.h:148

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert, BTPageGetOpaque, BTScanPosInvalidate, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, start, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 879 of file nbtsearch.c.

880 {
881  Relation rel = scan->indexRelation;
882  BTScanOpaque so = (BTScanOpaque) scan->opaque;
883  Buffer buf;
884  BTStack stack;
885  OffsetNumber offnum;
886  StrategyNumber strat;
887  BTScanInsertData inskey;
888  ScanKey startKeys[INDEX_MAX_KEYS];
889  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
890  int keysz = 0;
891  int i;
892  bool status;
893  StrategyNumber strat_total;
894  BTScanPosItem *currItem;
895  BlockNumber blkno;
896 
897  Assert(!BTScanPosIsValid(so->currPos));
898 
899  /*
900  * Examine the scan keys and eliminate any redundant keys; also mark the
901  * keys that must be matched to continue the scan.
902  */
903  _bt_preprocess_keys(scan);
904 
905  /*
906  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
907  * never be satisfied (eg, x == 1 AND x > 2).
908  */
909  if (!so->qual_ok)
910  {
911  _bt_parallel_done(scan);
912  return false;
913  }
914 
915  /*
916  * For parallel scans, get the starting page from shared state. If the
917  * scan has not started, proceed to find out first leaf page in the usual
918  * way while keeping other participating processes waiting. If the scan
919  * has already begun, use the page number from the shared structure.
920  *
921  * When a parallel scan has another primitive index scan scheduled, a
922  * parallel worker will seize the scan for that purpose now. This is
923  * similar to the case where the top-level scan hasn't started.
924  */
925  if (scan->parallel_scan != NULL)
926  {
927  status = _bt_parallel_seize(scan, &blkno, true);
928 
929  /*
930  * Initialize arrays (when _bt_parallel_seize didn't already set up
931  * the next primitive index scan)
932  */
933  if (so->numArrayKeys && !so->needPrimScan)
934  _bt_start_array_keys(scan, dir);
935 
936  if (!status)
937  return false;
938  else if (blkno == P_NONE)
939  {
940  _bt_parallel_done(scan);
941  return false;
942  }
943  else if (blkno != InvalidBlockNumber)
944  {
945  if (!_bt_parallel_readpage(scan, blkno, dir))
946  return false;
947  goto readcomplete;
948  }
949  }
950  else if (so->numArrayKeys && !so->needPrimScan)
951  {
952  /*
953  * First _bt_first call (for current btrescan) without parallelism.
954  *
955  * Initialize arrays, and the corresponding scan keys that were just
956  * output by _bt_preprocess_keys.
957  */
958  _bt_start_array_keys(scan, dir);
959  }
960 
961  /*
962  * Count an indexscan for stats, now that we know that we'll call
963  * _bt_search/_bt_endpoint below
964  */
965  pgstat_count_index_scan(rel);
966 
967  /*----------
968  * Examine the scan keys to discover where we need to start the scan.
969  *
970  * We want to identify the keys that can be used as starting boundaries;
971  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
972  * a backwards scan. We can use keys for multiple attributes so long as
973  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
974  * a > or < boundary or find an attribute with no boundary (which can be
975  * thought of as the same as "> -infinity"), we can't use keys for any
976  * attributes to its right, because it would break our simplistic notion
977  * of what initial positioning strategy to use.
978  *
979  * When the scan keys include cross-type operators, _bt_preprocess_keys
980  * may not be able to eliminate redundant keys; in such cases we will
981  * arbitrarily pick a usable one for each attribute. This is correct
982  * but possibly not optimal behavior. (For example, with keys like
983  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
984  * x=5 would be more efficient.) Since the situation only arises given
985  * a poorly-worded query plus an incomplete opfamily, live with it.
986  *
987  * When both equality and inequality keys appear for a single attribute
988  * (again, only possible when cross-type operators appear), we *must*
989  * select one of the equality keys for the starting point, because
990  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
991  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
992  * start at x=4, we will fail and stop before reaching x=10. If multiple
993  * equality quals survive preprocessing, however, it doesn't matter which
994  * one we use --- by definition, they are either redundant or
995  * contradictory.
996  *
997  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
998  * If the index stores nulls at the end of the index we'll be starting
999  * from, and we have no boundary key for the column (which means the key
1000  * we deduced NOT NULL from is an inequality key that constrains the other
1001  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1002  * use as a boundary key. If we didn't do this, we might find ourselves
1003  * traversing a lot of null entries at the start of the scan.
1004  *
1005  * In this loop, row-comparison keys are treated the same as keys on their
1006  * first (leftmost) columns. We'll add on lower-order columns of the row
1007  * comparison below, if possible.
1008  *
1009  * The selected scan keys (at most one per index column) are remembered by
1010  * storing their addresses into the local startKeys[] array.
1011  *
1012  * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1013  * the next primitive index scan (for scans with array keys) based in part
1014  * on an understanding of how it'll enable us to reposition the scan.
1015  * They're directly aware of how we'll sometimes cons up an explicit
1016  * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1017  * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1018  * scans to skip large groups of NULLs through repeated deductions about
1019  * key strictness (for a required inequality key) and whether NULLs in the
1020  * key's index column are stored last or first (relative to non-NULLs).
1021  * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1022  * need to be kept in sync.
1023  *----------
1024  */
1025  strat_total = BTEqualStrategyNumber;
1026  if (so->numberOfKeys > 0)
1027  {
1028  AttrNumber curattr;
1029  ScanKey chosen;
1030  ScanKey impliesNN;
1031  ScanKey cur;
1032 
1033  /*
1034  * chosen is the so-far-chosen key for the current attribute, if any.
1035  * We don't cast the decision in stone until we reach keys for the
1036  * next attribute.
1037  */
1038  curattr = 1;
1039  chosen = NULL;
1040  /* Also remember any scankey that implies a NOT NULL constraint */
1041  impliesNN = NULL;
1042 
1043  /*
1044  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1045  * pass to handle after-last-key processing. Actual exit from the
1046  * loop is at one of the "break" statements below.
1047  */
1048  for (cur = so->keyData, i = 0;; cur++, i++)
1049  {
1050  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1051  {
1052  /*
1053  * Done looking at keys for curattr. If we didn't find a
1054  * usable boundary key, see if we can deduce a NOT NULL key.
1055  */
1056  if (chosen == NULL && impliesNN != NULL &&
1057  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1058  ScanDirectionIsForward(dir) :
1059  ScanDirectionIsBackward(dir)))
1060  {
1061  /* Yes, so build the key in notnullkeys[keysz] */
1062  chosen = &notnullkeys[keysz];
1063  ScanKeyEntryInitialize(chosen,
1064  (SK_SEARCHNOTNULL | SK_ISNULL |
1065  (impliesNN->sk_flags &
1066  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1067  curattr,
1068  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1069  BTGreaterStrategyNumber :
1070  BTLessStrategyNumber),
1071  InvalidOid,
1072  InvalidOid,
1073  InvalidOid,
1074  (Datum) 0);
1075  }
1076 
1077  /*
1078  * If we still didn't find a usable boundary key, quit; else
1079  * save the boundary key pointer in startKeys.
1080  */
1081  if (chosen == NULL)
1082  break;
1083  startKeys[keysz++] = chosen;
1084 
1085  /*
1086  * Adjust strat_total, and quit if we have stored a > or <
1087  * key.
1088  */
1089  strat = chosen->sk_strategy;
1090  if (strat != BTEqualStrategyNumber)
1091  {
1092  strat_total = strat;
1093  if (strat == BTGreaterStrategyNumber ||
1094  strat == BTLessStrategyNumber)
1095  break;
1096  }
1097 
1098  /*
1099  * Done if that was the last attribute, or if next key is not
1100  * in sequence (implying no boundary key is available for the
1101  * next attribute).
1102  */
1103  if (i >= so->numberOfKeys ||
1104  cur->sk_attno != curattr + 1)
1105  break;
1106 
1107  /*
1108  * Reset for next attr.
1109  */
1110  curattr = cur->sk_attno;
1111  chosen = NULL;
1112  impliesNN = NULL;
1113  }
1114 
1115  /*
1116  * Can we use this key as a starting boundary for this attr?
1117  *
1118  * If not, does it imply a NOT NULL constraint? (Because
1119  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1120  * *any* inequality key works for that; we need not test.)
1121  */
1122  switch (cur->sk_strategy)
1123  {
1124  case BTLessStrategyNumber:
1125  case BTLessEqualStrategyNumber:
1126  if (chosen == NULL)
1127  {
1128  if (ScanDirectionIsBackward(dir))
1129  chosen = cur;
1130  else
1131  impliesNN = cur;
1132  }
1133  break;
1134  case BTEqualStrategyNumber:
1135  /* override any non-equality choice */
1136  chosen = cur;
1137  break;
1138  case BTGreaterEqualStrategyNumber:
1139  case BTGreaterStrategyNumber:
1140  if (chosen == NULL)
1141  {
1142  if (ScanDirectionIsForward(dir))
1143  chosen = cur;
1144  else
1145  impliesNN = cur;
1146  }
1147  break;
1148  }
1149  }
1150  }
1151 
1152  /*
1153  * If we found no usable boundary keys, we have to start from one end of
1154  * the tree. Walk down that edge to the first or last key, and scan from
1155  * there.
1156  */
1157  if (keysz == 0)
1158  {
1159  bool match;
1160 
1161  match = _bt_endpoint(scan, dir);
1162 
1163  if (!match)
1164  {
1165  /* No match, so mark (parallel) scan finished */
1166  _bt_parallel_done(scan);
1167  }
1168 
1169  return match;
1170  }
1171 
1172  /*
1173  * We want to start the scan somewhere within the index. Set up an
1174  * insertion scankey we can use to search for the boundary point we
1175  * identified above. The insertion scankey is built using the keys
1176  * identified by startKeys[]. (Remaining insertion scankey fields are
1177  * initialized after initial-positioning strategy is finalized.)
1178  */
1179  Assert(keysz <= INDEX_MAX_KEYS);
1180  for (i = 0; i < keysz; i++)
1181  {
1182  ScanKey cur = startKeys[i];
1183 
1184  Assert(cur->sk_attno == i + 1);
1185 
1186  if (cur->sk_flags & SK_ROW_HEADER)
1187  {
1188  /*
1189  * Row comparison header: look to the first row member instead.
1190  *
1191  * The member scankeys are already in insertion format (ie, they
1192  * have sk_func = 3-way-comparison function), but we have to watch
1193  * out for nulls, which _bt_preprocess_keys didn't check. A null
1194  * in the first row member makes the condition unmatchable, just
1195  * like qual_ok = false.
1196  */
1197  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1198 
1199  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1200  if (subkey->sk_flags & SK_ISNULL)
1201  {
1202  _bt_parallel_done(scan);
1203  return false;
1204  }
1205  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1206 
1207  /*
1208  * If the row comparison is the last positioning key we accepted,
1209  * try to add additional keys from the lower-order row members.
1210  * (If we accepted independent conditions on additional index
1211  * columns, we use those instead --- doesn't seem worth trying to
1212  * determine which is more restrictive.) Note that this is OK
1213  * even if the row comparison is of ">" or "<" type, because the
1214  * condition applied to all but the last row member is effectively
1215  * ">=" or "<=", and so the extra keys don't break the positioning
1216  * scheme. But, by the same token, if we aren't able to use all
1217  * the row members, then the part of the row comparison that we
1218  * did use has to be treated as just a ">=" or "<=" condition, and
1219  * so we'd better adjust strat_total accordingly.
1220  */
1221  if (i == keysz - 1)
1222  {
1223  bool used_all_subkeys = false;
1224 
1225  Assert(!(subkey->sk_flags & SK_ROW_END));
1226  for (;;)
1227  {
1228  subkey++;
1229  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1230  if (subkey->sk_attno != keysz + 1)
1231  break; /* out-of-sequence, can't use it */
1232  if (subkey->sk_strategy != cur->sk_strategy)
1233  break; /* wrong direction, can't use it */
1234  if (subkey->sk_flags & SK_ISNULL)
1235  break; /* can't use null keys */
1236  Assert(keysz < INDEX_MAX_KEYS);
1237  memcpy(inskey.scankeys + keysz, subkey,
1238  sizeof(ScanKeyData));
1239  keysz++;
1240  if (subkey->sk_flags & SK_ROW_END)
1241  {
1242  used_all_subkeys = true;
1243  break;
1244  }
1245  }
1246  if (!used_all_subkeys)
1247  {
1248  switch (strat_total)
1249  {
1250  case BTLessStrategyNumber:
1251  strat_total = BTLessEqualStrategyNumber;
1252  break;
1253  case BTGreaterStrategyNumber:
1254  strat_total = BTGreaterEqualStrategyNumber;
1255  break;
1256  }
1257  }
1258  break; /* done with outer loop */
1259  }
1260  }
1261  else
1262  {
1263  /*
1264  * Ordinary comparison key. Transform the search-style scan key
1265  * to an insertion scan key by replacing the sk_func with the
1266  * appropriate btree comparison function.
1267  *
1268  * If scankey operator is not a cross-type comparison, we can use
1269  * the cached comparison function; otherwise gotta look it up in
1270  * the catalogs. (That can't lead to infinite recursion, since no
1271  * indexscan initiated by syscache lookup will use cross-data-type
1272  * operators.)
1273  *
1274  * We support the convention that sk_subtype == InvalidOid means
1275  * the opclass input type; this is a hack to simplify life for
1276  * ScanKeyInit().
1277  */
1278  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1279  cur->sk_subtype == InvalidOid)
1280  {
1281  FmgrInfo *procinfo;
1282 
1283  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1284  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1285  cur->sk_flags,
1286  cur->sk_attno,
1287  InvalidStrategy,
1288  cur->sk_subtype,
1289  cur->sk_collation,
1290  procinfo,
1291  cur->sk_argument);
1292  }
1293  else
1294  {
1295  RegProcedure cmp_proc;
1296 
1297  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1298  rel->rd_opcintype[i],
1299  cur->sk_subtype,
1300  BTORDER_PROC);
1301  if (!RegProcedureIsValid(cmp_proc))
1302  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1303  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1304  cur->sk_attno, RelationGetRelationName(rel));
1305  ScanKeyEntryInitialize(inskey.scankeys + i,
1306  cur->sk_flags,
1307  cur->sk_attno,
1308  InvalidStrategy,
1309  cur->sk_subtype,
1310  cur->sk_collation,
1311  cmp_proc,
1312  cur->sk_argument);
1313  }
1314  }
1315  }
1316 
1317  /*----------
1318  * Examine the selected initial-positioning strategy to determine exactly
1319  * where we need to start the scan, and set flag variables to control the
1320  * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1321  * page _bt_search returns).
1322  *----------
1323  */
1324  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1325  inskey.anynullkeys = false; /* unused */
1326  inskey.scantid = NULL;
1327  inskey.keysz = keysz;
1328  switch (strat_total)
1329  {
1330  case BTLessStrategyNumber:
1331 
1332  inskey.nextkey = false;
1333  inskey.backward = true;
1334  break;
1335 
1336  case BTLessEqualStrategyNumber:
1337 
1338  inskey.nextkey = true;
1339  inskey.backward = true;
1340  break;
1341 
1342  case BTEqualStrategyNumber:
1343 
1344  /*
1345  * If a backward scan was specified, need to start with last equal
1346  * item not first one.
1347  */
1348  if (ScanDirectionIsBackward(dir))
1349  {
1350  /*
1351  * This is the same as the <= strategy
1352  */
1353  inskey.nextkey = true;
1354  inskey.backward = true;
1355  }
1356  else
1357  {
1358  /*
1359  * This is the same as the >= strategy
1360  */
1361  inskey.nextkey = false;
1362  inskey.backward = false;
1363  }
1364  break;
1365 
1366  case BTGreaterEqualStrategyNumber:
1367 
1368  /*
1369  * Find first item >= scankey
1370  */
1371  inskey.nextkey = false;
1372  inskey.backward = false;
1373  break;
1374 
1375  case BTGreaterStrategyNumber:
1376 
1377  /*
1378  * Find first item > scankey
1379  */
1380  inskey.nextkey = true;
1381  inskey.backward = false;
1382  break;
1383 
1384  default:
1385  /* can't get here, but keep compiler quiet */
1386  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1387  return false;
1388  }
1389 
1390  /*
1391  * Use the manufactured insertion scan key to descend the tree and
1392  * position ourselves on the target leaf page.
1393  */
1394  Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1395  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1396 
1397  /* don't need to keep the stack around... */
1398  _bt_freestack(stack);
1399 
1400  if (!BufferIsValid(buf))
1401  {
1402  /*
1403  * We only get here if the index is completely empty. Lock relation
1404  * because nothing finer to lock exists. Without a buffer lock, it's
1405  * possible for another transaction to insert data between
1406  * _bt_search() and PredicateLockRelation(). We have to try again
1407  * after taking the relation-level predicate lock, to close a narrow
1408  * window where we wouldn't scan concurrently inserted tuples, but the
1409  * writer wouldn't see our predicate lock.
1410  */
1411  if (IsolationIsSerializable())
1412  {
1413  PredicateLockRelation(rel, scan->xs_snapshot);
1414  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1415  _bt_freestack(stack);
1416  }
1417 
1418  if (!BufferIsValid(buf))
1419  {
1420  /*
1421  * Mark parallel scan as done, so that all the workers can finish
1422  * their scan.
1423  */
1424  _bt_parallel_done(scan);
1425  BTScanPosInvalidate(so->currPos);
1426  return false;
1427  }
1428  }
1429 
1430  PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1431 
1432  _bt_initialize_more_data(so, dir);
1433 
1434  /* position to the precise item on the page */
1435  offnum = _bt_binsrch(rel, &inskey, buf);
1437  so->currPos.buf = buf;
1438 
1439  /*
1440  * Now load data from the first page of the scan.
1441  *
1442  * If inskey.nextkey = false and inskey.backward = false, offnum is
1443  * positioned at the first non-pivot tuple >= inskey.scankeys.
1444  *
1445  * If inskey.nextkey = false and inskey.backward = true, offnum is
1446  * positioned at the last non-pivot tuple < inskey.scankeys.
1447  *
1448  * If inskey.nextkey = true and inskey.backward = false, offnum is
1449  * positioned at the first non-pivot tuple > inskey.scankeys.
1450  *
1451  * If inskey.nextkey = true and inskey.backward = true, offnum is
1452  * positioned at the last non-pivot tuple <= inskey.scankeys.
1453  *
1454  * It's possible that _bt_binsrch returned an offnum that is out of bounds
1455  * for the page. For example, when inskey is both < the leaf page's high
1456  * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1457  */
1458  if (!_bt_readpage(scan, dir, offnum, true))
1459  {
1460  /*
1461  * There's no actually-matching data on this page. Try to advance to
1462  * the next page. Return false if there's no matching data at all.
1463  */
1464  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1465  if (!_bt_steppage(scan, dir))
1466  return false;
1467  }
1468  else
1469  {
1470  /* We have at least one item to return as scan's first item */
1471  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1472  }
1473 
1474 readcomplete:
1475  /* OK, itemIndex says what to return */
1476  currItem = &so->currPos.items[so->currPos.itemIndex];
1477  scan->xs_heaptid = currItem->heapTid;
1478  if (scan->xs_want_itup)
1479  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1480 
1481  return true;
1482 }
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define RegProcedureIsValid(p)
Definition: c.h:780
regproc RegProcedure
Definition: c.h:653
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:796
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:735
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
Definition: nbtree.c:603
#define BTORDER_PROC
Definition: nbtree.h:707
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1016
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2354
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:340
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2580
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:99
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:221
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1352
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:2556
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:660
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
bool needPrimScan
Definition: nbtree.h:1049
ScanKey keyData
Definition: nbtree.h:1045
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:167
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_start_array_keys(), _bt_steppage(), _bt_unlockbuf(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid(), cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().
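
The final switch on strat_total reduces to a small mapping from the chosen strategy (plus scan direction, for equality) onto the insertion scan key's nextkey and backward flags. The condensed sketch below restates that mapping with invented names; it is illustrative only, not the PostgreSQL code.

#include <stdbool.h>
#include <stdio.h>

typedef enum DemoStrategy
{
    DEMO_LESS, DEMO_LESS_EQUAL, DEMO_EQUAL, DEMO_GREATER_EQUAL, DEMO_GREATER
} DemoStrategy;

static void
initial_positioning(DemoStrategy strat, bool backward_scan,
                    bool *nextkey, bool *backward)
{
    switch (strat)
    {
        case DEMO_LESS:                 /* start at last tuple <  key */
            *nextkey = false;
            *backward = true;
            break;
        case DEMO_LESS_EQUAL:           /* start at last tuple <= key */
            *nextkey = true;
            *backward = true;
            break;
        case DEMO_EQUAL:                /* behaves like <= backward, >= forward */
            *nextkey = backward_scan;
            *backward = backward_scan;
            break;
        case DEMO_GREATER_EQUAL:        /* start at first tuple >= key */
            *nextkey = false;
            *backward = false;
            break;
        case DEMO_GREATER:              /* start at first tuple >  key */
            *nextkey = true;
            *backward = false;
            break;
    }
}

int
main(void)
{
    bool nextkey, backward;

    initial_positioning(DEMO_EQUAL, true, &nextkey, &backward);
    printf("backward equality scan: nextkey=%d backward=%d\n", nextkey, backward);
    return 0;
}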

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2499 of file nbtsearch.c.

2500 {
2501  Buffer buf;
2502  Page page;
2503  BTPageOpaque opaque;
2504  OffsetNumber offnum;
2505  BlockNumber blkno;
2506  IndexTuple itup;
2507 
2508  /*
2509  * If we are looking for a leaf page, okay to descend from fast root;
2510  * otherwise better descend from true root. (There is no point in being
2511  * smarter about intermediate levels.)
2512  */
2513  if (level == 0)
2514  buf = _bt_getroot(rel, NULL, BT_READ);
2515  else
2516  buf = _bt_gettrueroot(rel);
2517 
2518  if (!BufferIsValid(buf))
2519  return InvalidBuffer;
2520 
2521  page = BufferGetPage(buf);
2522  opaque = BTPageGetOpaque(page);
2523 
2524  for (;;)
2525  {
2526  /*
2527  * If we landed on a deleted page, step right to find a live page
2528  * (there must be one). Also, if we want the rightmost page, step
2529  * right if needed to get to it (this could happen if the page split
2530  * since we obtained a pointer to it).
2531  */
2532  while (P_IGNORE(opaque) ||
2533  (rightmost && !P_RIGHTMOST(opaque)))
2534  {
2535  blkno = opaque->btpo_next;
2536  if (blkno == P_NONE)
2537  elog(ERROR, "fell off the end of index \"%s\"",
2538  RelationGetRelationName(rel));
2539  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2540  page = BufferGetPage(buf);
2541  opaque = BTPageGetOpaque(page);
2542  }
2543 
2544  /* Done? */
2545  if (opaque->btpo_level == level)
2546  break;
2547  if (opaque->btpo_level < level)
2548  ereport(ERROR,
2549  (errcode(ERRCODE_INDEX_CORRUPTED),
2550  errmsg_internal("btree level %u not found in index \"%s\"",
2551  level, RelationGetRelationName(rel))));
2552 
2553  /* Descend to leftmost or rightmost child page */
2554  if (rightmost)
2555  offnum = PageGetMaxOffsetNumber(page);
2556  else
2557  offnum = P_FIRSTDATAKEY(opaque);
2558 
2559  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2560  blkno = BTreeTupleGetDownLink(itup);
2561 
2562  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2563  page = BufferGetPage(buf);
2564  opaque = BTPageGetOpaque(page);
2565  }
2566 
2567  return buf;
2568 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
BlockNumber btpo_next
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inline static

Definition at line 2670 of file nbtsearch.c.

2671 {
2672  so->currPos.dir = dir;
2673  if (so->needPrimScan)
2674  {
2675  Assert(so->numArrayKeys);
2676 
2677  so->currPos.moreLeft = true;
2678  so->currPos.moreRight = true;
2679  so->needPrimScan = false;
2680  }
2681  else if (ScanDirectionIsForward(dir))
2682  {
2683  so->currPos.moreLeft = false;
2684  so->currPos.moreRight = true;
2685  }
2686  else
2687  {
2688  so->currPos.moreLeft = true;
2689  so->currPos.moreRight = false;
2690  }
2691  so->numKilled = 0; /* just paranoia */
2692  so->markItemIndex = -1; /* ditto */
2693 }
bool moreRight
Definition: nbtree.h:966
bool moreLeft
Definition: nbtree.h:965
ScanDirection dir
Definition: nbtree.h:975

References Assert, BTScanOpaqueData::currPos, BTScanPosData::dir, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

◆ _bt_moveright()

static Buffer _bt_moveright ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access 
)
static

Definition at line 238 of file nbtsearch.c.

245 {
246  Page page;
247  BTPageOpaque opaque;
248  int32 cmpval;
249 
250  Assert(!forupdate || heaprel != NULL);
251 
252  /*
253  * When nextkey = false (normal case): if the scan key that brought us to
254  * this page is > the high key stored on the page, then the page has split
255  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
256  * have some duplicates to the right as well as the left, but that's
257  * something that's only ever dealt with on the leaf level, after
258  * _bt_search has found an initial leaf page.)
259  *
260  * When nextkey = true: move right if the scan key is >= page's high key.
261  * (Note that key.scantid cannot be set in this case.)
262  *
263  * The page could even have split more than once, so scan as far as
264  * needed.
265  *
266  * We also have to move right if we followed a link that brought us to a
267  * dead page.
268  */
269  cmpval = key->nextkey ? 0 : 1;
270 
271  for (;;)
272  {
273  page = BufferGetPage(buf);
274  opaque = BTPageGetOpaque(page);
275 
276  if (P_RIGHTMOST(opaque))
277  break;
278 
279  /*
280  * Finish any incomplete splits we encounter along the way.
281  */
282  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
283  {
284  BlockNumber blkno = BufferGetBlockNumber(buf);
285 
286  /* upgrade our lock if necessary */
287  if (access == BT_READ)
288  {
289  _bt_unlockbuf(rel, buf);
290  _bt_lockbuf(rel, buf, BT_WRITE);
291  }
292 
293  if (P_INCOMPLETE_SPLIT(opaque))
294  _bt_finish_split(rel, heaprel, buf, stack);
295  else
296  _bt_relbuf(rel, buf);
297 
298  /* re-acquire the lock in the right mode, and re-check */
299  buf = _bt_getbuf(rel, blkno, access);
300  continue;
301  }
302 
303  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
304  {
305  /* step right one page */
306  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
307  continue;
308  }
309  else
310  break;
311  }
312 
313  if (P_IGNORE(opaque))
314  elog(ERROR, "fell off the end of index \"%s\"",
315  RelationGetRelationName(rel));
316 
317  return buf;
318 }
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
#define P_HIKEY
Definition: nbtree.h:367
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BT_WRITE
Definition: nbtree.h:720

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog, ERROR, sort-test::key, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().
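
The cmpval trick is the heart of this routine: with nextkey = false we must move right while the scan key compares strictly greater than the high key (comparison result >= 1), and with nextkey = true while it compares greater than or equal (result >= 0). The fragment below, a hedged restatement under a hypothetical name (must_move_right), isolates that single decision using the same helpers the listing uses.

    /* Hedged illustration of the high-key test _bt_moveright() applies.
     * Returns true when the scan key can only lie to the right of this
     * page, i.e. the page must have split after we read its downlink. */
    static bool
    must_move_right(Relation rel, BTScanInsert key, Page page)
    {
        BTPageOpaque opaque = BTPageGetOpaque(page);
        int32        cmpval = key->nextkey ? 0 : 1;

        if (P_RIGHTMOST(opaque))
            return false;           /* no right sibling: nowhere to go */
        if (P_IGNORE(opaque))
            return true;            /* half-dead/deleted page: keep walking */

        /*
         * nextkey = false: move right when scankey >  high key (cmp >= 1)
         * nextkey = true:  move right when scankey >= high key (cmp >= 0)
         */
        return _bt_compare(rel, key, page, P_HIKEY) >= cmpval;
    }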

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1499 of file nbtsearch.c.

1500 {
1501  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1502  BTScanPosItem *currItem;
1503 
1504  /*
1505  * Advance to next tuple on current page; or if there's no more, try to
1506  * step to the next page with data.
1507  */
1508  if (ScanDirectionIsForward(dir))
1509  {
1510  if (++so->currPos.itemIndex > so->currPos.lastItem)
1511  {
1512  if (!_bt_steppage(scan, dir))
1513  return false;
1514  }
1515  }
1516  else
1517  {
1518  if (--so->currPos.itemIndex < so->currPos.firstItem)
1519  {
1520  if (!_bt_steppage(scan, dir))
1521  return false;
1522  }
1523  }
1524 
1525  /* OK, itemIndex says what to return */
1526  currItem = &so->currPos.items[so->currPos.itemIndex];
1527  scan->xs_heaptid = currItem->heapTid;
1528  if (scan->xs_want_itup)
1529  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1530 
1531  return true;
1532 }
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
int firstItem
Definition: nbtree.h:990
int lastItem
Definition: nbtree.h:991

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().
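
The caller protocol is the one btgettuple() and btgetbitmap() follow: one _bt_first() call positions the scan, after which _bt_next() is called until it returns false. The sketch below (count_matches is a hypothetical name) assumes "scan" is an nbtree IndexScanDesc that has already been set up through btbeginscan()/btrescan(); it merely counts the heap TIDs the scan produces.

    /* Hedged sketch of the _bt_first()/_bt_next() driving loop. */
    static int64
    count_matches(IndexScanDesc scan)
    {
        int64 ntids = 0;

        if (!_bt_first(scan, ForwardScanDirection))
            return 0;               /* no matches at all */

        do
        {
            /* scan->xs_heaptid now identifies the current match */
            ntids++;
        } while (_bt_next(scan, ForwardScanDirection));

        return ntids;
    }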

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2354 of file nbtsearch.c.

2355 {
2356  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2357 
2358  Assert(!so->needPrimScan);
2359 
2360  _bt_initialize_more_data(so, dir);
2361 
2362  if (!_bt_readnextpage(scan, blkno, dir))
2363  return false;
2364 
2365  /* We have at least one item to return as scan's next item */
2366  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2367 
2368  return true;
2369 }
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2179

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), Assert, BTScanOpaqueData::currPos, BTScanOpaqueData::needPrimScan, and IndexScanDescData::opaque.

Referenced by _bt_first().

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2179 of file nbtsearch.c.

2180 {
2181  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2182  Relation rel;
2183  Page page;
2184  BTPageOpaque opaque;
2185  bool status;
2186 
2187  rel = scan->indexRelation;
2188 
2189  if (ScanDirectionIsForward(dir))
2190  {
2191  for (;;)
2192  {
2193  /*
2194  * if we're at end of scan, give up and mark parallel scan as
2195  * done, so that all the workers can finish their scan
2196  */
2197  if (blkno == P_NONE || !so->currPos.moreRight)
2198  {
2199  _bt_parallel_done(scan);
2200  BTScanPosInvalidate(so->currPos);
2201  return false;
2202  }
2203  /* check for interrupts while we're not holding any buffer lock */
2205  /* step right one page */
2206  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2207  page = BufferGetPage(so->currPos.buf);
2208  opaque = BTPageGetOpaque(page);
2209  /* check for deleted page */
2210  if (!P_IGNORE(opaque))
2211  {
2212  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2213  /* see if there are any matches on this page */
2214  /* note that this will clear moreRight if we can stop */
2215  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2216  break;
2217  }
2218  else if (scan->parallel_scan != NULL)
2219  {
2220  /* allow next page to be processed by parallel worker */
2221  _bt_parallel_release(scan, opaque->btpo_next);
2222  }
2223 
2224  /* nope, keep going */
2225  if (scan->parallel_scan != NULL)
2226  {
2227  _bt_relbuf(rel, so->currPos.buf);
2228  status = _bt_parallel_seize(scan, &blkno, false);
2229  if (!status)
2230  {
2232  return false;
2233  }
2234  }
2235  else
2236  {
2237  blkno = opaque->btpo_next;
2238  _bt_relbuf(rel, so->currPos.buf);
2239  }
2240  }
2241  }
2242  else
2243  {
2244  /*
2245  * Should only happen in parallel cases, when some other backend
2246  * advanced the scan.
2247  */
2248  if (so->currPos.currPage != blkno)
2249  {
2251  so->currPos.currPage = blkno;
2252  }
2253 
2254  /* Done if we know that the left sibling link isn't of interest */
2255  if (!so->currPos.moreLeft)
2256  {
2258  _bt_parallel_done(scan);
2260  return false;
2261  }
2262 
2263  /*
2264  * Walk left to the next page with data. This is much more complex
2265  * than the walk-right case because of the possibility that the page
2266  * to our left splits while we are in flight to it, plus the
2267  * possibility that the page we were on gets deleted after we leave
2268  * it. See nbtree/README for details.
2269  *
2270  * It might be possible to rearrange this code to have less overhead
2271  * in pinning and locking, but that would require capturing the left
2272  * sibling block number when the page is initially read, and then
2273  * optimistically starting there (rather than pinning the page twice).
2274  * It is not clear that this would be worth the complexity.
2275  */
2276  if (BTScanPosIsPinned(so->currPos))
2277  _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2278  else
2279  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2280 
2281  for (;;)
2282  {
2283  /* Done if we know that the left sibling link isn't of interest */
2284  if (!so->currPos.moreLeft)
2285  {
2286  _bt_relbuf(rel, so->currPos.buf);
2287  _bt_parallel_done(scan);
2289  return false;
2290  }
2291 
2292  /* Step to next physical page */
2293  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2294 
2295  /* if we're physically at end of index, return failure */
2296  if (so->currPos.buf == InvalidBuffer)
2297  {
2298  _bt_parallel_done(scan);
2300  return false;
2301  }
2302 
2303  /*
2304  * Okay, we managed to move left to a non-deleted page. Done if
2305  * it's not half-dead and contains matching tuples. Else loop back
2306  * and do it all again.
2307  */
2308  page = BufferGetPage(so->currPos.buf);
2309  opaque = BTPageGetOpaque(page);
2310  if (!P_IGNORE(opaque))
2311  {
2313  /* see if there are any matches on this page */
2314  /* note that this will clear moreLeft if we can stop */
2315  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2316  break;
2317  }
2318  else if (scan->parallel_scan != NULL)
2319  {
2320  /* allow next page to be processed by parallel worker */
2322  }
2323 
2324  /*
2325  * For parallel scans, get the last page scanned as it is quite
2326  * possible that by the time we try to seize the scan, some other
2327  * worker has already advanced the scan to a different page. We
2328  * must continue based on the latest page scanned by any worker.
2329  */
2330  if (scan->parallel_scan != NULL)
2331  {
2332  _bt_relbuf(rel, so->currPos.buf);
2333  status = _bt_parallel_seize(scan, &blkno, false);
2334  if (!status)
2335  {
2337  return false;
2338  }
2339  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2340  }
2341  }
2342  }
2343 
2344  return true;
2345 }
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:712
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:999
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1010
static Buffer _bt_walk_left(Relation rel, Buffer buf)
Definition: nbtsearch.c:2385
BlockNumber currPage
Definition: nbtree.h:956

References _bt_getbuf(), _bt_lockbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTPageGetOpaque, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, if(), IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().
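
In the serial (non-parallel) forward case this function reduces to following right-sibling links, skipping half-dead or deleted pages, until a page with matches or P_NONE is reached. The sketch below (count_leaf_pages_right is a hypothetical name) walks the same sibling chain but replaces the _bt_readpage() matching step with a simple page counter, as a hedged illustration only.

    /* Hedged sketch of the serial right-sibling walk. */
    static int
    count_leaf_pages_right(Relation rel, BlockNumber blkno)
    {
        int npages = 0;

        while (blkno != P_NONE)
        {
            Buffer        buf;
            Page          page;
            BTPageOpaque  opaque;

            CHECK_FOR_INTERRUPTS();             /* not holding any buffer lock */
            buf = _bt_getbuf(rel, blkno, BT_READ);
            page = BufferGetPage(buf);
            opaque = BTPageGetOpaque(page);

            if (!P_IGNORE(opaque))              /* skip half-dead/deleted pages */
                npages++;

            blkno = opaque->btpo_next;          /* step right one page */
            _bt_relbuf(rel, buf);
        }

        return npages;
    }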

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum,
bool  firstPage 
)
static

Definition at line 1563 of file nbtsearch.c.

1565 {
1566  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1567  Page page;
1568  BTPageOpaque opaque;
1569  OffsetNumber minoff;
1570  OffsetNumber maxoff;
1571  BTReadPageState pstate;
1572  bool arrayKeys;
1573  int itemIndex,
1574  indnatts;
1575 
1576  /*
1577  * We must have the buffer pinned and locked, but the usual macro can't be
1578  * used here; this function is what makes it good for currPos.
1579  */
1580  Assert(BufferIsValid(so->currPos.buf));
1581 
1582  page = BufferGetPage(so->currPos.buf);
1583  opaque = BTPageGetOpaque(page);
1584 
1585  /* allow next page to be processed by parallel worker */
1586  if (scan->parallel_scan)
1587  {
1588  if (ScanDirectionIsForward(dir))
1589  pstate.prev_scan_page = opaque->btpo_next;
1590  else
1591  pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
1592 
1593  _bt_parallel_release(scan, pstate.prev_scan_page);
1594  }
1595 
1596  indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1597  arrayKeys = so->numArrayKeys != 0;
1598  minoff = P_FIRSTDATAKEY(opaque);
1599  maxoff = PageGetMaxOffsetNumber(page);
1600 
1601  /* initialize page-level state that we'll pass to _bt_checkkeys */
1602  pstate.dir = dir;
1603  pstate.minoff = minoff;
1604  pstate.maxoff = maxoff;
1605  pstate.finaltup = NULL;
1606  pstate.page = page;
1607  pstate.offnum = InvalidOffsetNumber;
1608  pstate.skip = InvalidOffsetNumber;
1609  pstate.continuescan = true; /* default assumption */
1610  pstate.prechecked = false;
1611  pstate.firstmatch = false;
1612  pstate.rechecks = 0;
1613  pstate.targetdistance = 0;
1614 
1615  /*
1616  * We note the buffer's block number so that we can release the pin later.
1617  * This allows us to re-read the buffer if it is needed again for hinting.
1618  */
1619  so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1620 
1621  /*
1622  * We save the LSN of the page as we read it, so that we know whether it is
1623  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1624  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1625  */
1626  so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1627 
1628  /*
1629  * we must save the page's right-link while scanning it; this tells us
1630  * where to step right to after we're done with these items. There is no
1631  * corresponding need for the left-link, since splits always go right.
1632  */
1633  so->currPos.nextPage = opaque->btpo_next;
1634 
1635  /* initialize tuple workspace to empty */
1636  so->currPos.nextTupleOffset = 0;
1637 
1638  /*
1639  * Now that the current page has been made consistent, the macro should be
1640  * good.
1641  */
1642  Assert(BTScanPosIsPinned(so->currPos));
1643 
1644  /*
1645  * Prechecking the value of the continuescan flag for the last item on the
1646  * page (for backwards scan it will be the first item on a page). If we
1647  * observe it to be true, then it should be true for all other items. This
1648  * allows us to do significant optimizations in the _bt_checkkeys()
1649  * function for all the items on the page.
1650  *
1651  * With the forward scan, we do this check for the last item on the page
1652  * instead of the high key. It's relatively likely that the most
1653  * significant column in the high key will be different from the
1654  * corresponding value from the last item on the page. So checking with
1655  * the last item on the page would give a more precise answer.
1656  *
1657  * We skip this for the first page read by each (primitive) scan, to avoid
1658  * slowing down point queries. They typically don't stand to gain much
1659  * when the optimization can be applied, and are more likely to notice the
1660  * overhead of the precheck.
1661  *
1662  * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1663  * just set a low-order required array's key to the best available match
1664  * for a truncated -inf attribute value from the prior page's high key
1665  * (array element 0 is always the best available match in this scenario).
1666  * It's quite likely that matches for array element 0 begin on this page,
1667  * but the start of matches won't necessarily align with page boundaries.
1668  * When the start of matches is somewhere in the middle of this page, it
1669  * would be wrong to treat page's final non-pivot tuple as representative.
1670  * Doing so might lead us to treat some of the page's earlier tuples as
1671  * being part of a group of tuples thought to satisfy the required keys.
1672  *
1673  * Note: Conversely, in the case where the scan's arrays just advanced
1674  * using the prior page's HIKEY _without_ advancement setting scanBehind,
1675  * the start of matches must be aligned with page boundaries, which makes
1676  * it safe to attempt the optimization here now. It's also safe when the
1677  * prior page's HIKEY simply didn't need to advance any required array. In
1678  * both cases we can safely assume that the _first_ tuple from this page
1679  * must be >= the current set of array keys/equality constraints. And so
1680  * if the final tuple is == those same keys (and also satisfies any
1681  * required < or <= strategy scan keys) during the precheck, we can safely
1682  * assume that this must also be true of all earlier tuples from the page.
1683  */
1684  if (!firstPage && !so->scanBehind && minoff < maxoff)
1685  {
1686  ItemId iid;
1687  IndexTuple itup;
1688 
1689  iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1690  itup = (IndexTuple) PageGetItem(page, iid);
1691 
1692  /* Call with arrayKeys=false to avoid undesirable side-effects */
1693  _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1694  pstate.prechecked = pstate.continuescan;
1695  pstate.continuescan = true; /* reset */
1696  }
1697 
1698  if (ScanDirectionIsForward(dir))
1699  {
1700  /* SK_SEARCHARRAY forward scans must provide high key up front */
1701  if (arrayKeys && !P_RIGHTMOST(opaque))
1702  {
1703  ItemId iid = PageGetItemId(page, P_HIKEY);
1704 
1705  pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1706  }
1707 
1708  /* load items[] in ascending order */
1709  itemIndex = 0;
1710 
1711  offnum = Max(offnum, minoff);
1712 
1713  while (offnum <= maxoff)
1714  {
1715  ItemId iid = PageGetItemId(page, offnum);
1716  IndexTuple itup;
1717  bool passes_quals;
1718 
1719  /*
1720  * If the scan specifies not to return killed tuples, then we
1721  * treat a killed tuple as not passing the qual
1722  */
1723  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1724  {
1725  offnum = OffsetNumberNext(offnum);
1726  continue;
1727  }
1728 
1729  itup = (IndexTuple) PageGetItem(page, iid);
1730  Assert(!BTreeTupleIsPivot(itup));
1731 
1732  pstate.offnum = offnum;
1733  passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1734  itup, indnatts);
1735 
1736  /*
1737  * Check if we need to skip ahead to a later tuple (only possible
1738  * when the scan uses array keys)
1739  */
1740  if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1741  {
1742  Assert(!passes_quals && pstate.continuescan);
1743  Assert(offnum < pstate.skip);
1744 
1745  offnum = pstate.skip;
1746  pstate.skip = InvalidOffsetNumber;
1747  continue;
1748  }
1749 
1750  if (passes_quals)
1751  {
1752  /* tuple passes all scan key conditions */
1753  pstate.firstmatch = true;
1754  if (!BTreeTupleIsPosting(itup))
1755  {
1756  /* Remember it */
1757  _bt_saveitem(so, itemIndex, offnum, itup);
1758  itemIndex++;
1759  }
1760  else
1761  {
1762  int tupleOffset;
1763 
1764  /*
1765  * Set up state to return posting list, and remember first
1766  * TID
1767  */
1768  tupleOffset =
1769  _bt_setuppostingitems(so, itemIndex, offnum,
1770  BTreeTupleGetPostingN(itup, 0),
1771  itup);
1772  itemIndex++;
1773  /* Remember additional TIDs */
1774  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1775  {
1776  _bt_savepostingitem(so, itemIndex, offnum,
1777  BTreeTupleGetPostingN(itup, i),
1778  tupleOffset);
1779  itemIndex++;
1780  }
1781  }
1782  }
1783  /* When !continuescan, there can't be any more matches, so stop */
1784  if (!pstate.continuescan)
1785  break;
1786 
1787  offnum = OffsetNumberNext(offnum);
1788  }
1789 
1790  /*
1791  * We don't need to visit page to the right when the high key
1792  * indicates that no more matches will be found there.
1793  *
1794  * Checking the high key like this works out more often than you might
1795  * think. Leaf page splits pick a split point between the two most
1796  * dissimilar tuples (this is weighed against the need to evenly share
1797  * free space). Leaf pages with high key attribute values that can
1798  * only appear on non-pivot tuples on the right sibling page are
1799  * common.
1800  */
1801  if (pstate.continuescan && !P_RIGHTMOST(opaque))
1802  {
1803  ItemId iid = PageGetItemId(page, P_HIKEY);
1804  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1805  int truncatt;
1806 
1807  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1808  pstate.prechecked = false; /* precheck didn't cover HIKEY */
1809  _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1810  }
1811 
1812  if (!pstate.continuescan)
1813  so->currPos.moreRight = false;
1814 
1815  Assert(itemIndex <= MaxTIDsPerBTreePage);
1816  so->currPos.firstItem = 0;
1817  so->currPos.lastItem = itemIndex - 1;
1818  so->currPos.itemIndex = 0;
1819  }
1820  else
1821  {
1822  /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1823  if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1824  {
1825  ItemId iid = PageGetItemId(page, minoff);
1826 
1827  pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1828  }
1829 
1830  /* load items[] in descending order */
1831  itemIndex = MaxTIDsPerBTreePage;
1832 
1833  offnum = Min(offnum, maxoff);
1834 
1835  while (offnum >= minoff)
1836  {
1837  ItemId iid = PageGetItemId(page, offnum);
1838  IndexTuple itup;
1839  bool tuple_alive;
1840  bool passes_quals;
1841 
1842  /*
1843  * If the scan specifies not to return killed tuples, then we
1844  * treat a killed tuple as not passing the qual. Most of the
1845  * time, it's a win to not bother examining the tuple's index
1846  * keys, but just skip to the next tuple (previous, actually,
1847  * since we're scanning backwards). However, if this is the first
1848  * tuple on the page, we do check the index keys, to prevent
1849  * uselessly advancing to the page to the left. This is similar
1850  * to the high key optimization used by forward scans.
1851  */
1852  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1853  {
1854  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1855  if (offnum > P_FIRSTDATAKEY(opaque))
1856  {
1857  offnum = OffsetNumberPrev(offnum);
1858  continue;
1859  }
1860 
1861  tuple_alive = false;
1862  }
1863  else
1864  tuple_alive = true;
1865 
1866  itup = (IndexTuple) PageGetItem(page, iid);
1867  Assert(!BTreeTupleIsPivot(itup));
1868 
1869  pstate.offnum = offnum;
1870  passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1871  itup, indnatts);
1872 
1873  /*
1874  * Check if we need to skip ahead to a later tuple (only possible
1875  * when the scan uses array keys)
1876  */
1877  if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1878  {
1879  Assert(!passes_quals && pstate.continuescan);
1880  Assert(offnum > pstate.skip);
1881 
1882  offnum = pstate.skip;
1883  pstate.skip = InvalidOffsetNumber;
1884  continue;
1885  }
1886 
1887  if (passes_quals && tuple_alive)
1888  {
1889  /* tuple passes all scan key conditions */
1890  pstate.firstmatch = true;
1891  if (!BTreeTupleIsPosting(itup))
1892  {
1893  /* Remember it */
1894  itemIndex--;
1895  _bt_saveitem(so, itemIndex, offnum, itup);
1896  }
1897  else
1898  {
1899  int tupleOffset;
1900 
1901  /*
1902  * Set up state to return posting list, and remember first
1903  * TID.
1904  *
1905  * Note that we deliberately save/return items from
1906  * posting lists in ascending heap TID order for backwards
1907  * scans. This allows _bt_killitems() to make a
1908  * consistent assumption about the order of items
1909  * associated with the same posting list tuple.
1910  */
1911  itemIndex--;
1912  tupleOffset =
1913  _bt_setuppostingitems(so, itemIndex, offnum,
1914  BTreeTupleGetPostingN(itup, 0),
1915  itup);
1916  /* Remember additional TIDs */
1917  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1918  {
1919  itemIndex--;
1920  _bt_savepostingitem(so, itemIndex, offnum,
1921  BTreeTupleGetPostingN(itup, i),
1922  tupleOffset);
1923  }
1924  }
1925  }
1926  /* When !continuescan, there can't be any more matches, so stop */
1927  if (!pstate.continuescan)
1928  break;
1929 
1930  offnum = OffsetNumberPrev(offnum);
1931  }
1932 
1933  /*
1934  * We don't need to visit page to the left when no more matches will
1935  * be found there
1936  */
1937  if (!pstate.continuescan || P_LEFTMOST(opaque))
1938  so->currPos.moreLeft = false;
1939 
1940  Assert(itemIndex >= 0);
1941  so->currPos.firstItem = itemIndex;
1942  so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1943  so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1944  }
1945 
1946  return (so->currPos.firstItem <= so->currPos.lastItem);
1947 }
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3985
#define Max(x, y)
Definition: c.h:1001
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1951
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1981
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:2019
bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtutils.c:3491
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:517
int nextTupleOffset
Definition: nbtree.h:981
BlockNumber nextPage
Definition: nbtree.h:957
XLogRecPtr lsn
Definition: nbtree.h:955
bool ignore_killed_tuples
Definition: relscan.h:130

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_setuppostingitems(), Assert, BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage(), BufferIsValid(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, InvalidOffsetNumber, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, OffsetNumberIsValid, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_LEFTMOST, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, BTScanOpaqueData::scanBehind, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().
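
A point worth isolating from the listing above: every matching non-pivot tuple contributes either a single TID (_bt_saveitem) or, for posting list tuples, all of its TIDs in ascending heap order (_bt_setuppostingitems plus _bt_savepostingitem). The sketch below (collect_page_tids is a hypothetical name) shows just that expansion for one read-locked leaf page, with all qual checking (_bt_checkkeys) deliberately omitted.

    /* Hedged sketch of per-page TID expansion, no qual checking. */
    static int
    collect_page_tids(Buffer buf, ItemPointerData *tids, int maxtids)
    {
        Page          page = BufferGetPage(buf);
        BTPageOpaque  opaque = BTPageGetOpaque(page);
        OffsetNumber  minoff = P_FIRSTDATAKEY(opaque);
        OffsetNumber  maxoff = PageGetMaxOffsetNumber(page);
        int           ntids = 0;

        for (OffsetNumber offnum = minoff; offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            IndexTuple itup = (IndexTuple) PageGetItem(page,
                                                       PageGetItemId(page, offnum));

            if (!BTreeTupleIsPosting(itup))
            {
                if (ntids < maxtids)
                    tids[ntids++] = itup->t_tid;    /* plain tuple: one TID */
            }
            else
            {
                /* posting list tuple: one entry per stored heap TID */
                for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
                {
                    if (ntids < maxtids)
                        tids[ntids++] = *BTreeTupleGetPostingN(itup, i);
                }
            }
        }

        return ntids;
    }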

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1951 of file nbtsearch.c.

1953 {
1954  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1955 
1956  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1957 
1958  currItem->heapTid = itup->t_tid;
1959  currItem->indexOffset = offnum;
1960  if (so->currTuples)
1961  {
1962  Size itupsz = IndexTupleSize(itup);
1963 
1964  currItem->tupleOffset = so->currPos.nextTupleOffset;
1965  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1966  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1967  }
1968 }
#define MAXALIGN(LEN)
Definition: c.h:814
size_t Size
Definition: c.h:608
#define IndexTupleSize(itup)
Definition: itup.h:70
ItemPointerData heapTid
Definition: nbtree.h:946
LocationIndex tupleOffset
Definition: nbtree.h:948
OffsetNumber indexOffset
Definition: nbtree.h:947
ItemPointerData t_tid
Definition: itup.h:37

References Assert, BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().
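
The tupleOffset bookkeeping above is what later lets _bt_next() hand back xs_itup: each saved copy lives at currTuples + tupleOffset, and MAXALIGN keeps successive copies properly aligned. A hedged illustration of the retrieval side (fetch_saved_itup is a hypothetical name):

    /* Hedged illustration of reading a saved item back out of the
     * currTuples workspace, mirroring what _bt_next() does for xs_itup. */
    static IndexTuple
    fetch_saved_itup(BTScanOpaque so, int itemIndex)
    {
        BTScanPosItem *currItem = &so->currPos.items[itemIndex];

        Assert(so->currTuples != NULL);     /* only set up when tuples are wanted */
        return (IndexTuple) (so->currTuples + currItem->tupleOffset);
    }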

◆ _bt_savepostingitem()

static void _bt_savepostingitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
int  tupleOffset 
)
inlinestatic

Definition at line 2019 of file nbtsearch.c.

2021 {
2022  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2023 
2024  currItem->heapTid = *heapTid;
2025  currItem->indexOffset = offnum;
2026 
2027  /*
2028  * Have index-only scans return the same base IndexTuple for every TID
2029  * that originates from the same posting list
2030  */
2031  if (so->currTuples)
2032  currItem->tupleOffset = tupleOffset;
2033 }

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 99 of file nbtsearch.c.

101 {
102  BTStack stack_in = NULL;
103  int page_access = BT_READ;
104 
105  /* heaprel must be set whenever _bt_allocbuf is reachable */
106  Assert(access == BT_READ || access == BT_WRITE);
107  Assert(access == BT_READ || heaprel != NULL);
108 
109  /* Get the root page to start with */
110  *bufP = _bt_getroot(rel, heaprel, access);
111 
112  /* If index is empty and access = BT_READ, no root page is created. */
113  if (!BufferIsValid(*bufP))
114  return (BTStack) NULL;
115 
116  /* Loop iterates once per level descended in the tree */
117  for (;;)
118  {
119  Page page;
120  BTPageOpaque opaque;
121  OffsetNumber offnum;
122  ItemId itemid;
123  IndexTuple itup;
124  BlockNumber child;
125  BTStack new_stack;
126 
127  /*
128  * Race -- the page we just grabbed may have split since we read its
129  * downlink in its parent page (or the metapage). If it has, we may
130  * need to move right to its new sibling. Do that.
131  *
132  * In write-mode, allow _bt_moveright to finish any incomplete splits
133  * along the way. Strictly speaking, we'd only need to finish an
134  * incomplete split on the leaf page we're about to insert to, not on
135  * any of the upper levels (internal pages with incomplete splits are
136  * also taken care of in _bt_getstackbuf). But this is a good
137  * opportunity to finish splits of internal pages too.
138  */
139  *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
140  stack_in, page_access);
141 
142  /* if this is a leaf page, we're done */
143  page = BufferGetPage(*bufP);
144  opaque = BTPageGetOpaque(page);
145  if (P_ISLEAF(opaque))
146  break;
147 
148  /*
149  * Find the appropriate pivot tuple on this page. Its downlink points
150  * to the child page that we're about to descend to.
151  */
152  offnum = _bt_binsrch(rel, key, *bufP);
153  itemid = PageGetItemId(page, offnum);
154  itup = (IndexTuple) PageGetItem(page, itemid);
155  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
156  child = BTreeTupleGetDownLink(itup);
157 
158  /*
159  * We need to save the location of the pivot tuple we chose in a new
160  * stack entry for this page/level. If caller ends up splitting a
161  * page one level down, it usually ends up inserting a new pivot
162  * tuple/downlink immediately after the location recorded here.
163  */
164  new_stack = (BTStack) palloc(sizeof(BTStackData));
165  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
166  new_stack->bts_offset = offnum;
167  new_stack->bts_parent = stack_in;
168 
169  /*
170  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
171  * we're on the level 1 and asked to lock leaf page in write mode,
172  * then lock next page in write mode, because it must be a leaf.
173  */
174  if (opaque->btpo_level == 1 && access == BT_WRITE)
175  page_access = BT_WRITE;
176 
177  /* drop the read lock on the page, then acquire one on its child */
178  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
179 
180  /* okay, all set to move down a level */
181  stack_in = new_stack;
182  }
183 
184  /*
185  * If we're asked to lock leaf in write mode, but didn't manage to, then
186  * relock. This should only happen when the root page is a leaf page (and
187  * the only page in the index other than the metapage).
188  */
189  if (access == BT_WRITE && page_access == BT_READ)
190  {
191  /* trade in our read lock for a write lock */
192  _bt_unlockbuf(rel, *bufP);
193  _bt_lockbuf(rel, *bufP, BT_WRITE);
194 
195  /*
196  * Race -- the leaf page may have split after we dropped the read lock
197  * but before we acquired a write lock. If it has, we may need to
198  * move right to its new sibling. Do that.
199  */
200  *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
201  }
202 
203  return stack_in;
204 }
void * palloc(Size size)
Definition: mcxt.c:1317
BTStackData * BTStack
Definition: nbtree.h:739
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:238
BlockNumber bts_blkno
Definition: nbtree.h:734
struct BTStackData * bts_parent
Definition: nbtree.h:736
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().
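
For orientation, a hedged sketch of the typical read-only call pattern, in the spirit of bt_rootdescend(): build an insertion scan key, descend with BT_READ, then release both the parent stack and the leaf buffer. leaf_block_for_tuple is a hypothetical name, and the use of _bt_mkscankey() and _bt_freestack() (both from nbtutils.c) is an assumption about the caller's context rather than anything this file mandates.

    /* Hedged sketch of a read-only descent to the leaf covering "itup". */
    static BlockNumber
    leaf_block_for_tuple(Relation rel, IndexTuple itup)
    {
        BTScanInsert key = _bt_mkscankey(rel, itup);
        Buffer       buf;
        BTStack      stack;
        BlockNumber  blkno = InvalidBlockNumber;

        /* heaprel may be NULL for BT_READ, per the asserts in _bt_search() */
        stack = _bt_search(rel, NULL, key, &buf, BT_READ);
        if (BufferIsValid(buf))
        {
            blkno = BufferGetBlockNumber(buf);  /* leaf that covers the key */
            _bt_relbuf(rel, buf);               /* drop read lock and pin */
        }

        _bt_freestack(stack);
        pfree(key);
        return blkno;
    }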

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 1981 of file nbtsearch.c.

1983 {
1984  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1985 
1986  Assert(BTreeTupleIsPosting(itup));
1987 
1988  currItem->heapTid = *heapTid;
1989  currItem->indexOffset = offnum;
1990  if (so->currTuples)
1991  {
1992  /* Save base IndexTuple (truncate posting list) */
1993  IndexTuple base;
1994  Size itupsz = BTreeTupleGetPostingOffset(itup);
1995 
1996  itupsz = MAXALIGN(itupsz);
1997  currItem->tupleOffset = so->currPos.nextTupleOffset;
1998  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1999  memcpy(base, itup, itupsz);
2000  /* Defensively reduce work area index tuple header size */
2001  base->t_info &= ~INDEX_SIZE_MASK;
2002  base->t_info |= itupsz;
2003  so->currPos.nextTupleOffset += itupsz;
2004 
2005  return currItem->tupleOffset;
2006  }
2007 
2008  return 0;
2009 }
#define INDEX_SIZE_MASK
Definition: itup.h:65
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
unsigned short t_info
Definition: itup.h:49

References Assert, BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, INDEX_SIZE_MASK, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().
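
The truncation trick above deserves a closer look: the saved "base" tuple is cut off at BTreeTupleGetPostingOffset() and its t_info size field is rewritten, so the copy reads as an ordinary tuple with no posting list, and every TID from the same posting list then shares that one copy through the returned tupleOffset. The fragment below (make_base_tuple is a hypothetical name) shows just the header-size rewrite on a palloc'd copy, as a hedged illustration.

    /* Hedged illustration of the base-tuple truncation. */
    static IndexTuple
    make_base_tuple(IndexTuple posting)
    {
        Size       itupsz;
        IndexTuple base;

        Assert(BTreeTupleIsPosting(posting));

        itupsz = MAXALIGN(BTreeTupleGetPostingOffset(posting));
        base = (IndexTuple) palloc(itupsz);
        memcpy(base, posting, itupsz);
        base->t_info &= ~INDEX_SIZE_MASK;   /* clear the old, larger size */
        base->t_info |= itupsz;             /* stamp in the truncated size */
        return base;
    }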

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2047 of file nbtsearch.c.

2048 {
2049  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2050  BlockNumber blkno = InvalidBlockNumber;
2051  bool status;
2052 
2053  Assert(BTScanPosIsValid(so->currPos));
2054 
2055  /* Before leaving current page, deal with any killed items */
2056  if (so->numKilled > 0)
2057  _bt_killitems(scan);
2058 
2059  /*
2060  * Before we modify currPos, make a copy of the page data if there was a
2061  * mark position that needs it.
2062  */
2063  if (so->markItemIndex >= 0)
2064  {
2065  /* bump pin on current buffer for assignment to mark buffer */
2066  if (BTScanPosIsPinned(so->currPos))
2067  IncrBufferRefCount(so->currPos.buf);
2068  memcpy(&so->markPos, &so->currPos,
2069  offsetof(BTScanPosData, items[1]) +
2070  so->currPos.lastItem * sizeof(BTScanPosItem));
2071  if (so->markTuples)
2072  memcpy(so->markTuples, so->currTuples,
2073  so->currPos.nextTupleOffset);
2074  so->markPos.itemIndex = so->markItemIndex;
2075  so->markItemIndex = -1;
2076 
2077  /*
2078  * If we're just about to start the next primitive index scan
2079  * (possible with a scan that has array keys, and needs to skip to
2080  * continue in the current scan direction), moreLeft/moreRight only
2081  * indicate the end of the current primitive index scan. They must
2082  * never be taken to indicate that the top-level index scan has ended
2083  * (that would be wrong).
2084  *
2085  * We could handle this case by treating the current array keys as
2086  * markPos state. But depending on the current array state like this
2087  * would add complexity. Instead, we just unset markPos's copy of
2088  * moreRight or moreLeft (whichever might be affected), while making
2089  * btrestpos reset the scan's arrays to their initial scan positions.
2090  * In effect, btrestpos leaves advancing the arrays up to the first
2091  * _bt_readpage call (that takes place after it has restored markPos).
2092  */
2093  Assert(so->markPos.dir == dir);
2094  if (so->needPrimScan)
2095  {
2096  if (ScanDirectionIsForward(dir))
2097  so->markPos.moreRight = true;
2098  else
2099  so->markPos.moreLeft = true;
2100  }
2101  }
2102 
2103  if (ScanDirectionIsForward(dir))
2104  {
2105  /* Walk right to the next page with data */
2106  if (scan->parallel_scan != NULL)
2107  {
2108  /*
2109  * Seize the scan to get the next block number; if the scan has
2110  * ended already, bail out.
2111  */
2112  status = _bt_parallel_seize(scan, &blkno, false);
2113  if (!status)
2114  {
2115  /* release the previous buffer, if pinned */
2118  return false;
2119  }
2120  }
2121  else
2122  {
2123  /* Not parallel, so use the previously-saved nextPage link. */
2124  blkno = so->currPos.nextPage;
2125  }
2126 
2127  /* Remember we left a page with data */
2128  so->currPos.moreLeft = true;
2129 
2130  /* release the previous buffer, if pinned */
2132  }
2133  else
2134  {
2135  /* Remember we left a page with data */
2136  so->currPos.moreRight = true;
2137 
2138  if (scan->parallel_scan != NULL)
2139  {
2140  /*
2141  * Seize the scan to get the current block number; if the scan has
2142  * ended already, bail out.
2143  */
2144  status = _bt_parallel_seize(scan, &blkno, false);
2146  if (!status)
2147  {
2149  return false;
2150  }
2151  }
2152  else
2153  {
2154  /* Not parallel, so just use our own notion of the current page */
2155  blkno = so->currPos.currPage;
2156  }
2157  }
2158 
2159  if (!_bt_readnextpage(scan, blkno, dir))
2160  return false;
2161 
2162  /* We have at least one item to return as scan's next item */
2164 
2165  return true;
2166 }
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4956
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:4154
char * markTuples
Definition: nbtree.h:1065
BTScanPosData markPos
Definition: nbtree.h:1078

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().
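
The size used for the markPos memcpy is worth spelling out: offsetof(BTScanPosData, items[1]) + lastItem * sizeof(BTScanPosItem) copies the fixed-size head of the struct plus exactly items[0..lastItem], and nothing past the last valid entry. A hedged restatement under a hypothetical name (btscanpos_copy_size):

    /* Hedged restatement of the memcpy sizing used when saving markPos. */
    static Size
    btscanpos_copy_size(BTScanPos sp)
    {
        /* offsetof(..., items[1]) already accounts for items[0] */
        return offsetof(BTScanPosData, items[1]) +
               sp->lastItem * sizeof(BTScanPosItem);
    }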

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf 
)
static

Definition at line 2385 of file nbtsearch.c.

2386 {
2387  Page page;
2388  BTPageOpaque opaque;
2389 
2390  page = BufferGetPage(buf);
2391  opaque = BTPageGetOpaque(page);
2392 
2393  for (;;)
2394  {
2395  BlockNumber obknum;
2396  BlockNumber lblkno;
2397  BlockNumber blkno;
2398  int tries;
2399 
2400  /* if we're at end of tree, release buf and return failure */
2401  if (P_LEFTMOST(opaque))
2402  {
2403  _bt_relbuf(rel, buf);
2404  break;
2405  }
2406  /* remember original page we are stepping left from */
2407  obknum = BufferGetBlockNumber(buf);
2408  /* step left */
2409  blkno = lblkno = opaque->btpo_prev;
2410  _bt_relbuf(rel, buf);
2411  /* check for interrupts while we're not holding any buffer lock */
2413  buf = _bt_getbuf(rel, blkno, BT_READ);
2414  page = BufferGetPage(buf);
2415  opaque = BTPageGetOpaque(page);
2416 
2417  /*
2418  * If this isn't the page we want, walk right till we find what we
2419  * want --- but go no more than four hops (an arbitrary limit). If we
2420  * don't find the correct page by then, the most likely bet is that
2421  * the original page got deleted and isn't in the sibling chain at all
2422  * anymore, not that its left sibling got split more than four times.
2423  *
2424  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2425  * because half-dead pages are still in the sibling chain.
2426  */
2427  tries = 0;
2428  for (;;)
2429  {
2430  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2431  {
2432  /* Found desired page, return it */
2433  return buf;
2434  }
2435  if (P_RIGHTMOST(opaque) || ++tries > 4)
2436  break;
2437  blkno = opaque->btpo_next;
2438  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2439  page = BufferGetPage(buf);
2440  opaque = BTPageGetOpaque(page);
2441  }
2442 
2443  /* Return to the original page to see what's up */
2444  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2445  page = BufferGetPage(buf);
2446  opaque = BTPageGetOpaque(page);
2447  if (P_ISDELETED(opaque))
2448  {
2449  /*
2450  * It was deleted. Move right to first nondeleted page (there
2451  * must be one); that is the page that has acquired the deleted
2452  * one's keyspace, so stepping left from it will take us where we
2453  * want to be.
2454  */
2455  for (;;)
2456  {
2457  if (P_RIGHTMOST(opaque))
2458  elog(ERROR, "fell off the end of index \"%s\"",
2459  RelationGetRelationName(rel));
2460  blkno = opaque->btpo_next;
2461  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2462  page = BufferGetPage(buf);
2463  opaque = BTPageGetOpaque(page);
2464  if (!P_ISDELETED(opaque))
2465  break;
2466  }
2467 
2468  /*
2469  * Now return to top of loop, resetting obknum to point to this
2470  * nondeleted page, and try again.
2471  */
2472  }
2473  else
2474  {
2475  /*
2476  * It wasn't deleted; the explanation had better be that the page
2477  * to the left got split or deleted. Without this check, we'd go
2478  * into an infinite loop if there's anything wrong.
2479  */
2480  if (opaque->btpo_prev == lblkno)
2481  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2482  obknum, RelationGetRelationName(rel));
2483  /* Okay to try again with new lblkno value */
2484  }
2485  }
2486 
2487  return InvalidBuffer;
2488 }
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BlockNumber btpo_prev
Definition: nbtree.h:64

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().
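
The core correctness check when stepping left is the one at the top of the inner loop: after landing on a candidate left sibling, its right link must still point back at the page we stepped left from; otherwise that page split (or was deleted) while we were in flight, and the function has to walk right or start over from the original page. The fragment below (left_step_landed_correctly is a hypothetical name) isolates that recheck as a hedged illustration.

    /* Hedged fragment of the post-step recheck in _bt_walk_left(). */
    static bool
    left_step_landed_correctly(Buffer buf, BlockNumber origblkno)
    {
        Page         page = BufferGetPage(buf);
        BTPageOpaque opaque = BTPageGetOpaque(page);

        /* deleted pages have left the sibling chain; their links don't count */
        return !P_ISDELETED(opaque) && opaque->btpo_next == origblkno;
    }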