PostgreSQL Source Code  git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static void _bt_returnitem (IndexScanDesc scan, BTScanOpaque so)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readfirstpage (IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
 
static Buffer _bt_lock_and_validate_left (Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 343 of file nbtsearch.c.

346 {
347  Page page;
348  BTPageOpaque opaque;
349  OffsetNumber low,
350  high;
351  int32 result,
352  cmpval;
353 
354  page = BufferGetPage(buf);
355  opaque = BTPageGetOpaque(page);
356 
357  /* Requesting nextkey semantics while using scantid seems nonsensical */
358  Assert(!key->nextkey || key->scantid == NULL);
359  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
360  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
361 
362  low = P_FIRSTDATAKEY(opaque);
363  high = PageGetMaxOffsetNumber(page);
364 
365  /*
366  * If there are no keys on the page, return the first available slot. Note
367  * this covers two cases: the page is really empty (no keys), or it
368  * contains only a high key. The latter case is possible after vacuuming.
369  * This can never happen on an internal page, however, since they are
370  * never empty (an internal page must have at least one child).
371  */
372  if (unlikely(high < low))
373  return low;
374 
375  /*
376  * Binary search to find the first key on the page >= scan key, or first
377  * key > scankey when nextkey is true.
378  *
379  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
380  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
381  *
382  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
383  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
384  *
385  * We can fall out when high == low.
386  */
387  high++; /* establish the loop invariant for high */
388 
389  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
390 
391  while (high > low)
392  {
393  OffsetNumber mid = low + ((high - low) / 2);
394 
395  /* We have low <= mid < high, so mid points at a real slot */
396 
397  result = _bt_compare(rel, key, page, mid);
398 
399  if (result >= cmpval)
400  low = mid + 1;
401  else
402  high = mid;
403  }
404 
405  /*
406  * At this point we have high == low.
407  *
408  * On a leaf page we always return the first non-pivot tuple >= scan key
409  * (resp. > scan key) for forward scan callers. For backward scans, it's
410  * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
411  */
412  if (P_ISLEAF(opaque))
413  {
414  /*
415  * In the backward scan case we're supposed to locate the last
416  * matching tuple on the leaf level -- not the first matching tuple
417  * (the last tuple will be the first one returned by the scan).
418  *
419  * At this point we've located the first non-pivot tuple immediately
420  * after the last matching tuple (which might just be maxoff + 1).
421  * Compensate by stepping back.
422  */
423  if (key->backward)
424  return OffsetNumberPrev(low);
425 
426  return low;
427  }
428 
429  /*
430  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
431  * There must be one if _bt_compare() is playing by the rules.
432  *
433  * _bt_compare() will seldom see any exactly-matching pivot tuples, since
434  * a truncated -inf heap TID is usually enough to prevent it altogether.
435  * Even omitted scan key entries are treated as > truncated attributes.
436  *
437  * However, during backward scans _bt_compare() interprets omitted scan
438  * key attributes as == corresponding truncated -inf attributes instead.
439  * This works just like < would work here. Under this scheme, < strategy
440  * backward scans will always directly descend to the correct leaf page.
441  * In particular, they will never incur an "extra" leaf page access with a
442  * scan key that happens to contain the same prefix of values as some
443  * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
444  * it uses a leaf page high key to "re-find" a page undergoing deletion.
445  */
446  Assert(low > P_FIRSTDATAKEY(opaque));
447 
448  return OffsetNumberPrev(low);
449 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
Pointer Page
Definition: bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:372
signed int int32
Definition: c.h:508
#define Assert(condition)
Definition: c.h:863
#define unlikely(x)
Definition: c.h:326
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:72

References _bt_compare(), Assert, BTPageGetOpaque, buf, BufferGetPage(), sort-test::key, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 474 of file nbtsearch.c.

475 {
476  BTScanInsert key = insertstate->itup_key;
477  Page page;
478  BTPageOpaque opaque;
479  OffsetNumber low,
480  high,
481  stricthigh;
482  int32 result,
483  cmpval;
484 
485  page = BufferGetPage(insertstate->buf);
486  opaque = BTPageGetOpaque(page);
487 
488  Assert(P_ISLEAF(opaque));
489  Assert(!key->nextkey);
490  Assert(insertstate->postingoff == 0);
491 
492  if (!insertstate->bounds_valid)
493  {
494  /* Start new binary search */
495  low = P_FIRSTDATAKEY(opaque);
496  high = PageGetMaxOffsetNumber(page);
497  }
498  else
499  {
500  /* Restore result of previous binary search against same page */
501  low = insertstate->low;
502  high = insertstate->stricthigh;
503  }
504 
505  /* If there are no keys on the page, return the first available slot */
506  if (unlikely(high < low))
507  {
508  /* Caller can't reuse bounds */
509  insertstate->low = InvalidOffsetNumber;
510  insertstate->stricthigh = InvalidOffsetNumber;
511  insertstate->bounds_valid = false;
512  return low;
513  }
514 
515  /*
516  * Binary search to find the first key on the page >= scan key. (nextkey
517  * is always false when inserting).
518  *
519  * The loop invariant is: all slots before 'low' are < scan key, all slots
520  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
521  * maintained to save additional search effort for caller.
522  *
523  * We can fall out when high == low.
524  */
525  if (!insertstate->bounds_valid)
526  high++; /* establish the loop invariant for high */
527  stricthigh = high; /* high initially strictly higher */
528 
529  cmpval = 1; /* !nextkey comparison value */
530 
531  while (high > low)
532  {
533  OffsetNumber mid = low + ((high - low) / 2);
534 
535  /* We have low <= mid < high, so mid points at a real slot */
536 
537  result = _bt_compare(rel, key, page, mid);
538 
539  if (result >= cmpval)
540  low = mid + 1;
541  else
542  {
543  high = mid;
544  if (result != 0)
545  stricthigh = high;
546  }
547 
548  /*
549  * If tuple at offset located by binary search is a posting list whose
550  * TID range overlaps with caller's scantid, perform posting list
551  * binary search to set postingoff for caller. Caller must split the
552  * posting list when postingoff is set. This should happen
553  * infrequently.
554  */
555  if (unlikely(result == 0 && key->scantid != NULL))
556  {
557  /*
558  * postingoff should never be set more than once per leaf page
559  * binary search. That would mean that there are duplicate table
560  * TIDs in the index, which is never okay. Check for that here.
561  */
562  if (insertstate->postingoff != 0)
563  ereport(ERROR,
564  (errcode(ERRCODE_INDEX_CORRUPTED),
565  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
566  ItemPointerGetBlockNumber(key->scantid),
567  ItemPointerGetOffsetNumber(key->scantid),
568  low, stricthigh,
569  BufferGetBlockNumber(insertstate->buf),
570  RelationGetRelationName(rel))));
571 
572  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
573  }
574  }
575 
576  /*
577  * On a leaf page, a binary search always returns the first key >= scan
578  * key (at least in !nextkey case), which could be the last slot + 1. This
579  * is also the lower bound of cached search.
580  *
581  * stricthigh may also be the last slot + 1, which prevents caller from
582  * using bounds directly, but is still useful to us if we're called a
583  * second time with cached bounds (cached low will be < stricthigh when
584  * that happens).
585  */
586  insertstate->low = low;
587  insertstate->stricthigh = stricthigh;
588  insertstate->bounds_valid = true;
589 
590  return low;
591 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3724
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode(int sqlerrcode)
Definition: elog.c:853
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:602
#define InvalidOffsetNumber
Definition: off.h:26
#define RelationGetRelationName(relation)
Definition: rel.h:539
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 602 of file nbtsearch.c.

603 {
604  IndexTuple itup;
605  ItemId itemid;
606  int low,
607  high,
608  mid,
609  res;
610 
611  /*
612  * If this isn't a posting tuple, then the index must be corrupt (if it is
613  * an ordinary non-pivot tuple then there must be an existing tuple with a
614  * heap TID that equals inserter's new heap TID/scantid). Defensively
615  * check that tuple is a posting list tuple whose posting list range
616  * includes caller's scantid.
617  *
618  * (This is also needed because contrib/amcheck's rootdescend option needs
619  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
620  */
621  itemid = PageGetItemId(page, offnum);
622  itup = (IndexTuple) PageGetItem(page, itemid);
623  if (!BTreeTupleIsPosting(itup))
624  return 0;
625 
626  Assert(key->heapkeyspace && key->allequalimage);
627 
628  /*
629  * In the event that posting list tuple has LP_DEAD bit set, indicate this
630  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
631  * second call to _bt_binsrch_insert() can take place when its caller has
632  * removed the dead item.
633  */
634  if (ItemIdIsDead(itemid))
635  return -1;
636 
637  /* "high" is past end of posting list for loop invariant */
638  low = 0;
639  high = BTreeTupleGetNPosting(itup);
640  Assert(high >= 2);
641 
642  while (high > low)
643  {
644  mid = low + ((high - low) / 2);
645  res = ItemPointerCompare(key->scantid,
646  BTreeTupleGetPostingN(itup, mid));
647 
648  if (res > 0)
649  low = mid + 1;
650  else if (res < 0)
651  high = mid;
652  else
653  return mid;
654  }
655 
656  /* Exact match not found */
657  return low;
658 }
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), sort-test::key, PageGetItem(), PageGetItemId(), and res.

Referenced by _bt_binsrch_insert().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 688 of file nbtsearch.c.

692 {
693  TupleDesc itupdesc = RelationGetDescr(rel);
694  BTPageOpaque opaque = BTPageGetOpaque(page);
695  IndexTuple itup;
696  ItemPointer heapTid;
697  ScanKey scankey;
698  int ncmpkey;
699  int ntupatts;
700  int32 result;
701 
702  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
703  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
704  Assert(key->heapkeyspace || key->scantid == NULL);
705 
706  /*
707  * Force result ">" if target item is first data item on an internal page
708  * --- see NOTE above.
709  */
710  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
711  return 1;
712 
713  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
714  ntupatts = BTreeTupleGetNAtts(itup, rel);
715 
716  /*
717  * The scan key is set up with the attribute number associated with each
718  * term in the key. It is important that, if the index is multi-key, the
719  * scan contain the first k key attributes, and that they be in order. If
720  * you think about how multi-key ordering works, you'll understand why
721  * this is.
722  *
723  * We don't test for violation of this condition here, however. The
724  * initial setup for the index scan had better have gotten it right (see
725  * _bt_first).
726  */
727 
728  ncmpkey = Min(ntupatts, key->keysz);
729  Assert(key->heapkeyspace || ncmpkey == key->keysz);
730  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
731  scankey = key->scankeys;
732  for (int i = 1; i <= ncmpkey; i++)
733  {
734  Datum datum;
735  bool isNull;
736 
737  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
738 
739  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
740  {
741  if (isNull)
742  result = 0; /* NULL "=" NULL */
743  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
744  result = -1; /* NULL "<" NOT_NULL */
745  else
746  result = 1; /* NULL ">" NOT_NULL */
747  }
748  else if (isNull) /* key is NOT_NULL and item is NULL */
749  {
750  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
751  result = 1; /* NOT_NULL ">" NULL */
752  else
753  result = -1; /* NOT_NULL "<" NULL */
754  }
755  else
756  {
757  /*
758  * The sk_func needs to be passed the index value as left arg and
759  * the sk_argument as right arg (they might be of different
760  * types). Since it is convenient for callers to think of
761  * _bt_compare as comparing the scankey to the index item, we have
762  * to flip the sign of the comparison result. (Unless it's a DESC
763  * column, in which case we *don't* flip the sign.)
764  */
765  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
766  scankey->sk_collation,
767  datum,
768  scankey->sk_argument));
769 
770  if (!(scankey->sk_flags & SK_BT_DESC))
771  INVERT_COMPARE_RESULT(result);
772  }
773 
774  /* if the keys are unequal, return the difference */
775  if (result != 0)
776  return result;
777 
778  scankey++;
779  }
780 
781  /*
782  * All non-truncated attributes (other than heap TID) were found to be
783  * equal. Treat truncated attributes as minus infinity when scankey has a
784  * key attribute value that would otherwise be compared directly.
785  *
786  * Note: it doesn't matter if ntupatts includes non-key attributes;
787  * scankey won't, so explicitly excluding non-key attributes isn't
788  * necessary.
789  */
790  if (key->keysz > ntupatts)
791  return 1;
792 
793  /*
794  * Use the heap TID attribute and scantid to try to break the tie. The
795  * rules are the same as any other key attribute -- only the
796  * representation differs.
797  */
798  heapTid = BTreeTupleGetHeapTID(itup);
799  if (key->scantid == NULL)
800  {
801  /*
802  * Forward scans have a scankey that is considered greater than a
803  * truncated pivot tuple if and when the scankey has equal values for
804  * attributes up to and including the least significant untruncated
805  * attribute in tuple. Even attributes that were omitted from the
806  * scan key are considered greater than -inf truncated attributes.
807  * (See _bt_binsrch for an explanation of our backward scan behavior.)
808  *
809  * For example, if an index has the minimum two attributes (single
810  * user key attribute, plus heap TID attribute), and a page's high key
811  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
812  * will not descend to the page to the left. The search will descend
813  * right instead. The truncated attribute in pivot tuple means that
814  * all non-pivot tuples on the page to the left are strictly < 'foo',
815  * so it isn't necessary to descend left. In other words, search
816  * doesn't have to descend left because it isn't interested in a match
817  * that has a heap TID value of -inf.
818  *
819  * Note: the heap TID part of the test ensures that scankey is being
820  * compared to a pivot tuple with one or more truncated -inf key
821  * attributes. The heap TID attribute is the last key attribute in
822  * every index, of course, but other than that it isn't special.
823  */
824  if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
825  key->heapkeyspace)
826  return 1;
827 
828  /* All provided scankey arguments found to be equal */
829  return 0;
830  }
831 
832  /*
833  * Treat truncated heap TID as minus infinity, since scankey has a key
834  * attribute value (scantid) that would otherwise be compared directly
835  */
836  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
837  if (heapTid == NULL)
838  return 1;
839 
840  /*
841  * Scankey must be treated as equal to a posting list tuple if its scantid
842  * value falls within the range of the posting list. In all other cases
843  * there can only be a single heap TID value, which is compared directly
844  * with scantid.
845  */
846  Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
847  result = ItemPointerCompare(key->scantid, heapTid);
848  if (result <= 0 || !BTreeTupleIsPosting(itup))
849  return result;
850  else
851  {
852  result = ItemPointerCompare(key->scantid,
853  BTreeTupleGetMaxHeapTID(itup));
854  if (result > 0)
855  return 1;
856  }
857 
858  return 0;
859 }
#define Min(x, y)
Definition: c.h:1009
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1111
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int i
Definition: isn.c:72
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1118
#define SK_BT_DESC
Definition: nbtree.h:1117
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4924
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define RelationGetDescr(relation)
Definition: rel.h:531
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 67 of file nbtsearch.c.

68 {
69  _bt_unlockbuf(scan->indexRelation, sp->buf);
70 
71  if (IsMVCCSnapshot(scan->xs_snapshot) &&
72  RelationNeedsWAL(scan->indexRelation) &&
73  !scan->xs_want_itup)
74  {
75  ReleaseBuffer(sp->buf);
76  sp->buf = InvalidBuffer;
77  }
78 }
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4924
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
Buffer buf
Definition: nbtree.h:953
Relation indexRelation
Definition: relscan.h:141
struct SnapshotData * xs_snapshot
Definition: relscan.h:142

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2547 of file nbtsearch.c.

2548 {
2549  Relation rel = scan->indexRelation;
2550  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2551  Page page;
2552  BTPageOpaque opaque;
2553  OffsetNumber start;
2554 
2555  Assert(!BTScanPosIsValid(so->currPos));
2556 
2557  /*
2558  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2559  * version of _bt_search().
2560  */
2561  so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2562 
2563  if (!BufferIsValid(so->currPos.buf))
2564  {
2565  /*
2566  * Empty index. Lock the whole relation, as nothing finer to lock
2567  * exists.
2568  */
2569  PredicateLockRelation(rel, scan->xs_snapshot);
2570  _bt_parallel_done(scan);
2571  return false;
2572  }
2573 
2574  page = BufferGetPage(so->currPos.buf);
2575  opaque = BTPageGetOpaque(page);
2576  Assert(P_ISLEAF(opaque));
2577 
2578  if (ScanDirectionIsForward(dir))
2579  {
2580  /* There could be dead pages to the left, so not this: */
2581  /* Assert(P_LEFTMOST(opaque)); */
2582 
2583  start = P_FIRSTDATAKEY(opaque);
2584  }
2585  else if (ScanDirectionIsBackward(dir))
2586  {
2587  Assert(P_RIGHTMOST(opaque));
2588 
2589  start = PageGetMaxOffsetNumber(page);
2590  }
2591  else
2592  {
2593  elog(ERROR, "invalid scan direction: %d", (int) dir);
2594  start = 0; /* keep compiler quiet */
2595  }
2596 
2597  /*
2598  * Now load data from the first page of the scan.
2599  */
2600  if (!_bt_readfirstpage(scan, start, dir))
2601  return false;
2602 
2603  _bt_returnitem(scan, so);
2604  return true;
2605 }
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
#define elog(elevel,...)
Definition: elog.h:225
return str start
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:774
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1010
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1073
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2464
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2127
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:1998
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2566
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
BTScanPosData currPos
Definition: nbtree.h:1069

References _bt_get_endpoint(), _bt_parallel_done(), _bt_readfirstpage(), _bt_returnitem(), Assert, BTPageGetOpaque, BTScanPosIsValid, BTScanPosData::buf, BufferGetPage(), BufferIsValid(), BTScanOpaqueData::currPos, elog, ERROR, IndexScanDescData::indexRelation, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, start, and IndexScanDescData::xs_snapshot.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 882 of file nbtsearch.c.

883 {
884  Relation rel = scan->indexRelation;
885  BTScanOpaque so = (BTScanOpaque) scan->opaque;
886  BTStack stack;
887  OffsetNumber offnum;
888  BTScanInsertData inskey;
889  ScanKey startKeys[INDEX_MAX_KEYS];
890  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891  int keysz = 0;
892  StrategyNumber strat_total;
893 
895 
896  /*
897  * Examine the scan keys and eliminate any redundant keys; also mark the
898  * keys that must be matched to continue the scan.
899  */
900  _bt_preprocess_keys(scan);
901 
902  /*
903  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
904  * never be satisfied (eg, x == 1 AND x > 2).
905  */
906  if (!so->qual_ok)
907  {
908  _bt_parallel_done(scan);
909  return false;
910  }
911 
912  /*
913  * For parallel scans, get the starting page from shared state. If the
914  * scan has not started, proceed to find out first leaf page in the usual
915  * way while keeping other participating processes waiting. If the scan
916  * has already begun, use the page number from the shared structure.
917  *
918  * When a parallel scan has another primitive index scan scheduled, a
919  * parallel worker will seize the scan for that purpose now. This is
920  * similar to the case where the top-level scan hasn't started.
921  */
922  if (scan->parallel_scan != NULL)
923  {
924  BlockNumber blkno,
925  lastcurrblkno;
926 
927  if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
928  return false;
929 
930  /*
931  * Successfully seized the scan, which _bt_readfirstpage or possibly
932  * _bt_readnextpage will release (unless the scan ends right away, in
933  * which case we'll call _bt_parallel_done directly).
934  *
935  * Initialize arrays (when _bt_parallel_seize didn't already set up
936  * the next primitive index scan).
937  */
938  if (so->numArrayKeys && !so->needPrimScan)
939  _bt_start_array_keys(scan, dir);
940 
941  Assert(blkno != P_NONE);
942  if (blkno != InvalidBlockNumber)
943  {
944  Assert(!so->needPrimScan);
945 
946  /*
947  * We anticipated starting another primitive scan, but some other
948  * worker bet us to it
949  */
950  if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
951  return false;
952 
953  _bt_returnitem(scan, so);
954  return true;
955  }
956  }
957  else if (so->numArrayKeys && !so->needPrimScan)
958  {
959  /*
960  * First _bt_first call (for current btrescan) without parallelism.
961  *
962  * Initialize arrays, and the corresponding scan keys that were just
963  * output by _bt_preprocess_keys.
964  */
965  _bt_start_array_keys(scan, dir);
966  }
967 
968  /*
969  * Count an indexscan for stats, now that we know that we'll call
970  * _bt_search/_bt_endpoint below
971  */
973 
974  /*----------
975  * Examine the scan keys to discover where we need to start the scan.
976  *
977  * We want to identify the keys that can be used as starting boundaries;
978  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
979  * a backwards scan. We can use keys for multiple attributes so long as
980  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
981  * a > or < boundary or find an attribute with no boundary (which can be
982  * thought of as the same as "> -infinity"), we can't use keys for any
983  * attributes to its right, because it would break our simplistic notion
984  * of what initial positioning strategy to use.
985  *
986  * When the scan keys include cross-type operators, _bt_preprocess_keys
987  * may not be able to eliminate redundant keys; in such cases we will
988  * arbitrarily pick a usable one for each attribute. This is correct
989  * but possibly not optimal behavior. (For example, with keys like
990  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
991  * x=5 would be more efficient.) Since the situation only arises given
992  * a poorly-worded query plus an incomplete opfamily, live with it.
993  *
994  * When both equality and inequality keys appear for a single attribute
995  * (again, only possible when cross-type operators appear), we *must*
996  * select one of the equality keys for the starting point, because
997  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
998  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
999  * start at x=4, we will fail and stop before reaching x=10. If multiple
1000  * equality quals survive preprocessing, however, it doesn't matter which
1001  * one we use --- by definition, they are either redundant or
1002  * contradictory.
1003  *
1004  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
1005  * If the index stores nulls at the end of the index we'll be starting
1006  * from, and we have no boundary key for the column (which means the key
1007  * we deduced NOT NULL from is an inequality key that constrains the other
1008  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1009  * use as a boundary key. If we didn't do this, we might find ourselves
1010  * traversing a lot of null entries at the start of the scan.
1011  *
1012  * In this loop, row-comparison keys are treated the same as keys on their
1013  * first (leftmost) columns. We'll add on lower-order columns of the row
1014  * comparison below, if possible.
1015  *
1016  * The selected scan keys (at most one per index column) are remembered by
1017  * storing their addresses into the local startKeys[] array.
1018  *
1019  * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1020  * the next primitive index scan (for scans with array keys) based in part
1021  * on an understanding of how it'll enable us to reposition the scan.
1022  * They're directly aware of how we'll sometimes cons up an explicit
1023  * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1024  * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1025  * scans to skip large groups of NULLs through repeated deductions about
1026  * key strictness (for a required inequality key) and whether NULLs in the
1027  * key's index column are stored last or first (relative to non-NULLs).
1028  * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1029  * need to be kept in sync.
1030  *----------
1031  */
1032  strat_total = BTEqualStrategyNumber;
1033  if (so->numberOfKeys > 0)
1034  {
1035  AttrNumber curattr;
1036  ScanKey chosen;
1037  ScanKey impliesNN;
1038  ScanKey cur;
1039 
1040  /*
1041  * chosen is the so-far-chosen key for the current attribute, if any.
1042  * We don't cast the decision in stone until we reach keys for the
1043  * next attribute.
1044  */
1045  cur = so->keyData;
1046  curattr = 1;
1047  chosen = NULL;
1048  /* Also remember any scankey that implies a NOT NULL constraint */
1049  impliesNN = NULL;
1050 
1051  /*
1052  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1053  * pass to handle after-last-key processing. Actual exit from the
1054  * loop is at one of the "break" statements below.
1055  */
1056  for (int i = 0;; cur++, i++)
1057  {
1058  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1059  {
1060  /*
1061  * Done looking at keys for curattr. If we didn't find a
1062  * usable boundary key, see if we can deduce a NOT NULL key.
1063  */
1064  if (chosen == NULL && impliesNN != NULL &&
1065  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1066  ScanDirectionIsForward(dir) :
1068  {
1069  /* Yes, so build the key in notnullkeys[keysz] */
1070  chosen = &notnullkeys[keysz];
1071  ScanKeyEntryInitialize(chosen,
1073  (impliesNN->sk_flags &
1075  curattr,
1076  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1079  InvalidOid,
1080  InvalidOid,
1081  InvalidOid,
1082  (Datum) 0);
1083  }
1084 
1085  /*
1086  * If we still didn't find a usable boundary key, quit; else
1087  * save the boundary key pointer in startKeys.
1088  */
1089  if (chosen == NULL)
1090  break;
1091  startKeys[keysz++] = chosen;
1092 
1093  /* Quit if we have stored a > or < key */
1094  strat_total = chosen->sk_strategy;
1095  if (strat_total == BTGreaterStrategyNumber ||
1096  strat_total == BTLessStrategyNumber)
1097  break;
1098 
1099  /*
1100  * Done if that was the last attribute, or if next key is not
1101  * in sequence (implying no boundary key is available for the
1102  * next attribute).
1103  */
1104  if (i >= so->numberOfKeys ||
1105  cur->sk_attno != curattr + 1)
1106  break;
1107 
1108  /*
1109  * Reset for next attr.
1110  */
1111  curattr = cur->sk_attno;
1112  chosen = NULL;
1113  impliesNN = NULL;
1114  }
1115 
1116  /*
1117  * Can we use this key as a starting boundary for this attr?
1118  *
1119  * If not, does it imply a NOT NULL constraint? (Because
1120  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1121  * *any* inequality key works for that; we need not test.)
1122  */
1123  switch (cur->sk_strategy)
1124  {
1125  case BTLessStrategyNumber:
1127  if (chosen == NULL)
1128  {
1129  if (ScanDirectionIsBackward(dir))
1130  chosen = cur;
1131  else
1132  impliesNN = cur;
1133  }
1134  break;
1135  case BTEqualStrategyNumber:
1136  /* override any non-equality choice */
1137  chosen = cur;
1138  break;
1141  if (chosen == NULL)
1142  {
1143  if (ScanDirectionIsForward(dir))
1144  chosen = cur;
1145  else
1146  impliesNN = cur;
1147  }
1148  break;
1149  }
1150  }
1151  }
1152 
1153  /*
1154  * If we found no usable boundary keys, we have to start from one end of
1155  * the tree. Walk down that edge to the first or last key, and scan from
1156  * there.
1157  *
1158  * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1159  */
1160  if (keysz == 0)
1161  return _bt_endpoint(scan, dir);
1162 
1163  /*
1164  * We want to start the scan somewhere within the index. Set up an
1165  * insertion scankey we can use to search for the boundary point we
1166  * identified above. The insertion scankey is built using the keys
1167  * identified by startKeys[]. (Remaining insertion scankey fields are
1168  * initialized after initial-positioning scan keys are finalized.)
1169  */
1170  Assert(keysz <= INDEX_MAX_KEYS);
1171  for (int i = 0; i < keysz; i++)
1172  {
1173  ScanKey cur = startKeys[i];
1174 
1175  Assert(cur->sk_attno == i + 1);
1176 
1177  if (cur->sk_flags & SK_ROW_HEADER)
1178  {
1179  /*
1180  * Row comparison header: look to the first row member instead.
1181  *
1182  * The member scankeys are already in insertion format (ie, they
1183  * have sk_func = 3-way-comparison function), but we have to watch
1184  * out for nulls, which _bt_preprocess_keys didn't check. A null
1185  * in the first row member makes the condition unmatchable, just
1186  * like qual_ok = false.
1187  */
1188  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1189 
1190  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1191  if (subkey->sk_flags & SK_ISNULL)
1192  {
1193  _bt_parallel_done(scan);
1194  return false;
1195  }
1196  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1197 
1198  /*
1199  * If the row comparison is the last positioning key we accepted,
1200  * try to add additional keys from the lower-order row members.
1201  * (If we accepted independent conditions on additional index
1202  * columns, we use those instead --- doesn't seem worth trying to
1203  * determine which is more restrictive.) Note that this is OK
1204  * even if the row comparison is of ">" or "<" type, because the
1205  * condition applied to all but the last row member is effectively
1206  * ">=" or "<=", and so the extra keys don't break the positioning
1207  * scheme. But, by the same token, if we aren't able to use all
1208  * the row members, then the part of the row comparison that we
1209  * did use has to be treated as just a ">=" or "<=" condition, and
1210  * so we'd better adjust strat_total accordingly.
1211  */
1212  if (i == keysz - 1)
1213  {
1214  bool used_all_subkeys = false;
1215 
1216  Assert(!(subkey->sk_flags & SK_ROW_END));
1217  for (;;)
1218  {
1219  subkey++;
1220  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1221  if (subkey->sk_attno != keysz + 1)
1222  break; /* out-of-sequence, can't use it */
1223  if (subkey->sk_strategy != cur->sk_strategy)
1224  break; /* wrong direction, can't use it */
1225  if (subkey->sk_flags & SK_ISNULL)
1226  break; /* can't use null keys */
1227  Assert(keysz < INDEX_MAX_KEYS);
1228  memcpy(inskey.scankeys + keysz, subkey,
1229  sizeof(ScanKeyData));
1230  keysz++;
1231  if (subkey->sk_flags & SK_ROW_END)
1232  {
1233  used_all_subkeys = true;
1234  break;
1235  }
1236  }
1237  if (!used_all_subkeys)
1238  {
1239  switch (strat_total)
1240  {
1241  case BTLessStrategyNumber:
1242  strat_total = BTLessEqualStrategyNumber;
1243  break;
1245  strat_total = BTGreaterEqualStrategyNumber;
1246  break;
1247  }
1248  }
1249  break; /* done with outer loop */
1250  }
1251  }
1252  else
1253  {
1254  /*
1255  * Ordinary comparison key. Transform the search-style scan key
1256  * to an insertion scan key by replacing the sk_func with the
1257  * appropriate btree comparison function.
1258  *
1259  * If scankey operator is not a cross-type comparison, we can use
1260  * the cached comparison function; otherwise gotta look it up in
1261  * the catalogs. (That can't lead to infinite recursion, since no
1262  * indexscan initiated by syscache lookup will use cross-data-type
1263  * operators.)
1264  *
1265  * We support the convention that sk_subtype == InvalidOid means
1266  * the opclass input type; this is a hack to simplify life for
1267  * ScanKeyInit().
1268  */
1269  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1270  cur->sk_subtype == InvalidOid)
1271  {
1272  FmgrInfo *procinfo;
1273 
1274  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1275  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1276  cur->sk_flags,
1277  cur->sk_attno,
1279  cur->sk_subtype,
1280  cur->sk_collation,
1281  procinfo,
1282  cur->sk_argument);
1283  }
1284  else
1285  {
1286  RegProcedure cmp_proc;
1287 
1288  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1289  rel->rd_opcintype[i],
1290  cur->sk_subtype,
1291  BTORDER_PROC);
1292  if (!RegProcedureIsValid(cmp_proc))
1293  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1294  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1295  cur->sk_attno, RelationGetRelationName(rel));
1296  ScanKeyEntryInitialize(inskey.scankeys + i,
1297  cur->sk_flags,
1298  cur->sk_attno,
1300  cur->sk_subtype,
1301  cur->sk_collation,
1302  cmp_proc,
1303  cur->sk_argument);
1304  }
1305  }
1306  }
1307 
1308  /*----------
1309  * Examine the selected initial-positioning strategy to determine exactly
1310  * where we need to start the scan, and set flag variables to control the
1311  * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1312  * page _bt_search returns).
1313  *----------
1314  */
1315  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1316  inskey.anynullkeys = false; /* unused */
1317  inskey.scantid = NULL;
1318  inskey.keysz = keysz;
1319  switch (strat_total)
1320  {
1321  case BTLessStrategyNumber:
1322 
1323  inskey.nextkey = false;
1324  inskey.backward = true;
1325  break;
1326 
1328 
1329  inskey.nextkey = true;
1330  inskey.backward = true;
1331  break;
1332 
1333  case BTEqualStrategyNumber:
1334 
1335  /*
1336  * If a backward scan was specified, need to start with last equal
1337  * item not first one.
1338  */
1339  if (ScanDirectionIsBackward(dir))
1340  {
1341  /*
1342  * This is the same as the <= strategy
1343  */
1344  inskey.nextkey = true;
1345  inskey.backward = true;
1346  }
1347  else
1348  {
1349  /*
1350  * This is the same as the >= strategy
1351  */
1352  inskey.nextkey = false;
1353  inskey.backward = false;
1354  }
1355  break;
1356 
1358 
1359  /*
1360  * Find first item >= scankey
1361  */
1362  inskey.nextkey = false;
1363  inskey.backward = false;
1364  break;
1365 
1367 
1368  /*
1369  * Find first item > scankey
1370  */
1371  inskey.nextkey = true;
1372  inskey.backward = false;
1373  break;
1374 
1375  default:
1376  /* can't get here, but keep compiler quiet */
1377  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1378  return false;
1379  }
1380 
1381  /*
1382  * Use the manufactured insertion scan key to descend the tree and
1383  * position ourselves on the target leaf page.
1384  */
1385  Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1386  stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1387 
1388  /* don't need to keep the stack around... */
1389  _bt_freestack(stack);
1390 
1391  if (!BufferIsValid(so->currPos.buf))
1392  {
1393  /*
1394  * We only get here if the index is completely empty. Lock relation
1395  * because nothing finer to lock exists. Without a buffer lock, it's
1396  * possible for another transaction to insert data between
1397  * _bt_search() and PredicateLockRelation(). We have to try again
1398  * after taking the relation-level predicate lock, to close a narrow
1399  * window where we wouldn't scan concurrently inserted tuples, but the
1400  * writer wouldn't see our predicate lock.
1401  */
1403  {
1404  PredicateLockRelation(rel, scan->xs_snapshot);
1405  stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1406  _bt_freestack(stack);
1407  }
1408 
1409  if (!BufferIsValid(so->currPos.buf))
1410  {
1411  _bt_parallel_done(scan);
1412  return false;
1413  }
1414  }
1415 
1416  /* position to the precise item on the page */
1417  offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1418 
1419  /*
1420  * Now load data from the first page of the scan (usually the page
1421  * currently in so->currPos.buf).
1422  *
1423  * If inskey.nextkey = false and inskey.backward = false, offnum is
1424  * positioned at the first non-pivot tuple >= inskey.scankeys.
1425  *
1426  * If inskey.nextkey = false and inskey.backward = true, offnum is
1427  * positioned at the last non-pivot tuple < inskey.scankeys.
1428  *
1429  * If inskey.nextkey = true and inskey.backward = false, offnum is
1430  * positioned at the first non-pivot tuple > inskey.scankeys.
1431  *
1432  * If inskey.nextkey = true and inskey.backward = true, offnum is
1433  * positioned at the last non-pivot tuple <= inskey.scankeys.
1434  *
1435  * It's possible that _bt_binsrch returned an offnum that is out of bounds
1436  * for the page. For example, when inskey is both < the leaf page's high
1437  * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1438  */
1439  if (!_bt_readfirstpage(scan, offnum, dir))
1440  return false;
1441 
1442  _bt_returnitem(scan, so);
1443  return true;
1444 }
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define RegProcedureIsValid(p)
Definition: c.h:782
regproc RegProcedure
Definition: c.h:655
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:796
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:605
#define BTORDER_PROC
Definition: nbtree.h:707
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2213
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2547
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:221
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1352
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:2530
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:664
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
bool needPrimScan
Definition: nbtree.h:1040
ScanKey keyData
Definition: nbtree.h:1036
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:189
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BTScanPosData::buf, BufferIsValid(), cur, BTScanOpaqueData::currPos, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanOpaqueData::keyData, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2464 of file nbtsearch.c.

2465 {
2466  Buffer buf;
2467  Page page;
2468  BTPageOpaque opaque;
2469  OffsetNumber offnum;
2470  BlockNumber blkno;
2471  IndexTuple itup;
2472 
2473  /*
2474  * If we are looking for a leaf page, okay to descend from fast root;
2475  * otherwise better descend from true root. (There is no point in being
2476  * smarter about intermediate levels.)
2477  */
2478  if (level == 0)
2479  buf = _bt_getroot(rel, NULL, BT_READ);
2480  else
2481  buf = _bt_gettrueroot(rel);
2482 
2483  if (!BufferIsValid(buf))
2484  return InvalidBuffer;
2485 
2486  page = BufferGetPage(buf);
2487  opaque = BTPageGetOpaque(page);
2488 
2489  for (;;)
2490  {
2491  /*
2492  * If we landed on a deleted page, step right to find a live page
2493  * (there must be one). Also, if we want the rightmost page, step
2494  * right if needed to get to it (this could happen if the page split
2495  * since we obtained a pointer to it).
2496  */
2497  while (P_IGNORE(opaque) ||
2498  (rightmost && !P_RIGHTMOST(opaque)))
2499  {
2500  blkno = opaque->btpo_next;
2501  if (blkno == P_NONE)
2502  elog(ERROR, "fell off the end of index \"%s\"",
2504  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2505  page = BufferGetPage(buf);
2506  opaque = BTPageGetOpaque(page);
2507  }
2508 
2509  /* Done? */
2510  if (opaque->btpo_level == level)
2511  break;
2512  if (opaque->btpo_level < level)
2513  ereport(ERROR,
2514  (errcode(ERRCODE_INDEX_CORRUPTED),
2515  errmsg_internal("btree level %u not found in index \"%s\"",
2516  level, RelationGetRelationName(rel))));
2517 
2518  /* Descend to leftmost or rightmost child page */
2519  if (rightmost)
2520  offnum = PageGetMaxOffsetNumber(page);
2521  else
2522  offnum = P_FIRSTDATAKEY(opaque);
2523 
2524  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2525  blkno = BTreeTupleGetDownLink(itup);
2526 
2527  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2528  page = BufferGetPage(buf);
2529  opaque = BTPageGetOpaque(page);
2530  }
2531 
2532  return buf;
2533 }
int Buffer
Definition: buf.h:23
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
BlockNumber btpo_next
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_lock_and_validate_left()

static Buffer _bt_lock_and_validate_left ( Relation  rel,
BlockNumber blkno,
BlockNumber  lastcurrblkno 
)
static

Definition at line 2347 of file nbtsearch.c.

2349 {
2350  BlockNumber origblkno = *blkno; /* detects circular links */
2351 
2352  for (;;)
2353  {
2354  Buffer buf;
2355  Page page;
2356  BTPageOpaque opaque;
2357  int tries;
2358 
2359  /* check for interrupts while we're not holding any buffer lock */
2361  buf = _bt_getbuf(rel, *blkno, BT_READ);
2362  page = BufferGetPage(buf);
2363  opaque = BTPageGetOpaque(page);
2364 
2365  /*
2366  * If this isn't the page we want, walk right till we find what we
2367  * want --- but go no more than four hops (an arbitrary limit). If we
2368  * don't find the correct page by then, the most likely bet is that
2369  * lastcurrblkno got deleted and isn't in the sibling chain at all
2370  * anymore, not that its left sibling got split more than four times.
2371  *
2372  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2373  * because half-dead pages are still in the sibling chain.
2374  */
2375  tries = 0;
2376  for (;;)
2377  {
2378  if (likely(!P_ISDELETED(opaque) &&
2379  opaque->btpo_next == lastcurrblkno))
2380  {
2381  /* Found desired page, return it */
2382  return buf;
2383  }
2384  if (P_RIGHTMOST(opaque) || ++tries > 4)
2385  break;
2386  /* step right */
2387  *blkno = opaque->btpo_next;
2388  buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2389  page = BufferGetPage(buf);
2390  opaque = BTPageGetOpaque(page);
2391  }
2392 
2393  /*
2394  * Return to the original page (usually the page most recently read by
2395  * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2396  * what's up with its prev sibling link
2397  */
2398  buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2399  page = BufferGetPage(buf);
2400  opaque = BTPageGetOpaque(page);
2401  if (P_ISDELETED(opaque))
2402  {
2403  /*
2404  * It was deleted. Move right to first nondeleted page (there
2405  * must be one); that is the page that has acquired the deleted
2406  * one's keyspace, so stepping left from it will take us where we
2407  * want to be.
2408  */
2409  for (;;)
2410  {
2411  if (P_RIGHTMOST(opaque))
2412  elog(ERROR, "fell off the end of index \"%s\"",
2414  lastcurrblkno = opaque->btpo_next;
2415  buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2416  page = BufferGetPage(buf);
2417  opaque = BTPageGetOpaque(page);
2418  if (!P_ISDELETED(opaque))
2419  break;
2420  }
2421  }
2422  else
2423  {
2424  /*
2425  * Original lastcurrblkno wasn't deleted; the explanation had
2426  * better be that the page to the left got split or deleted.
2427  * Without this check, we risk going into an infinite loop.
2428  */
2429  if (opaque->btpo_prev == origblkno)
2430  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2431  lastcurrblkno, RelationGetRelationName(rel));
2432  /* Okay to try again, since left sibling link changed */
2433  }
2434 
2435  /*
2436  * Original lastcurrblkno from caller was concurrently deleted (could
2437  * also have been a great many concurrent left sibling page splits).
2438  * Found a non-deleted page that should now act as our lastcurrblkno.
2439  */
2440  if (P_LEFTMOST(opaque))
2441  {
2442  /* New lastcurrblkno has no left sibling (concurrently deleted) */
2443  _bt_relbuf(rel, buf);
2444  break;
2445  }
2446 
2447  /* Start from scratch with new lastcurrblkno's blkno/prev link */
2448  *blkno = origblkno = opaque->btpo_prev;
2449  _bt_relbuf(rel, buf);
2450  }
2451 
2452  return InvalidBuffer;
2453 }
#define likely(x)
Definition: c.h:325
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BlockNumber btpo_prev
Definition: nbtree.h:64

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetPage(), CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, likely, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().

◆ _bt_moveright()

/*
 *	_bt_moveright() -- move right in the btree, if necessary.
 *
 * When we follow a downlink (or a sibling link) to reach a page, the page
 * may have split in the meantime, so the key we seek might now live on a
 * page to the right.  Step right as many times as needed until we reach a
 * page whose high key bounds the scan key.  Also steps right over dead
 * (half-dead/ignored) pages.
 *
 * rel: index relation
 * heaprel: heap relation (must be non-NULL when forupdate, since finishing
 *		an incomplete split can allocate new pages)
 * key: insertion-style scan key describing the search
 * buf: pinned and locked buffer we arrived at; caller's lock is traded away
 *		as we move; the returned buffer is pinned and locked instead
 * forupdate: true when descending for an insertion; permits (and requires)
 *		finishing any incomplete splits encountered along the way
 * stack: parent-page stack, needed by _bt_finish_split
 * access: BT_READ or BT_WRITE lock mode for pages we move onto
 *
 * Returns the buffer holding the correct page, pinned and locked in the
 * requested mode.
 */
static Buffer
_bt_moveright(Relation rel,
			  Relation heaprel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	Assert(!forupdate || heaprel != NULL);

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, heaprel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
#define P_HIKEY
Definition: nbtree.h:367
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BT_WRITE
Definition: nbtree.h:720
short access
Definition: preproc-type.c:36

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog, ERROR, sort-test::key, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().

◆ _bt_next()

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 * On entry, so->currPos describes the current page, and currPos.itemIndex
 * identifies the item most recently returned.  Advance itemIndex in the
 * scan direction; when the current page is exhausted, step to the next page
 * with matching data (via _bt_steppage), which refills currPos.
 *
 * On success, returns true and sets scan->xs_heaptid (and scan->xs_itup for
 * index-only scans) via _bt_returnitem.  Returns false when no more matching
 * tuples exist in the scan direction.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	Assert(BTScanPosIsValid(so->currPos));

	/*
	 * Advance to next tuple on current page; or if there's no more, try to
	 * step to the next page with data.
	 */
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}

	_bt_returnitem(scan, so);
	return true;
}
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2024
int firstItem
Definition: nbtree.h:984
int lastItem
Definition: nbtree.h:985
int itemIndex
Definition: nbtree.h:986

References _bt_returnitem(), _bt_steppage(), Assert, BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_readfirstpage()

/*
 *	_bt_readfirstpage() -- Read first page containing valid data for _bt_first
 *
 * Caller's offnum is the first matching offset on the page held pinned and
 * read-locked in so->currPos.buf.  Initializes the remaining so->currPos
 * bookkeeping, then loads matching tuples with _bt_readpage; if that first
 * page yields nothing, keeps stepping to further pages via _bt_steppage.
 *
 * Returns true when so->currPos holds at least one matching tuple (with the
 * buffer lock already dropped, per the btgettuple protocol); false when the
 * scan has no matching tuples at all in the given direction.
 */
static bool
_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */

	/* Initialize so->currPos for the first page (page in so->currPos.buf) */
	if (so->needPrimScan)
	{
		Assert(so->numArrayKeys);

		/* middle of a multi-primitive-scan array scan: keep both sides open */
		so->currPos.moreLeft = true;
		so->currPos.moreRight = true;
		so->needPrimScan = false;
	}
	else if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}

	/*
	 * Attempt to load matching tuples from the first page.
	 *
	 * Note that _bt_readpage will finish initializing the so->currPos fields.
	 * _bt_readpage also releases parallel scan (even when it returns false).
	 */
	if (_bt_readpage(scan, dir, offnum, true))
	{
		/*
		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
		 * so->currPos.buf in preparation for btgettuple returning tuples.
		 */
		Assert(BTScanPosIsPinned(so->currPos));
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
		return true;
	}

	/* There's no actually-matching data on the page in so->currPos.buf */
	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);

	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
	if (!_bt_steppage(scan, dir))
		return false;

	/* _bt_readpage for a later page (now in so->currPos) succeeded */
	return true;
}
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:993
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:67
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
Definition: nbtsearch.c:1513
bool moreRight
Definition: nbtree.h:975
bool moreLeft
Definition: nbtree.h:974

References _bt_drop_lock_and_maybe_pin(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert, BTScanPosIsPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, if(), IndexScanDescData::indexRelation, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), and _bt_first().

◆ _bt_readnextpage()

/*
 *	_bt_readnextpage() -- Read next page containing valid data for the scan
 *
 * Walks sibling links starting at blkno (lastcurrblkno being the page the
 * scan most recently read) until a page with matching tuples is loaded into
 * so->currPos by _bt_readpage, or the scan is exhausted.  For parallel
 * scans, each page to visit is first seized from shared scan state unless
 * the caller already seized it (seized=true applies to the first page only).
 *
 * Returns true with so->currPos holding matches (lock dropped, pin possibly
 * kept, per the btgettuple protocol); false when the scan (or the current
 * primitive scan, for array keys) is done, with so->currPos invalidated.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
				 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	Assert(so->currPos.currPage == lastcurrblkno || seized);
	Assert(!BTScanPosIsPinned(so->currPos));

	/*
	 * Remember that the scan already read lastcurrblkno, a page to the left
	 * of blkno (or remember reading a page to the right, for backwards scans)
	 */
	if (ScanDirectionIsForward(dir))
		so->currPos.moreLeft = true;
	else
		so->currPos.moreRight = true;

	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;

		if (blkno == P_NONE ||
			(ScanDirectionIsForward(dir) ?
			 !so->currPos.moreRight : !so->currPos.moreLeft))
		{
			/* most recent _bt_readpage call (for lastcurrblkno) ended scan */
			Assert(so->currPos.currPage == lastcurrblkno && !seized);
			BTScanPosInvalidate(so->currPos);
			_bt_parallel_done(scan);	/* iff !so->needPrimScan */
			return false;
		}

		Assert(!so->needPrimScan);

		/* parallel scan must never actually visit so->currPos blkno */
		if (!seized && scan->parallel_scan != NULL &&
			!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
		{
			/* whole scan is now done (or another primitive scan required) */
			BTScanPosInvalidate(so->currPos);
			return false;
		}

		if (ScanDirectionIsForward(dir))
		{
			/* read blkno, but check for interrupts first */
			CHECK_FOR_INTERRUPTS();
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
		}
		else
		{
			/* read blkno, avoiding race (also checks for interrupts) */
			so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
														 lastcurrblkno);
			if (so->currPos.buf == InvalidBuffer)
			{
				/* must have been a concurrent deletion of leftmost page */
				BTScanPosInvalidate(so->currPos);
				_bt_parallel_done(scan);
				return false;
			}
		}

		page = BufferGetPage(so->currPos.buf);
		opaque = BTPageGetOpaque(page);
		lastcurrblkno = blkno;
		if (likely(!P_IGNORE(opaque)))
		{
			/* see if there are any matches on this page */
			if (ScanDirectionIsForward(dir))
			{
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
					break;
				blkno = so->currPos.nextPage;
			}
			else
			{
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
					break;
				blkno = so->currPos.prevPage;
			}
		}
		else
		{
			/* _bt_readpage not called, so do all this for ourselves */
			if (ScanDirectionIsForward(dir))
				blkno = opaque->btpo_next;
			else
				blkno = opaque->btpo_prev;
			if (scan->parallel_scan != NULL)
				_bt_parallel_release(scan, blkno, lastcurrblkno);
		}

		/* no matching tuples on this page */
		_bt_relbuf(rel, so->currPos.buf);
		seized = false;			/* released by _bt_readpage (or by us) */
	}

	/*
	 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
	 * so->currPos.buf in preparation for btgettuple returning tuples.
	 */
	Assert(so->currPos.currPage == blkno);
	Assert(BTScanPosIsPinned(so->currPos));
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:747
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1016
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition: nbtsearch.c:2347
BlockNumber currPage
Definition: nbtree.h:956
BlockNumber prevPage
Definition: nbtree.h:957
BlockNumber nextPage
Definition: nbtree.h:958

References _bt_drop_lock_and_maybe_pin(), _bt_getbuf(), _bt_lock_and_validate_left(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), Assert, BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosData::buf, BufferGetPage(), CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBuffer, likely, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, BTScanPosData::prevPage, and ScanDirectionIsForward.

Referenced by _bt_first(), and _bt_steppage().

◆ _bt_readpage()

/*
 *	_bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the page is
 * scanned starting at offnum in the indicated direction, and every tuple
 * satisfying the scan keys is saved into so->currPos.items.  currPos's
 * moreLeft/moreRight flag (as appropriate for dir) is cleared when
 * _bt_checkkeys determines no further matches can exist in that direction.
 * For parallel scans, the next page is released to other workers up front.
 *
 * Returns true if any matching items were found on the page, else false.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
			 bool firstPage)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	BTReadPageState pstate;
	bool		arrayKeys;
	int			itemIndex,
				indnatts;

	/* save the page/buffer block number, along with its sibling links */
	page = BufferGetPage(so->currPos.buf);
	opaque = BTPageGetOpaque(page);
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
	so->currPos.prevPage = opaque->btpo_prev;
	so->currPos.nextPage = opaque->btpo_next;

	Assert(!P_IGNORE(opaque));
	Assert(BTScanPosIsPinned(so->currPos));
	Assert(!so->needPrimScan);

	if (scan->parallel_scan)
	{
		/* allow next/prev page to be read by other worker without delay */
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, so->currPos.nextPage,
								 so->currPos.currPage);
		else
			_bt_parallel_release(scan, so->currPos.prevPage,
								 so->currPos.currPage);
	}

	/* initialize remaining currPos fields related to current page */
	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
	so->currPos.dir = dir;
	so->currPos.nextTupleOffset = 0;
	/* either moreLeft or moreRight should be set now (may be unset later) */
	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
		   so->currPos.moreLeft);

	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);

	/* initialize local variables */
	indnatts = IndexRelationGetNumberOfAttributes(rel);
	arrayKeys = so->numArrayKeys != 0;
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/* initialize page-level state that we'll pass to _bt_checkkeys */
	pstate.minoff = minoff;
	pstate.maxoff = maxoff;
	pstate.finaltup = NULL;
	pstate.page = page;
	pstate.offnum = InvalidOffsetNumber;
	pstate.skip = InvalidOffsetNumber;
	pstate.continuescan = true; /* default assumption */
	pstate.prechecked = false;
	pstate.firstmatch = false;
	pstate.rechecks = 0;
	pstate.targetdistance = 0;

	/*
	 * Prechecking the value of the continuescan flag for the last item on the
	 * page (for backwards scan it will be the first item on a page).  If we
	 * observe it to be true, then it should be true for all other items. This
	 * allows us to do significant optimizations in the _bt_checkkeys()
	 * function for all the items on the page.
	 *
	 * With the forward scan, we do this check for the last item on the page
	 * instead of the high key.  It's relatively likely that the most
	 * significant column in the high key will be different from the
	 * corresponding value from the last item on the page.  So checking with
	 * the last item on the page would give a more precise answer.
	 *
	 * We skip this for the first page read by each (primitive) scan, to avoid
	 * slowing down point queries.  They typically don't stand to gain much
	 * when the optimization can be applied, and are more likely to notice the
	 * overhead of the precheck.
	 *
	 * The optimization is unsafe and must be avoided whenever _bt_checkkeys
	 * just set a low-order required array's key to the best available match
	 * for a truncated -inf attribute value from the prior page's high key
	 * (array element 0 is always the best available match in this scenario).
	 * It's quite likely that matches for array element 0 begin on this page,
	 * but the start of matches won't necessarily align with page boundaries.
	 * When the start of matches is somewhere in the middle of this page, it
	 * would be wrong to treat page's final non-pivot tuple as representative.
	 * Doing so might lead us to treat some of the page's earlier tuples as
	 * being part of a group of tuples thought to satisfy the required keys.
	 *
	 * Note: Conversely, in the case where the scan's arrays just advanced
	 * using the prior page's HIKEY _without_ advancement setting scanBehind,
	 * the start of matches must be aligned with page boundaries, which makes
	 * it safe to attempt the optimization here now.  It's also safe when the
	 * prior page's HIKEY simply didn't need to advance any required array. In
	 * both cases we can safely assume that the _first_ tuple from this page
	 * must be >= the current set of array keys/equality constraints. And so
	 * if the final tuple is == those same keys (and also satisfies any
	 * required < or <= strategy scan keys) during the precheck, we can safely
	 * assume that this must also be true of all earlier tuples from the page.
	 */
	if (!firstPage && !so->scanBehind && minoff < maxoff)
	{
		ItemId		iid;
		IndexTuple	itup;

		iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
		itup = (IndexTuple) PageGetItem(page, iid);

		/* Call with arrayKeys=false to avoid undesirable side-effects */
		_bt_checkkeys(scan, &pstate, false, itup, indnatts);
		pstate.prechecked = pstate.continuescan;
		pstate.continuescan = true; /* reset */
	}

	if (ScanDirectionIsForward(dir))
	{
		/* SK_SEARCHARRAY forward scans must provide high key up front */
		if (arrayKeys && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);

			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);

			if (unlikely(so->oppositeDirCheck))
			{
				Assert(so->scanBehind);

				/*
				 * Last _bt_readpage call scheduled a recheck of finaltup for
				 * required scan keys up to and including a > or >= scan key.
				 *
				 * _bt_checkkeys won't consider the scanBehind flag unless the
				 * scan is stopped by a scan key required in the current scan
				 * direction.  We need this recheck so that we'll notice when
				 * all tuples on this page are still before the _bt_first-wise
				 * start of matches for the current set of array keys.
				 */
				if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
				{
					/* Schedule another primitive index scan after all */
					so->currPos.moreRight = false;
					so->needPrimScan = true;
					return false;
				}

				/* Deliberately don't unset scanBehind flag just yet */
			}
		}

		/* load items[] in ascending order */
		itemIndex = 0;

		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum < pstate.skip);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals)
			{
				/* tuple passes all scan key conditions */
				pstate.firstmatch = true;
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					_bt_saveitem(so, itemIndex, offnum, itup);
					itemIndex++;
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID
					 */
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					itemIndex++;
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
						itemIndex++;
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (pstate.continuescan && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			truncatt = BTreeTupleGetNAtts(itup, rel);
			pstate.prechecked = false;	/* precheck didn't cover HIKEY */
			_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
		}

		if (!pstate.continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxTIDsPerBTreePage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* SK_SEARCHARRAY backward scans must provide final tuple up front */
		if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, minoff);

			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
		}

		/* load items[] in descending order */
		itemIndex = MaxTIDsPerBTreePage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				if (offnum > minoff)
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum > pstate.skip);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions */
				pstate.firstmatch = true;
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					itemIndex--;
					_bt_saveitem(so, itemIndex, offnum, itup);
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID.
					 *
					 * Note that we deliberately save/return items from
					 * posting lists in ascending heap TID order for backwards
					 * scans.  This allows _bt_killitems() to make a
					 * consistent assumption about the order of items
					 * associated with the same posting list tuple.
					 */
					itemIndex--;
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						itemIndex--;
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberPrev(offnum);
		}

		/*
		 * We don't need to visit page to the left when no more matches will
		 * be found there
		 */
		if (!pstate.continuescan)
			so->currPos.moreLeft = false;

		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3985
#define Max(x, y)
Definition: c.h:1003
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1909
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1939
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:1977
bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtutils.c:3472
bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
Definition: nbtutils.c:3627
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2589
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:517
bool oppositeDirCheck
Definition: nbtree.h:1042
int nextTupleOffset
Definition: nbtree.h:968
ScanDirection dir
Definition: nbtree.h:962
XLogRecPtr lsn
Definition: nbtree.h:959
bool ignore_killed_tuples
Definition: relscan.h:152

References _bt_checkkeys(), _bt_oppodir_checkkeys(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_setuppostingitems(), Assert, BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::dir, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, InvalidOffsetNumber, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, OffsetNumberIsValid, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_LEFTMOST, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), BTScanPosData::prevPage, BTScanOpaqueData::scanBehind, ScanDirectionIsForward, unlikely, and IndexScanDescData::xs_snapshot.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().

◆ _bt_returnitem()

/*
 *	_bt_returnitem() -- Return current item from so->currPos to the scan
 *
 * Sets scan->xs_heaptid (and scan->xs_itup for index-only scans) from the
 * item at so->currPos.itemIndex, per the amgettuple contract.  The most
 * recent _bt_readpage call must have succeeded, leaving itemIndex within
 * [firstItem, lastItem].
 */
static inline void
_bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
{
	BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];

	/* Most recent _bt_readpage must have succeeded */
	Assert(BTScanPosIsValid(so->currPos));
	Assert(so->currPos.firstItem <= so->currPos.itemIndex);
	Assert(so->currPos.itemIndex <= so->currPos.lastItem);

	/* Return next item, per amgettuple contract */
	scan->xs_heaptid = currItem->heapTid;
	if (so->currTuples)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
}
char * currTuples
Definition: nbtree.h:1056
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:988
ItemPointerData heapTid
Definition: nbtree.h:946
LocationIndex tupleOffset
Definition: nbtree.h:948
IndexTuple xs_itup
Definition: relscan.h:165
ItemPointerData xs_heaptid
Definition: relscan.h:170

References Assert, BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanPosItem::tupleOffset, IndexScanDescData::xs_heaptid, and IndexScanDescData::xs_itup.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

◆ _bt_saveitem()

/*
 *	_bt_saveitem() -- Save a plain (non-posting) tuple in currPos.items
 *
 * Records the tuple's heap TID and page offset at items[itemIndex].  For
 * index-only scans (so->currTuples set), a MAXALIGN'd copy of the whole
 * index tuple is also appended to the currTuples work area.
 */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
			 OffsetNumber offnum, IndexTuple itup)
{
	BTScanPosItem *item = &so->currPos.items[itemIndex];

	Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));

	item->heapTid = itup->t_tid;
	item->indexOffset = offnum;

	if (so->currTuples)
	{
		/* stash a full copy of the tuple for index-only scans */
		Size		tuplen = IndexTupleSize(itup);

		item->tupleOffset = so->currPos.nextTupleOffset;
		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, tuplen);
		so->currPos.nextTupleOffset += MAXALIGN(tuplen);
	}
}
#define MAXALIGN(LEN)
Definition: c.h:816
size_t Size
Definition: c.h:610
#define IndexTupleSize(itup)
Definition: itup.h:70
OffsetNumber indexOffset
Definition: nbtree.h:947
ItemPointerData t_tid
Definition: itup.h:37

References Assert, BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_savepostingitem()

/*
 *	_bt_savepostingitem() -- Save one additional TID from a posting list
 *
 * Records heapTid and the posting tuple's page offset at items[itemIndex].
 * tupleOffset is the work-area offset returned by _bt_setuppostingitems for
 * this posting list's shared base tuple.
 */
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					ItemPointer heapTid, int tupleOffset)
{
	BTScanPosItem *item = &so->currPos.items[itemIndex];

	item->heapTid = *heapTid;
	item->indexOffset = offnum;

	/*
	 * Have index-only scans return the same base IndexTuple for every TID
	 * that originates from the same posting list
	 */
	if (so->currTuples)
		item->tupleOffset = tupleOffset;
}

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_search()

/*
 *	_bt_search() -- Descend the btree to the leaf page covering a scan key.
 *
 * Starting from the root, repeatedly binary-search the current internal page
 * for the pivot whose downlink covers the key, moving right over any
 * concurrent splits, until a leaf page is reached.  *bufP is left pinned and
 * locked on that leaf (read lock for access = BT_READ, write lock for
 * BT_WRITE).  A stack of (block, offset) entries describing the descent path
 * is returned for use by callers that may need to split pages; heaprel must
 * be supplied whenever access = BT_WRITE.  Returns NULL for an empty index
 * read (no root page exists yet).
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
		   int access)
{
	BTStack		stack = NULL;
	int			lock_mode = BT_READ;

	/* heaprel must be set whenever _bt_allocbuf is reachable */
	Assert(access == BT_READ || access == BT_WRITE);
	Assert(access == BT_READ || heaprel != NULL);

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, heaprel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber child;
		BTStack		entry;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * downlink in its parent page (or the metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (internal pages with incomplete splits are
		 * also taken care of in _bt_getstackbuf).  But this is a good
		 * opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
							  stack, lock_mode);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = BTPageGetOpaque(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate pivot tuple on this page.  Its downlink points
		 * to the child page that we're about to descend to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
		child = BTreeTupleGetDownLink(itup);

		/*
		 * Record the pivot we chose in a new stack entry for this level; a
		 * caller splitting a page one level down usually inserts its new
		 * downlink immediately after this location.
		 */
		entry = (BTStack) palloc(sizeof(BTStackData));
		entry->bts_blkno = BufferGetBlockNumber(*bufP);
		entry->bts_offset = offnum;
		entry->bts_parent = stack;

		/*
		 * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
		 * we're on level 1 and asked to lock the leaf page in write mode,
		 * lock the next page in write mode, because it must be a leaf.
		 */
		if (opaque->btpo_level == 1 && access == BT_WRITE)
			lock_mode = BT_WRITE;

		/* drop the read lock on the page, then acquire one on its child */
		*bufP = _bt_relandgetbuf(rel, *bufP, child, lock_mode);

		/* okay, all set to move down a level */
		stack = entry;
	}

	/*
	 * If we're asked to lock leaf in write mode, but didn't manage to, then
	 * relock.  This should only happen when the root page is a leaf page (and
	 * the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && lock_mode == BT_READ)
	{
		/* trade in our read lock for a write lock */
		_bt_unlockbuf(rel, *bufP);
		_bt_lockbuf(rel, *bufP, BT_WRITE);

		/*
		 * Race -- the leaf page may have split after we dropped the read lock
		 * but before we acquired a write lock.  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack, BT_WRITE);
	}

	return stack;
}
void * palloc(Size size)
Definition: mcxt.c:1317
BTStackData * BTStack
Definition: nbtree.h:739
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:241
BlockNumber bts_blkno
Definition: nbtree.h:734
struct BTStackData * bts_parent
Definition: nbtree.h:736
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 1939 of file nbtsearch.c.

1941 {
1942  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1943 
1944  Assert(BTreeTupleIsPosting(itup));
1945 
1946  currItem->heapTid = *heapTid;
1947  currItem->indexOffset = offnum;
1948  if (so->currTuples)
1949  {
1950  /* Save base IndexTuple (truncate posting list) */
1951  IndexTuple base;
1952  Size itupsz = BTreeTupleGetPostingOffset(itup);
1953 
1954  itupsz = MAXALIGN(itupsz);
1955  currItem->tupleOffset = so->currPos.nextTupleOffset;
1956  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1957  memcpy(base, itup, itupsz);
1958  /* Defensively reduce work area index tuple header size */
1959  base->t_info &= ~INDEX_SIZE_MASK;
1960  base->t_info |= itupsz;
1961  so->currPos.nextTupleOffset += itupsz;
1962 
1963  return currItem->tupleOffset;
1964  }
1965 
1966  return 0;
1967 }
#define INDEX_SIZE_MASK
Definition: itup.h:65
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
unsigned short t_info
Definition: itup.h:49

References Assert, BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, INDEX_SIZE_MASK, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2024 of file nbtsearch.c.

2025 {
2026  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2027  BlockNumber blkno,
2028  lastcurrblkno;
2029 
2031 
2032  /* Before leaving current page, deal with any killed items */
2033  if (so->numKilled > 0)
2034  _bt_killitems(scan);
2035 
2036  /*
2037  * Before we modify currPos, make a copy of the page data if there was a
2038  * mark position that needs it.
2039  */
2040  if (so->markItemIndex >= 0)
2041  {
2042  /* bump pin on current buffer for assignment to mark buffer */
2043  if (BTScanPosIsPinned(so->currPos))
2045  memcpy(&so->markPos, &so->currPos,
2046  offsetof(BTScanPosData, items[1]) +
2047  so->currPos.lastItem * sizeof(BTScanPosItem));
2048  if (so->markTuples)
2049  memcpy(so->markTuples, so->currTuples,
2050  so->currPos.nextTupleOffset);
2051  so->markPos.itemIndex = so->markItemIndex;
2052  so->markItemIndex = -1;
2053 
2054  /*
2055  * If we're just about to start the next primitive index scan
2056  * (possible with a scan that has arrays keys, and needs to skip to
2057  * continue in the current scan direction), moreLeft/moreRight only
2058  * indicate the end of the current primitive index scan. They must
2059  * never be taken to indicate that the top-level index scan has ended
2060  * (that would be wrong).
2061  *
2062  * We could handle this case by treating the current array keys as
2063  * markPos state. But depending on the current array state like this
2064  * would add complexity. Instead, we just unset markPos's copy of
2065  * moreRight or moreLeft (whichever might be affected), while making
2066  * btrestpos reset the scan's arrays to their initial scan positions.
2067  * In effect, btrestpos leaves advancing the arrays up to the first
2068  * _bt_readpage call (that takes place after it has restored markPos).
2069  */
2070  if (so->needPrimScan)
2071  {
2073  so->markPos.moreRight = true;
2074  else
2075  so->markPos.moreLeft = true;
2076  }
2077 
2078  /* mark/restore not supported by parallel scans */
2079  Assert(!scan->parallel_scan);
2080  }
2081 
2083 
2084  /* Walk to the next page with data */
2085  if (ScanDirectionIsForward(dir))
2086  blkno = so->currPos.nextPage;
2087  else
2088  blkno = so->currPos.prevPage;
2089  lastcurrblkno = so->currPos.currPage;
2090 
2091  /*
2092  * Cancel primitive index scans that were scheduled when the call to
2093  * _bt_readpage for currPos happened to use the opposite direction to the
2094  * one that we're stepping in now. (It's okay to leave the scan's array
2095  * keys as-is, since the next _bt_readpage will advance them.)
2096  */
2097  if (so->currPos.dir != dir)
2098  so->needPrimScan = false;
2099 
2100  return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
2101 }
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4956
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1004
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:4178
char * markTuples
Definition: nbtree.h:1057
BTScanPosData markPos
Definition: nbtree.h:1070
static ItemArray items
Definition: test_tidstore.c:48

References _bt_killitems(), _bt_readnextpage(), Assert, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTScanPosData::prevPage, and ScanDirectionIsForward.

Referenced by _bt_next(), and _bt_readfirstpage().