PostgreSQL Source Code  git master
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 337 of file nbtsearch.c.

340 {
341  Page page;
342  BTPageOpaque opaque;
343  OffsetNumber low,
344  high;
345  int32 result,
346  cmpval;
347 
348  page = BufferGetPage(buf);
349  opaque = BTPageGetOpaque(page);
350 
351  /* Requesting nextkey semantics while using scantid seems nonsensical */
352  Assert(!key->nextkey || key->scantid == NULL);
353  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
354  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
355 
356  low = P_FIRSTDATAKEY(opaque);
357  high = PageGetMaxOffsetNumber(page);
358 
359  /*
360  * If there are no keys on the page, return the first available slot. Note
361  * this covers two cases: the page is really empty (no keys), or it
362  * contains only a high key. The latter case is possible after vacuuming.
363  * This can never happen on an internal page, however, since they are
364  * never empty (an internal page must have at least one child).
365  */
366  if (unlikely(high < low))
367  return low;
368 
369  /*
370  * Binary search to find the first key on the page >= scan key, or first
371  * key > scankey when nextkey is true.
372  *
373  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
374  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
375  *
376  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
377  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
378  *
379  * We can fall out when high == low.
380  */
381  high++; /* establish the loop invariant for high */
382 
383  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
384 
385  while (high > low)
386  {
387  OffsetNumber mid = low + ((high - low) / 2);
388 
389  /* We have low <= mid < high, so mid points at a real slot */
390 
391  result = _bt_compare(rel, key, page, mid);
392 
393  if (result >= cmpval)
394  low = mid + 1;
395  else
396  high = mid;
397  }
398 
399  /*
400  * At this point we have high == low.
401  *
402  * On a leaf page we always return the first non-pivot tuple >= scan key
403  * (resp. > scan key) for forward scan callers. For backward scans, it's
404  * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
405  */
406  if (P_ISLEAF(opaque))
407  {
408  /*
409  * In the backward scan case we're supposed to locate the last
410  * matching tuple on the leaf level -- not the first matching tuple
411  * (the last tuple will be the first one returned by the scan).
412  *
413  * At this point we've located the first non-pivot tuple immediately
414  * after the last matching tuple (which might just be maxoff + 1).
415  * Compensate by stepping back.
416  */
417  if (key->backward)
418  return OffsetNumberPrev(low);
419 
420  return low;
421  }
422 
423  /*
424  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
425  * There must be one if _bt_compare() is playing by the rules.
426  *
427  * _bt_compare() will seldom see any exactly-matching pivot tuples, since
428  * a truncated -inf heap TID is usually enough to prevent it altogether.
429  * Even omitted scan key entries are treated as > truncated attributes.
430  *
431  * However, during backward scans _bt_compare() interprets omitted scan
432  * key attributes as == corresponding truncated -inf attributes instead.
433  * This works just like < would work here. Under this scheme, < strategy
434  * backward scans will always directly descend to the correct leaf page.
435  * In particular, they will never incur an "extra" leaf page access with a
436  * scan key that happens to contain the same prefix of values as some
437  * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
438  * it uses a leaf page high key to "re-find" a page undergoing deletion.
439  */
440  Assert(low > P_FIRSTDATAKEY(opaque));
441 
442  return OffsetNumberPrev(low);
443 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
Pointer Page
Definition: bufpage.h:78
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
signed int int32
Definition: c.h:483
#define unlikely(x)
Definition: c.h:300
Assert(fmt[strlen(fmt) - 1] !='\n')
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:682
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:73

References _bt_compare(), Assert(), BTPageGetOpaque, buf, BufferGetPage(), sort-test::key, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 468 of file nbtsearch.c.

469 {
470  BTScanInsert key = insertstate->itup_key;
471  Page page;
472  BTPageOpaque opaque;
473  OffsetNumber low,
474  high,
475  stricthigh;
476  int32 result,
477  cmpval;
478 
479  page = BufferGetPage(insertstate->buf);
480  opaque = BTPageGetOpaque(page);
481 
482  Assert(P_ISLEAF(opaque));
483  Assert(!key->nextkey);
484  Assert(insertstate->postingoff == 0);
485 
486  if (!insertstate->bounds_valid)
487  {
488  /* Start new binary search */
489  low = P_FIRSTDATAKEY(opaque);
490  high = PageGetMaxOffsetNumber(page);
491  }
492  else
493  {
494  /* Restore result of previous binary search against same page */
495  low = insertstate->low;
496  high = insertstate->stricthigh;
497  }
498 
499  /* If there are no keys on the page, return the first available slot */
500  if (unlikely(high < low))
501  {
502  /* Caller can't reuse bounds */
503  insertstate->low = InvalidOffsetNumber;
504  insertstate->stricthigh = InvalidOffsetNumber;
505  insertstate->bounds_valid = false;
506  return low;
507  }
508 
509  /*
510  * Binary search to find the first key on the page >= scan key. (nextkey
511  * is always false when inserting).
512  *
513  * The loop invariant is: all slots before 'low' are < scan key, all slots
514  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
515  * maintained to save additional search effort for caller.
516  *
517  * We can fall out when high == low.
518  */
519  if (!insertstate->bounds_valid)
520  high++; /* establish the loop invariant for high */
521  stricthigh = high; /* high initially strictly higher */
522 
523  cmpval = 1; /* !nextkey comparison value */
524 
525  while (high > low)
526  {
527  OffsetNumber mid = low + ((high - low) / 2);
528 
529  /* We have low <= mid < high, so mid points at a real slot */
530 
531  result = _bt_compare(rel, key, page, mid);
532 
533  if (result >= cmpval)
534  low = mid + 1;
535  else
536  {
537  high = mid;
538  if (result != 0)
539  stricthigh = high;
540  }
541 
542  /*
543  * If tuple at offset located by binary search is a posting list whose
544  * TID range overlaps with caller's scantid, perform posting list
545  * binary search to set postingoff for caller. Caller must split the
546  * posting list when postingoff is set. This should happen
547  * infrequently.
548  */
549  if (unlikely(result == 0 && key->scantid != NULL))
550  {
551  /*
552  * postingoff should never be set more than once per leaf page
553  * binary search. That would mean that there are duplicate table
554  * TIDs in the index, which is never okay. Check for that here.
555  */
556  if (insertstate->postingoff != 0)
557  ereport(ERROR,
558  (errcode(ERRCODE_INDEX_CORRUPTED),
559  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
560  ItemPointerGetBlockNumber(key->scantid),
562  low, stricthigh,
563  BufferGetBlockNumber(insertstate->buf),
564  RelationGetRelationName(rel))));
565 
566  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
567  }
568  }
569 
570  /*
571  * On a leaf page, a binary search always returns the first key >= scan
572  * key (at least in !nextkey case), which could be the last slot + 1. This
573  * is also the lower bound of cached search.
574  *
575  * stricthigh may also be the last slot + 1, which prevents caller from
576  * using bounds directly, but is still useful to us if we're called a
577  * second time with cached bounds (cached low will be < stricthigh when
578  * that happens).
579  */
580  insertstate->low = low;
581  insertstate->stricthigh = stricthigh;
582  insertstate->bounds_valid = true;
583 
584  return low;
585 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3378
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1162
int errcode(int sqlerrcode)
Definition: elog.c:860
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:596
#define InvalidOffsetNumber
Definition: off.h:26
#define RelationGetRelationName(relation)
Definition: rel.h:538
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_posting(), _bt_compare(), Assert(), BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 596 of file nbtsearch.c.

597 {
598  IndexTuple itup;
599  ItemId itemid;
600  int low,
601  high,
602  mid,
603  res;
604 
605  /*
606  * If this isn't a posting tuple, then the index must be corrupt (if it is
607  * an ordinary non-pivot tuple then there must be an existing tuple with a
608  * heap TID that equals inserter's new heap TID/scantid). Defensively
609  * check that tuple is a posting list tuple whose posting list range
610  * includes caller's scantid.
611  *
612  * (This is also needed because contrib/amcheck's rootdescend option needs
613  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
614  */
615  itemid = PageGetItemId(page, offnum);
616  itup = (IndexTuple) PageGetItem(page, itemid);
617  if (!BTreeTupleIsPosting(itup))
618  return 0;
619 
620  Assert(key->heapkeyspace && key->allequalimage);
621 
622  /*
623  * In the event that posting list tuple has LP_DEAD bit set, indicate this
624  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
625  * second call to _bt_binsrch_insert() can take place when its caller has
626  * removed the dead item.
627  */
628  if (ItemIdIsDead(itemid))
629  return -1;
630 
631  /* "high" is past end of posting list for loop invariant */
632  low = 0;
633  high = BTreeTupleGetNPosting(itup);
634  Assert(high >= 2);
635 
636  while (high > low)
637  {
638  mid = low + ((high - low) / 2);
639  res = ItemPointerCompare(key->scantid,
640  BTreeTupleGetPostingN(itup, mid));
641 
642  if (res > 0)
643  low = mid + 1;
644  else if (res < 0)
645  high = mid;
646  else
647  return mid;
648  }
649 
650  /* Exact match not found */
651  return low;
652 }
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), sort-test::key, PageGetItem(), PageGetItemId(), and res.

Referenced by _bt_binsrch_insert().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 682 of file nbtsearch.c.

686 {
687  TupleDesc itupdesc = RelationGetDescr(rel);
688  BTPageOpaque opaque = BTPageGetOpaque(page);
689  IndexTuple itup;
690  ItemPointer heapTid;
691  ScanKey scankey;
692  int ncmpkey;
693  int ntupatts;
694  int32 result;
695 
696  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
698  Assert(key->heapkeyspace || key->scantid == NULL);
699 
700  /*
701  * Force result ">" if target item is first data item on an internal page
702  * --- see NOTE above.
703  */
704  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
705  return 1;
706 
707  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
708  ntupatts = BTreeTupleGetNAtts(itup, rel);
709 
710  /*
711  * The scan key is set up with the attribute number associated with each
712  * term in the key. It is important that, if the index is multi-key, the
713  * scan contain the first k key attributes, and that they be in order. If
714  * you think about how multi-key ordering works, you'll understand why
715  * this is.
716  *
717  * We don't test for violation of this condition here, however. The
718  * initial setup for the index scan had better have gotten it right (see
719  * _bt_first).
720  */
721 
722  ncmpkey = Min(ntupatts, key->keysz);
723  Assert(key->heapkeyspace || ncmpkey == key->keysz);
724  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
725  scankey = key->scankeys;
726  for (int i = 1; i <= ncmpkey; i++)
727  {
728  Datum datum;
729  bool isNull;
730 
731  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
732 
733  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
734  {
735  if (isNull)
736  result = 0; /* NULL "=" NULL */
737  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
738  result = -1; /* NULL "<" NOT_NULL */
739  else
740  result = 1; /* NULL ">" NOT_NULL */
741  }
742  else if (isNull) /* key is NOT_NULL and item is NULL */
743  {
744  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
745  result = 1; /* NOT_NULL ">" NULL */
746  else
747  result = -1; /* NOT_NULL "<" NULL */
748  }
749  else
750  {
751  /*
752  * The sk_func needs to be passed the index value as left arg and
753  * the sk_argument as right arg (they might be of different
754  * types). Since it is convenient for callers to think of
755  * _bt_compare as comparing the scankey to the index item, we have
756  * to flip the sign of the comparison result. (Unless it's a DESC
757  * column, in which case we *don't* flip the sign.)
758  */
759  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
760  scankey->sk_collation,
761  datum,
762  scankey->sk_argument));
763 
764  if (!(scankey->sk_flags & SK_BT_DESC))
765  INVERT_COMPARE_RESULT(result);
766  }
767 
768  /* if the keys are unequal, return the difference */
769  if (result != 0)
770  return result;
771 
772  scankey++;
773  }
774 
775  /*
776  * All non-truncated attributes (other than heap TID) were found to be
777  * equal. Treat truncated attributes as minus infinity when scankey has a
778  * key attribute value that would otherwise be compared directly.
779  *
780  * Note: it doesn't matter if ntupatts includes non-key attributes;
781  * scankey won't, so explicitly excluding non-key attributes isn't
782  * necessary.
783  */
784  if (key->keysz > ntupatts)
785  return 1;
786 
787  /*
788  * Use the heap TID attribute and scantid to try to break the tie. The
789  * rules are the same as any other key attribute -- only the
790  * representation differs.
791  */
792  heapTid = BTreeTupleGetHeapTID(itup);
793  if (key->scantid == NULL)
794  {
795  /*
796  * Forward scans have a scankey that is considered greater than a
797  * truncated pivot tuple if and when the scankey has equal values for
798  * attributes up to and including the least significant untruncated
799  * attribute in tuple. Even attributes that were omitted from the
800  * scan key are considered greater than -inf truncated attributes.
801  * (See _bt_binsrch for an explanation of our backward scan behavior.)
802  *
803  * For example, if an index has the minimum two attributes (single
804  * user key attribute, plus heap TID attribute), and a page's high key
805  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
806  * will not descend to the page to the left. The search will descend
807  * right instead. The truncated attribute in pivot tuple means that
808  * all non-pivot tuples on the page to the left are strictly < 'foo',
809  * so it isn't necessary to descend left. In other words, search
810  * doesn't have to descend left because it isn't interested in a match
811  * that has a heap TID value of -inf.
812  *
813  * Note: the heap TID part of the test ensures that scankey is being
814  * compared to a pivot tuple with one or more truncated -inf key
815  * attributes. The heap TID attribute is the last key attribute in
816  * every index, of course, but other than that it isn't special.
817  */
818  if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
819  key->heapkeyspace)
820  return 1;
821 
822  /* All provided scankey arguments found to be equal */
823  return 0;
824  }
825 
826  /*
827  * Treat truncated heap TID as minus infinity, since scankey has a key
828  * attribute value (scantid) that would otherwise be compared directly
829  */
831  if (heapTid == NULL)
832  return 1;
833 
834  /*
835  * Scankey must be treated as equal to a posting list tuple if its scantid
836  * value falls within the range of the posting list. In all other cases
837  * there can only be a single heap TID value, which is compared directly
838  * with scantid.
839  */
841  result = ItemPointerCompare(key->scantid, heapTid);
842  if (result <= 0 || !BTreeTupleIsPosting(itup))
843  return result;
844  else
845  {
846  result = ItemPointerCompare(key->scantid,
848  if (result > 0)
849  return 1;
850  }
851 
852  return 0;
853 }
#define Min(x, y)
Definition: c.h:993
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1095
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int i
Definition: isn.c:73
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1087
#define SK_BT_DESC
Definition: nbtree.h:1086
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2523
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define RelationGetDescr(relation)
Definition: rel.h:530
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:523
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 61 of file nbtsearch.c.

62 {
63  _bt_unlockbuf(scan->indexRelation, sp->buf);
64 
65  if (IsMVCCSnapshot(scan->xs_snapshot) &&
67  !scan->xs_want_itup)
68  {
69  ReleaseBuffer(sp->buf);
70  sp->buf = InvalidBuffer;
71  }
72 }
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4561
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1071
#define RelationNeedsWAL(relation)
Definition: rel.h:627
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
Buffer buf
Definition: nbtree.h:953
Relation indexRelation
Definition: relscan.h:118
struct SnapshotData * xs_snapshot
Definition: relscan.h:119

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2441 of file nbtsearch.c.

2442 {
2443  Relation rel = scan->indexRelation;
2444  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2445  Buffer buf;
2446  Page page;
2447  BTPageOpaque opaque;
2448  OffsetNumber start;
2449  BTScanPosItem *currItem;
2450 
2451  /*
2452  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2453  * version of _bt_search(). We don't maintain a stack since we know we
2454  * won't need it.
2455  */
2457 
2458  if (!BufferIsValid(buf))
2459  {
2460  /*
2461  * Empty index. Lock the whole relation, as nothing finer to lock
2462  * exists.
2463  */
2464  PredicateLockRelation(rel, scan->xs_snapshot);
2466  return false;
2467  }
2468 
2470  page = BufferGetPage(buf);
2471  opaque = BTPageGetOpaque(page);
2472  Assert(P_ISLEAF(opaque));
2473 
2474  if (ScanDirectionIsForward(dir))
2475  {
2476  /* There could be dead pages to the left, so not this: */
2477  /* Assert(P_LEFTMOST(opaque)); */
2478 
2479  start = P_FIRSTDATAKEY(opaque);
2480  }
2481  else if (ScanDirectionIsBackward(dir))
2482  {
2483  Assert(P_RIGHTMOST(opaque));
2484 
2485  start = PageGetMaxOffsetNumber(page);
2486  }
2487  else
2488  {
2489  elog(ERROR, "invalid scan direction: %d", (int) dir);
2490  start = 0; /* keep compiler quiet */
2491  }
2492 
2493  /* remember which buffer we have pinned */
2494  so->currPos.buf = buf;
2495 
2496  _bt_initialize_more_data(so, dir);
2497 
2498  /*
2499  * Now load data from the first page of the scan.
2500  */
2501  if (!_bt_readpage(scan, dir, start, false))
2502  {
2503  /*
2504  * There's no actually-matching data on this page. Try to advance to
2505  * the next page. Return false if there's no matching data at all.
2506  */
2507  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2508  if (!_bt_steppage(scan, dir))
2509  return false;
2510  }
2511  else
2512  {
2513  /* We have at least one item to return as scan's first item */
2515  }
2516 
2517  /* OK, itemIndex says what to return */
2518  currItem = &so->currPos.items[so->currPos.itemIndex];
2519  scan->xs_heaptid = currItem->heapTid;
2520  if (scan->xs_want_itup)
2521  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2522 
2523  return true;
2524 }
int Buffer
Definition: buf.h:23
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1013
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1076
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2360
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:61
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1944
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
Definition: nbtsearch.c:1522
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2531
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2569
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2546
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
BTScanPosData currPos
Definition: nbtree.h:1072
char * currTuples
Definition: nbtree.h:1059
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:985
int itemIndex
Definition: nbtree.h:983
IndexTuple xs_itup
Definition: relscan.h:142
ItemPointerData xs_heaptid
Definition: relscan.h:147

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert(), BTPageGetOpaque, BTScanPosInvalidate, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog(), ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 876 of file nbtsearch.c.

877 {
878  Relation rel = scan->indexRelation;
879  BTScanOpaque so = (BTScanOpaque) scan->opaque;
880  Buffer buf;
881  BTStack stack;
882  OffsetNumber offnum;
883  StrategyNumber strat;
884  BTScanInsertData inskey;
885  ScanKey startKeys[INDEX_MAX_KEYS];
886  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
887  int keysz = 0;
888  int i;
889  bool status;
890  StrategyNumber strat_total;
891  BTScanPosItem *currItem;
892  BlockNumber blkno;
893 
895 
897 
898  /*
899  * Examine the scan keys and eliminate any redundant keys; also mark the
900  * keys that must be matched to continue the scan.
901  */
902  _bt_preprocess_keys(scan);
903 
904  /*
905  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906  * never be satisfied (eg, x == 1 AND x > 2).
907  */
908  if (!so->qual_ok)
909  {
910  /* Notify any other workers that we're done with this scan key. */
911  _bt_parallel_done(scan);
912  return false;
913  }
914 
915  /*
916  * For parallel scans, get the starting page from shared state. If the
917  * scan has not started, proceed to find out first leaf page in the usual
918  * way while keeping other participating processes waiting. If the scan
919  * has already begun, use the page number from the shared structure.
920  */
921  if (scan->parallel_scan != NULL)
922  {
923  status = _bt_parallel_seize(scan, &blkno);
924  if (!status)
925  return false;
926  else if (blkno == P_NONE)
927  {
928  _bt_parallel_done(scan);
929  return false;
930  }
931  else if (blkno != InvalidBlockNumber)
932  {
933  if (!_bt_parallel_readpage(scan, blkno, dir))
934  return false;
935  goto readcomplete;
936  }
937  }
938 
939  /*----------
940  * Examine the scan keys to discover where we need to start the scan.
941  *
942  * We want to identify the keys that can be used as starting boundaries;
943  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
944  * a backwards scan. We can use keys for multiple attributes so long as
945  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
946  * a > or < boundary or find an attribute with no boundary (which can be
947  * thought of as the same as "> -infinity"), we can't use keys for any
948  * attributes to its right, because it would break our simplistic notion
949  * of what initial positioning strategy to use.
950  *
951  * When the scan keys include cross-type operators, _bt_preprocess_keys
952  * may not be able to eliminate redundant keys; in such cases we will
953  * arbitrarily pick a usable one for each attribute. This is correct
954  * but possibly not optimal behavior. (For example, with keys like
955  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
956  * x=5 would be more efficient.) Since the situation only arises given
957  * a poorly-worded query plus an incomplete opfamily, live with it.
958  *
959  * When both equality and inequality keys appear for a single attribute
960  * (again, only possible when cross-type operators appear), we *must*
961  * select one of the equality keys for the starting point, because
962  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
963  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
964  * start at x=4, we will fail and stop before reaching x=10. If multiple
965  * equality quals survive preprocessing, however, it doesn't matter which
966  * one we use --- by definition, they are either redundant or
967  * contradictory.
968  *
969  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
970  * If the index stores nulls at the end of the index we'll be starting
971  * from, and we have no boundary key for the column (which means the key
972  * we deduced NOT NULL from is an inequality key that constrains the other
973  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
974  * use as a boundary key. If we didn't do this, we might find ourselves
975  * traversing a lot of null entries at the start of the scan.
976  *
977  * In this loop, row-comparison keys are treated the same as keys on their
978  * first (leftmost) columns. We'll add on lower-order columns of the row
979  * comparison below, if possible.
980  *
981  * The selected scan keys (at most one per index column) are remembered by
982  * storing their addresses into the local startKeys[] array.
983  *----------
984  */
985  strat_total = BTEqualStrategyNumber;
986  if (so->numberOfKeys > 0)
987  {
988  AttrNumber curattr;
989  ScanKey chosen;
990  ScanKey impliesNN;
991  ScanKey cur;
992 
993  /*
994  * chosen is the so-far-chosen key for the current attribute, if any.
995  * We don't cast the decision in stone until we reach keys for the
996  * next attribute.
997  */
998  curattr = 1;
999  chosen = NULL;
1000  /* Also remember any scankey that implies a NOT NULL constraint */
1001  impliesNN = NULL;
1002 
1003  /*
1004  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1005  * pass to handle after-last-key processing. Actual exit from the
1006  * loop is at one of the "break" statements below.
1007  */
1008  for (cur = so->keyData, i = 0;; cur++, i++)
1009  {
1010  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1011  {
1012  /*
1013  * Done looking at keys for curattr. If we didn't find a
1014  * usable boundary key, see if we can deduce a NOT NULL key.
1015  */
1016  if (chosen == NULL && impliesNN != NULL &&
1017  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1018  ScanDirectionIsForward(dir) :
1020  {
1021  /* Yes, so build the key in notnullkeys[keysCount] */
1022  chosen = &notnullkeys[keysz];
1023  ScanKeyEntryInitialize(chosen,
1025  (impliesNN->sk_flags &
1027  curattr,
1028  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1031  InvalidOid,
1032  InvalidOid,
1033  InvalidOid,
1034  (Datum) 0);
1035  }
1036 
1037  /*
1038  * If we still didn't find a usable boundary key, quit; else
1039  * save the boundary key pointer in startKeys.
1040  */
1041  if (chosen == NULL)
1042  break;
1043  startKeys[keysz++] = chosen;
1044 
1045  /*
1046  * Adjust strat_total, and quit if we have stored a > or <
1047  * key.
1048  */
1049  strat = chosen->sk_strategy;
1050  if (strat != BTEqualStrategyNumber)
1051  {
1052  strat_total = strat;
1053  if (strat == BTGreaterStrategyNumber ||
1054  strat == BTLessStrategyNumber)
1055  break;
1056  }
1057 
1058  /*
1059  * Done if that was the last attribute, or if next key is not
1060  * in sequence (implying no boundary key is available for the
1061  * next attribute).
1062  */
1063  if (i >= so->numberOfKeys ||
1064  cur->sk_attno != curattr + 1)
1065  break;
1066 
1067  /*
1068  * Reset for next attr.
1069  */
1070  curattr = cur->sk_attno;
1071  chosen = NULL;
1072  impliesNN = NULL;
1073  }
1074 
1075  /*
1076  * Can we use this key as a starting boundary for this attr?
1077  *
1078  * If not, does it imply a NOT NULL constraint? (Because
1079  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1080  * *any* inequality key works for that; we need not test.)
1081  */
1082  switch (cur->sk_strategy)
1083  {
1084  case BTLessStrategyNumber:
1086  if (chosen == NULL)
1087  {
1088  if (ScanDirectionIsBackward(dir))
1089  chosen = cur;
1090  else
1091  impliesNN = cur;
1092  }
1093  break;
1094  case BTEqualStrategyNumber:
1095  /* override any non-equality choice */
1096  chosen = cur;
1097  break;
1100  if (chosen == NULL)
1101  {
1102  if (ScanDirectionIsForward(dir))
1103  chosen = cur;
1104  else
1105  impliesNN = cur;
1106  }
1107  break;
1108  }
1109  }
1110  }
1111 
1112  /*
1113  * If we found no usable boundary keys, we have to start from one end of
1114  * the tree. Walk down that edge to the first or last key, and scan from
1115  * there.
1116  */
1117  if (keysz == 0)
1118  {
1119  bool match;
1120 
1121  match = _bt_endpoint(scan, dir);
1122 
1123  if (!match)
1124  {
1125  /* No match, so mark (parallel) scan finished */
1126  _bt_parallel_done(scan);
1127  }
1128 
1129  return match;
1130  }
1131 
1132  /*
1133  * We want to start the scan somewhere within the index. Set up an
1134  * insertion scankey we can use to search for the boundary point we
1135  * identified above. The insertion scankey is built using the keys
1136  * identified by startKeys[]. (Remaining insertion scankey fields are
1137  * initialized after initial-positioning strategy is finalized.)
1138  */
1139  Assert(keysz <= INDEX_MAX_KEYS);
1140  for (i = 0; i < keysz; i++)
1141  {
1142  ScanKey cur = startKeys[i];
1143 
1144  Assert(cur->sk_attno == i + 1);
1145 
1146  if (cur->sk_flags & SK_ROW_HEADER)
1147  {
1148  /*
1149  * Row comparison header: look to the first row member instead.
1150  *
1151  * The member scankeys are already in insertion format (ie, they
1152  * have sk_func = 3-way-comparison function), but we have to watch
1153  * out for nulls, which _bt_preprocess_keys didn't check. A null
1154  * in the first row member makes the condition unmatchable, just
1155  * like qual_ok = false.
1156  */
1157  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1158 
1159  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1160  if (subkey->sk_flags & SK_ISNULL)
1161  {
1162  _bt_parallel_done(scan);
1163  return false;
1164  }
1165  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1166 
1167  /*
1168  * If the row comparison is the last positioning key we accepted,
1169  * try to add additional keys from the lower-order row members.
1170  * (If we accepted independent conditions on additional index
1171  * columns, we use those instead --- doesn't seem worth trying to
1172  * determine which is more restrictive.) Note that this is OK
1173  * even if the row comparison is of ">" or "<" type, because the
1174  * condition applied to all but the last row member is effectively
1175  * ">=" or "<=", and so the extra keys don't break the positioning
1176  * scheme. But, by the same token, if we aren't able to use all
1177  * the row members, then the part of the row comparison that we
1178  * did use has to be treated as just a ">=" or "<=" condition, and
1179  * so we'd better adjust strat_total accordingly.
1180  */
1181  if (i == keysz - 1)
1182  {
1183  bool used_all_subkeys = false;
1184 
1185  Assert(!(subkey->sk_flags & SK_ROW_END));
1186  for (;;)
1187  {
1188  subkey++;
1189  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1190  if (subkey->sk_attno != keysz + 1)
1191  break; /* out-of-sequence, can't use it */
1192  if (subkey->sk_strategy != cur->sk_strategy)
1193  break; /* wrong direction, can't use it */
1194  if (subkey->sk_flags & SK_ISNULL)
1195  break; /* can't use null keys */
1196  Assert(keysz < INDEX_MAX_KEYS);
1197  memcpy(inskey.scankeys + keysz, subkey,
1198  sizeof(ScanKeyData));
1199  keysz++;
1200  if (subkey->sk_flags & SK_ROW_END)
1201  {
1202  used_all_subkeys = true;
1203  break;
1204  }
1205  }
1206  if (!used_all_subkeys)
1207  {
1208  switch (strat_total)
1209  {
1210  case BTLessStrategyNumber:
1211  strat_total = BTLessEqualStrategyNumber;
1212  break;
1214  strat_total = BTGreaterEqualStrategyNumber;
1215  break;
1216  }
1217  }
1218  break; /* done with outer loop */
1219  }
1220  }
1221  else
1222  {
1223  /*
1224  * Ordinary comparison key. Transform the search-style scan key
1225  * to an insertion scan key by replacing the sk_func with the
1226  * appropriate btree comparison function.
1227  *
1228  * If scankey operator is not a cross-type comparison, we can use
1229  * the cached comparison function; otherwise gotta look it up in
1230  * the catalogs. (That can't lead to infinite recursion, since no
1231  * indexscan initiated by syscache lookup will use cross-data-type
1232  * operators.)
1233  *
1234  * We support the convention that sk_subtype == InvalidOid means
1235  * the opclass input type; this is a hack to simplify life for
1236  * ScanKeyInit().
1237  */
1238  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1239  cur->sk_subtype == InvalidOid)
1240  {
1241  FmgrInfo *procinfo;
1242 
1243  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1244  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1245  cur->sk_flags,
1246  cur->sk_attno,
1248  cur->sk_subtype,
1249  cur->sk_collation,
1250  procinfo,
1251  cur->sk_argument);
1252  }
1253  else
1254  {
1255  RegProcedure cmp_proc;
1256 
1257  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1258  rel->rd_opcintype[i],
1259  cur->sk_subtype,
1260  BTORDER_PROC);
1261  if (!RegProcedureIsValid(cmp_proc))
1262  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1263  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1264  cur->sk_attno, RelationGetRelationName(rel));
1265  ScanKeyEntryInitialize(inskey.scankeys + i,
1266  cur->sk_flags,
1267  cur->sk_attno,
1269  cur->sk_subtype,
1270  cur->sk_collation,
1271  cmp_proc,
1272  cur->sk_argument);
1273  }
1274  }
1275  }
1276 
1277  /*----------
1278  * Examine the selected initial-positioning strategy to determine exactly
1279  * where we need to start the scan, and set flag variables to control the
1280  * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1281  * page _bt_search returns).
1282  *----------
1283  */
1284  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1285  inskey.anynullkeys = false; /* unused */
1286  inskey.scantid = NULL;
1287  inskey.keysz = keysz;
1288  switch (strat_total)
1289  {
1290  case BTLessStrategyNumber:
1291 
1292  inskey.nextkey = false;
1293  inskey.backward = true;
1294  break;
1295 
1297 
1298  inskey.nextkey = true;
1299  inskey.backward = true;
1300  break;
1301 
1302  case BTEqualStrategyNumber:
1303 
1304  /*
1305  * If a backward scan was specified, need to start with last equal
1306  * item not first one.
1307  */
1308  if (ScanDirectionIsBackward(dir))
1309  {
1310  /*
1311  * This is the same as the <= strategy
1312  */
1313  inskey.nextkey = true;
1314  inskey.backward = true;
1315  }
1316  else
1317  {
1318  /*
1319  * This is the same as the >= strategy
1320  */
1321  inskey.nextkey = false;
1322  inskey.backward = false;
1323  }
1324  break;
1325 
1327 
1328  /*
1329  * Find first item >= scankey
1330  */
1331  inskey.nextkey = false;
1332  inskey.backward = false;
1333  break;
1334 
1336 
1337  /*
1338  * Find first item > scankey
1339  */
1340  inskey.nextkey = true;
1341  inskey.backward = false;
1342  break;
1343 
1344  default:
1345  /* can't get here, but keep compiler quiet */
1346  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1347  return false;
1348  }
1349 
1350  /*
1351  * Use the manufactured insertion scan key to descend the tree and
1352  * position ourselves on the target leaf page.
1353  */
1354  Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1355  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1356 
1357  /* don't need to keep the stack around... */
1358  _bt_freestack(stack);
1359 
1360  if (!BufferIsValid(buf))
1361  {
1362  /*
1363  * We only get here if the index is completely empty. Lock relation
1364  * because nothing finer to lock exists. Without a buffer lock, it's
1365  * possible for another transaction to insert data between
1366  * _bt_search() and PredicateLockRelation(). We have to try again
1367  * after taking the relation-level predicate lock, to close a narrow
1368  * window where we wouldn't scan concurrently inserted tuples, but the
1369  * writer wouldn't see our predicate lock.
1370  */
1372  {
1373  PredicateLockRelation(rel, scan->xs_snapshot);
1374  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1375  _bt_freestack(stack);
1376  }
1377 
1378  if (!BufferIsValid(buf))
1379  {
1380  /*
1381  * Mark parallel scan as done, so that all the workers can finish
1382  * their scan.
1383  */
1384  _bt_parallel_done(scan);
1386  return false;
1387  }
1388  }
1389 
1391 
1392  _bt_initialize_more_data(so, dir);
1393 
1394  /* position to the precise item on the page */
1395  offnum = _bt_binsrch(rel, &inskey, buf);
1397  so->currPos.buf = buf;
1398 
1399  /*
1400  * Now load data from the first page of the scan.
1401  *
1402  * If inskey.nextkey = false and inskey.backward = false, offnum is
1403  * positioned at the first non-pivot tuple >= inskey.scankeys.
1404  *
1405  * If inskey.nextkey = false and inskey.backward = true, offnum is
1406  * positioned at the last non-pivot tuple < inskey.scankeys.
1407  *
1408  * If inskey.nextkey = true and inskey.backward = false, offnum is
1409  * positioned at the first non-pivot tuple > inskey.scankeys.
1410  *
1411  * If inskey.nextkey = true and inskey.backward = true, offnum is
1412  * positioned at the last non-pivot tuple <= inskey.scankeys.
1413  *
1414  * It's possible that _bt_binsrch returned an offnum that is out of bounds
1415  * for the page. For example, when inskey is both < the leaf page's high
1416  * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1417  */
1418  if (!_bt_readpage(scan, dir, offnum, true))
1419  {
1420  /*
1421  * There's no actually-matching data on this page. Try to advance to
1422  * the next page. Return false if there's no matching data at all.
1423  */
1424  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1425  if (!_bt_steppage(scan, dir))
1426  return false;
1427  }
1428  else
1429  {
1430  /* We have at least one item to return as scan's first item */
1432  }
1433 
1434 readcomplete:
1435  /* OK, itemIndex says what to return */
1436  currItem = &so->currPos.items[so->currPos.itemIndex];
1437  scan->xs_heaptid = currItem->heapTid;
1438  if (scan->xs_want_itup)
1439  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1440 
1441  return true;
1442 }
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define RegProcedureIsValid(p)
Definition: c.h:766
regproc RegProcedure
Definition: c.h:639
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:863
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:795
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:740
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:722
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
#define BTORDER_PROC
Definition: nbtree.h:707
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1007
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2217
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:337
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2441
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:96
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:174
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:756
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:625
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
ScanKey keyData
Definition: nbtree.h:1037
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
Oid * rd_opcintype
Definition: rel.h:207
Oid * rd_opfamily
Definition: rel.h:206
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), _bt_unlockbuf(), Assert(), BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid(), cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer(), elog(), ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2360 of file nbtsearch.c.

2361 {
2362  Buffer buf;
2363  Page page;
2364  BTPageOpaque opaque;
2365  OffsetNumber offnum;
2366  BlockNumber blkno;
2367  IndexTuple itup;
2368 
2369  /*
2370  * If we are looking for a leaf page, okay to descend from fast root;
2371  * otherwise better descend from true root. (There is no point in being
2372  * smarter about intermediate levels.)
2373  */
2374  if (level == 0)
2375  buf = _bt_getroot(rel, NULL, BT_READ);
2376  else
2377  buf = _bt_gettrueroot(rel);
2378 
2379  if (!BufferIsValid(buf))
2380  return InvalidBuffer;
2381 
2382  page = BufferGetPage(buf);
2383  opaque = BTPageGetOpaque(page);
2384 
2385  for (;;)
2386  {
2387  /*
2388  * If we landed on a deleted page, step right to find a live page
2389  * (there must be one). Also, if we want the rightmost page, step
2390  * right if needed to get to it (this could happen if the page split
2391  * since we obtained a pointer to it).
2392  */
2393  while (P_IGNORE(opaque) ||
2394  (rightmost && !P_RIGHTMOST(opaque)))
2395  {
2396  blkno = opaque->btpo_next;
2397  if (blkno == P_NONE)
2398  elog(ERROR, "fell off the end of index \"%s\"",
2400  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2401  page = BufferGetPage(buf);
2402  opaque = BTPageGetOpaque(page);
2403  }
2404 
2405  /* Done? */
2406  if (opaque->btpo_level == level)
2407  break;
2408  if (opaque->btpo_level < level)
2409  ereport(ERROR,
2410  (errcode(ERRCODE_INDEX_CORRUPTED),
2411  errmsg_internal("btree level %u not found in index \"%s\"",
2412  level, RelationGetRelationName(rel))));
2413 
2414  /* Descend to leftmost or rightmost child page */
2415  if (rightmost)
2416  offnum = PageGetMaxOffsetNumber(page);
2417  else
2418  offnum = P_FIRSTDATAKEY(opaque);
2419 
2420  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2421  blkno = BTreeTupleGetDownLink(itup);
2422 
2423  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2424  page = BufferGetPage(buf);
2425  opaque = BTPageGetOpaque(page);
2426  }
2427 
2428  return buf;
2429 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1004
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:581
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:345
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
BlockNumber btpo_next
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog(), ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inlinestatic

Definition at line 2531 of file nbtsearch.c.

2532 {
2533  /* initialize moreLeft/moreRight appropriately for scan direction */
2534  if (ScanDirectionIsForward(dir))
2535  {
2536  so->currPos.moreLeft = false;
2537  so->currPos.moreRight = true;
2538  }
2539  else
2540  {
2541  so->currPos.moreLeft = true;
2542  so->currPos.moreRight = false;
2543  }
2544  so->numKilled = 0; /* just paranoia */
2545  so->markItemIndex = -1; /* ditto */
2546 }
bool moreRight
Definition: nbtree.h:966
bool moreLeft
Definition: nbtree.h:965

References BTScanOpaqueData::currPos, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

◆ _bt_moveright()

Buffer _bt_moveright ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access 
)

Definition at line 235 of file nbtsearch.c.

242 {
243  Page page;
244  BTPageOpaque opaque;
245  int32 cmpval;
246 
247  Assert(!forupdate || heaprel != NULL);
248 
249  /*
250  * When nextkey = false (normal case): if the scan key that brought us to
251  * this page is > the high key stored on the page, then the page has split
252  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
253  * have some duplicates to the right as well as the left, but that's
254  * something that's only ever dealt with on the leaf level, after
255  * _bt_search has found an initial leaf page.)
256  *
257  * When nextkey = true: move right if the scan key is >= page's high key.
258  * (Note that key.scantid cannot be set in this case.)
259  *
260  * The page could even have split more than once, so scan as far as
261  * needed.
262  *
263  * We also have to move right if we followed a link that brought us to a
264  * dead page.
265  */
266  cmpval = key->nextkey ? 0 : 1;
267 
268  for (;;)
269  {
270  page = BufferGetPage(buf);
271  opaque = BTPageGetOpaque(page);
272 
273  if (P_RIGHTMOST(opaque))
274  break;
275 
276  /*
277  * Finish any incomplete splits we encounter along the way.
278  */
279  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
280  {
282 
283  /* upgrade our lock if necessary */
284  if (access == BT_READ)
285  {
286  _bt_unlockbuf(rel, buf);
287  _bt_lockbuf(rel, buf, BT_WRITE);
288  }
289 
290  if (P_INCOMPLETE_SPLIT(opaque))
291  _bt_finish_split(rel, heaprel, buf, stack);
292  else
293  _bt_relbuf(rel, buf);
294 
295  /* re-acquire the lock in the right mode, and re-check */
296  buf = _bt_getbuf(rel, blkno, access);
297  continue;
298  }
299 
300  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
301  {
302  /* step right one page */
303  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
304  continue;
305  }
306  else
307  break;
308  }
309 
310  if (P_IGNORE(opaque))
311  elog(ERROR, "fell off the end of index \"%s\"",
313 
314  return buf;
315 }
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2242
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:846
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1040
#define P_HIKEY
Definition: nbtree.h:367
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BT_WRITE
Definition: nbtree.h:720
short access
Definition: preproc-type.c:36

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog(), ERROR, sort-test::key, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1459 of file nbtsearch.c.

1460 {
1461  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1462  BTScanPosItem *currItem;
1463 
1464  /*
1465  * Advance to next tuple on current page; or if there's no more, try to
1466  * step to the next page with data.
1467  */
1468  if (ScanDirectionIsForward(dir))
1469  {
1470  if (++so->currPos.itemIndex > so->currPos.lastItem)
1471  {
1472  if (!_bt_steppage(scan, dir))
1473  return false;
1474  }
1475  }
1476  else
1477  {
1478  if (--so->currPos.itemIndex < so->currPos.firstItem)
1479  {
1480  if (!_bt_steppage(scan, dir))
1481  return false;
1482  }
1483  }
1484 
1485  /* OK, itemIndex says what to return */
1486  currItem = &so->currPos.items[so->currPos.itemIndex];
1487  scan->xs_heaptid = currItem->heapTid;
1488  if (scan->xs_want_itup)
1489  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1490 
1491  return true;
1492 }
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
int firstItem
Definition: nbtree.h:981
int lastItem
Definition: nbtree.h:982

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2217 of file nbtsearch.c.

2218 {
2219  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2220 
2221  _bt_initialize_more_data(so, dir);
2222 
2223  if (!_bt_readnextpage(scan, blkno, dir))
2224  return false;
2225 
2226  /* We have at least one item to return as scan's next item */
2228 
2229  return true;
2230 }
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2051

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), BTScanOpaqueData::currPos, and IndexScanDescData::opaque.

Referenced by _bt_first().

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2051 of file nbtsearch.c.

2052 {
2053  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2054  Relation rel;
2055  Page page;
2056  BTPageOpaque opaque;
2057  bool status;
2058 
2059  rel = scan->indexRelation;
2060 
2061  if (ScanDirectionIsForward(dir))
2062  {
2063  for (;;)
2064  {
2065  /*
2066  * if we're at end of scan, give up and mark parallel scan as
2067  * done, so that all the workers can finish their scan
2068  */
2069  if (blkno == P_NONE || !so->currPos.moreRight)
2070  {
2071  _bt_parallel_done(scan);
2073  return false;
2074  }
2075  /* check for interrupts while we're not holding any buffer lock */
2077  /* step right one page */
2078  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2079  page = BufferGetPage(so->currPos.buf);
2080  opaque = BTPageGetOpaque(page);
2081  /* check for deleted page */
2082  if (!P_IGNORE(opaque))
2083  {
2084  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2085  /* see if there are any matches on this page */
2086  /* note that this will clear moreRight if we can stop */
2087  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2088  break;
2089  }
2090  else if (scan->parallel_scan != NULL)
2091  {
2092  /* allow next page be processed by parallel worker */
2093  _bt_parallel_release(scan, opaque->btpo_next);
2094  }
2095 
2096  /* nope, keep going */
2097  if (scan->parallel_scan != NULL)
2098  {
2099  _bt_relbuf(rel, so->currPos.buf);
2100  status = _bt_parallel_seize(scan, &blkno);
2101  if (!status)
2102  {
2104  return false;
2105  }
2106  }
2107  else
2108  {
2109  blkno = opaque->btpo_next;
2110  _bt_relbuf(rel, so->currPos.buf);
2111  }
2112  }
2113  }
2114  else
2115  {
2116  /*
2117  * Should only happen in parallel cases, when some other backend
2118  * advanced the scan.
2119  */
2120  if (so->currPos.currPage != blkno)
2121  {
2123  so->currPos.currPage = blkno;
2124  }
2125 
2126  /*
2127  * Walk left to the next page with data. This is much more complex
2128  * than the walk-right case because of the possibility that the page
2129  * to our left splits while we are in flight to it, plus the
2130  * possibility that the page we were on gets deleted after we leave
2131  * it. See nbtree/README for details.
2132  *
2133  * It might be possible to rearrange this code to have less overhead
2134  * in pinning and locking, but that would require capturing the left
2135  * sibling block number when the page is initially read, and then
2136  * optimistically starting there (rather than pinning the page twice).
2137  * It is not clear that this would be worth the complexity.
2138  */
2139  if (BTScanPosIsPinned(so->currPos))
2140  _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2141  else
2142  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2143 
2144  for (;;)
2145  {
2146  /* Done if we know there are no matching keys to the left */
2147  if (!so->currPos.moreLeft)
2148  {
2149  _bt_relbuf(rel, so->currPos.buf);
2150  _bt_parallel_done(scan);
2152  return false;
2153  }
2154 
2155  /* Step to next physical page */
2156  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2157 
2158  /* if we're physically at end of index, return failure */
2159  if (so->currPos.buf == InvalidBuffer)
2160  {
2161  _bt_parallel_done(scan);
2163  return false;
2164  }
2165 
2166  /*
2167  * Okay, we managed to move left to a non-deleted page. Done if
2168  * it's not half-dead and contains matching tuples. Else loop back
2169  * and do it all again.
2170  */
2171  page = BufferGetPage(so->currPos.buf);
2172  opaque = BTPageGetOpaque(page);
2173  if (!P_IGNORE(opaque))
2174  {
2176  /* see if there are any matches on this page */
2177  /* note that this will clear moreLeft if we can stop */
2178  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2179  break;
2180  }
2181  else if (scan->parallel_scan != NULL)
2182  {
2183  /* allow next page be processed by parallel worker */
2185  }
2186 
2187  /*
2188  * For parallel scans, get the last page scanned as it is quite
2189  * possible that by the time we try to seize the scan, some other
2190  * worker has already advanced the scan to a different page. We
2191  * must continue based on the latest page scanned by any worker.
2192  */
2193  if (scan->parallel_scan != NULL)
2194  {
2195  _bt_relbuf(rel, so->currPos.buf);
2196  status = _bt_parallel_seize(scan, &blkno);
2197  if (!status)
2198  {
2200  return false;
2201  }
2202  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2203  }
2204  }
2205  }
2206 
2207  return true;
2208 }
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:699
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:990
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1001
static Buffer _bt_walk_left(Relation rel, Buffer buf)
Definition: nbtsearch.c:2246
BlockNumber currPage
Definition: nbtree.h:956

References _bt_getbuf(), _bt_lockbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTPageGetOpaque, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, if(), IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum,
bool  firstPage 
)
static

Definition at line 1522 of file nbtsearch.c.

1524 {
1525  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1526  Page page;
1527  BTPageOpaque opaque;
1528  OffsetNumber minoff;
1529  OffsetNumber maxoff;
1530  int itemIndex;
1531  bool continuescan;
1532  int indnatts;
1533  bool continuescanPrechecked;
1534  bool haveFirstMatch = false;
1535 
1536  /*
1537  * We must have the buffer pinned and locked, but the usual macro can't be
1538  * used here; this function is what makes it good for currPos.
1539  */
1541 
1542  page = BufferGetPage(so->currPos.buf);
1543  opaque = BTPageGetOpaque(page);
1544 
1545  /* allow next page be processed by parallel worker */
1546  if (scan->parallel_scan)
1547  {
1548  if (ScanDirectionIsForward(dir))
1549  _bt_parallel_release(scan, opaque->btpo_next);
1550  else
1552  }
1553 
1554  continuescan = true; /* default assumption */
1556  minoff = P_FIRSTDATAKEY(opaque);
1557  maxoff = PageGetMaxOffsetNumber(page);
1558 
1559  /*
1560  * We note the buffer's block number so that we can release the pin later.
1561  * This allows us to re-read the buffer if it is needed again for hinting.
1562  */
1564 
1565  /*
1566  * We save the LSN of the page as we read it, so that we know whether it
1567  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1568  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1569  */
1571 
1572  /*
1573  * we must save the page's right-link while scanning it; this tells us
1574  * where to step right to after we're done with these items. There is no
1575  * corresponding need for the left-link, since splits always go right.
1576  */
1577  so->currPos.nextPage = opaque->btpo_next;
1578 
1579  /* initialize tuple workspace to empty */
1580  so->currPos.nextTupleOffset = 0;
1581 
1582  /*
1583  * Now that the current page has been made consistent, the macro should be
1584  * good.
1585  */
1587 
1588  /*
1589  * Prechecking the value of the continuescan flag for the last item on the
1590  * page (for backwards scan it will be the first item on a page). If we
1591  * observe it to be true, then it should be true for all other items. This
1592  * allows us to do significant optimizations in the _bt_checkkeys()
1593  * function for all the items on the page.
1594  *
1595  * With the forward scan, we do this check for the last item on the page
1596  * instead of the high key. It's relatively likely that the most
1597  * significant column in the high key will be different from the
1598  * corresponding value from the last item on the page. So checking with
1599  * the last item on the page would give a more precise answer.
1600  *
1601  * We skip this for the first page in the scan to evade the possible
1602  * slowdown of the point queries.
1603  */
1604  if (!firstPage && minoff < maxoff)
1605  {
1606  ItemId iid;
1607  IndexTuple itup;
1608 
1609  iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1610  itup = (IndexTuple) PageGetItem(page, iid);
1611 
1612  /*
1613  * Do the precheck. Note that we pass the pointer to the
1614  * 'continuescanPrechecked' to the 'continuescan' argument. That will
1615  * set flag to true if all required keys are satisfied and false
1616  * otherwise.
1617  */
1618  (void) _bt_checkkeys(scan, itup, indnatts, dir,
1619  &continuescanPrechecked, false, false);
1620  }
1621  else
1622  {
1623  continuescanPrechecked = false;
1624  }
1625 
1626  if (ScanDirectionIsForward(dir))
1627  {
1628  /* load items[] in ascending order */
1629  itemIndex = 0;
1630 
1631  offnum = Max(offnum, minoff);
1632 
1633  while (offnum <= maxoff)
1634  {
1635  ItemId iid = PageGetItemId(page, offnum);
1636  IndexTuple itup;
1637  bool passes_quals;
1638 
1639  /*
1640  * If the scan specifies not to return killed tuples, then we
1641  * treat a killed tuple as not passing the qual
1642  */
1643  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1644  {
1645  offnum = OffsetNumberNext(offnum);
1646  continue;
1647  }
1648 
1649  itup = (IndexTuple) PageGetItem(page, iid);
1650  Assert(!BTreeTupleIsPivot(itup));
1651 
1652  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1653  &continuescan,
1654  continuescanPrechecked,
1655  haveFirstMatch);
1656 
1657  /*
1658  * If the result of prechecking required keys was true, then in
1659  * assert-enabled builds we also recheck that the _bt_checkkeys()
1660  * result is the same.
1661  */
1662  Assert((!continuescanPrechecked && haveFirstMatch) ||
1663  passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
1664  &continuescan, false, false));
1665  if (passes_quals)
1666  {
1667  /* tuple passes all scan key conditions */
1668  haveFirstMatch = true;
1669  if (!BTreeTupleIsPosting(itup))
1670  {
1671  /* Remember it */
1672  _bt_saveitem(so, itemIndex, offnum, itup);
1673  itemIndex++;
1674  }
1675  else
1676  {
1677  int tupleOffset;
1678 
1679  /*
1680  * Set up state to return posting list, and remember first
1681  * TID
1682  */
1683  tupleOffset =
1684  _bt_setuppostingitems(so, itemIndex, offnum,
1685  BTreeTupleGetPostingN(itup, 0),
1686  itup);
1687  itemIndex++;
1688  /* Remember additional TIDs */
1689  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1690  {
1691  _bt_savepostingitem(so, itemIndex, offnum,
1692  BTreeTupleGetPostingN(itup, i),
1693  tupleOffset);
1694  itemIndex++;
1695  }
1696  }
1697  }
1698  /* When !continuescan, there can't be any more matches, so stop */
1699  if (!continuescan)
1700  break;
1701 
1702  offnum = OffsetNumberNext(offnum);
1703  }
1704 
1705  /*
1706  * We don't need to visit page to the right when the high key
1707  * indicates that no more matches will be found there.
1708  *
1709  * Checking the high key like this works out more often than you might
1710  * think. Leaf page splits pick a split point between the two most
1711  * dissimilar tuples (this is weighed against the need to evenly share
1712  * free space). Leaf pages with high key attribute values that can
1713  * only appear on non-pivot tuples on the right sibling page are
1714  * common.
1715  */
1716  if (continuescan && !P_RIGHTMOST(opaque))
1717  {
1718  ItemId iid = PageGetItemId(page, P_HIKEY);
1719  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1720  int truncatt;
1721 
1722  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1723  _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false);
1724  }
1725 
1726  if (!continuescan)
1727  so->currPos.moreRight = false;
1728 
1729  Assert(itemIndex <= MaxTIDsPerBTreePage);
1730  so->currPos.firstItem = 0;
1731  so->currPos.lastItem = itemIndex - 1;
1732  so->currPos.itemIndex = 0;
1733  }
1734  else
1735  {
1736  /* load items[] in descending order */
1737  itemIndex = MaxTIDsPerBTreePage;
1738 
1739  offnum = Min(offnum, maxoff);
1740 
1741  while (offnum >= minoff)
1742  {
1743  ItemId iid = PageGetItemId(page, offnum);
1744  IndexTuple itup;
1745  bool tuple_alive;
1746  bool passes_quals;
1747 
1748  /*
1749  * If the scan specifies not to return killed tuples, then we
1750  * treat a killed tuple as not passing the qual. Most of the
1751  * time, it's a win to not bother examining the tuple's index
1752  * keys, but just skip to the next tuple (previous, actually,
1753  * since we're scanning backwards). However, if this is the first
1754  * tuple on the page, we do check the index keys, to prevent
1755  * uselessly advancing to the page to the left. This is similar
1756  * to the high key optimization used by forward scans.
1757  */
1758  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1759  {
1760  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1761  if (offnum > P_FIRSTDATAKEY(opaque))
1762  {
1763  offnum = OffsetNumberPrev(offnum);
1764  continue;
1765  }
1766 
1767  tuple_alive = false;
1768  }
1769  else
1770  tuple_alive = true;
1771 
1772  itup = (IndexTuple) PageGetItem(page, iid);
1773  Assert(!BTreeTupleIsPivot(itup));
1774 
1775  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1776  &continuescan,
1777  continuescanPrechecked,
1778  haveFirstMatch);
1779 
1780  /*
1781  * If the result of prechecking required keys was true, then in
1782  * assert-enabled builds we also recheck that the _bt_checkkeys()
1783  * result is the same.
1784  */
1785  Assert((!continuescanPrechecked && !haveFirstMatch) ||
1786  passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
1787  &continuescan, false, false));
1788  if (passes_quals && tuple_alive)
1789  {
1790  /* tuple passes all scan key conditions */
1791  haveFirstMatch = true;
1792  if (!BTreeTupleIsPosting(itup))
1793  {
1794  /* Remember it */
1795  itemIndex--;
1796  _bt_saveitem(so, itemIndex, offnum, itup);
1797  }
1798  else
1799  {
1800  int tupleOffset;
1801 
1802  /*
1803  * Set up state to return posting list, and remember first
1804  * TID.
1805  *
1806  * Note that we deliberately save/return items from
1807  * posting lists in ascending heap TID order for backwards
1808  * scans. This allows _bt_killitems() to make a
1809  * consistent assumption about the order of items
1810  * associated with the same posting list tuple.
1811  */
1812  itemIndex--;
1813  tupleOffset =
1814  _bt_setuppostingitems(so, itemIndex, offnum,
1815  BTreeTupleGetPostingN(itup, 0),
1816  itup);
1817  /* Remember additional TIDs */
1818  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1819  {
1820  itemIndex--;
1821  _bt_savepostingitem(so, itemIndex, offnum,
1822  BTreeTupleGetPostingN(itup, i),
1823  tupleOffset);
1824  }
1825  }
1826  }
1827  if (!continuescan)
1828  {
1829  /* there can't be any more matches, so stop */
1830  so->currPos.moreLeft = false;
1831  break;
1832  }
1833 
1834  offnum = OffsetNumberPrev(offnum);
1835  }
1836 
1837  Assert(itemIndex >= 0);
1838  so->currPos.firstItem = itemIndex;
1841  }
1842 
1843  return (so->currPos.firstItem <= so->currPos.lastItem);
1844 }
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3639
#define Max(x, y)
Definition: c.h:987
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1848
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1878
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:1916
bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan, bool continuescanPrechecked, bool haveFirstMatch)
Definition: nbtutils.c:1373
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:516
int nextTupleOffset
Definition: nbtree.h:972
BlockNumber nextPage
Definition: nbtree.h:957
XLogRecPtr lsn
Definition: nbtree.h:955
bool ignore_killed_tuples
Definition: relscan.h:129

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_setuppostingitems(), Assert(), BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage(), BufferIsValid(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1848 of file nbtsearch.c.

1850 {
1851  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1852 
1853  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1854 
1855  currItem->heapTid = itup->t_tid;
1856  currItem->indexOffset = offnum;
1857  if (so->currTuples)
1858  {
1859  Size itupsz = IndexTupleSize(itup);
1860 
1861  currItem->tupleOffset = so->currPos.nextTupleOffset;
1862  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1863  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1864  }
1865 }
#define MAXALIGN(LEN)
Definition: c.h:800
size_t Size
Definition: c.h:594
#define IndexTupleSize(itup)
Definition: itup.h:70
ItemPointerData heapTid
Definition: nbtree.h:946
LocationIndex tupleOffset
Definition: nbtree.h:948
OffsetNumber indexOffset
Definition: nbtree.h:947
ItemPointerData t_tid
Definition: itup.h:37

References Assert(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_savepostingitem()

static void _bt_savepostingitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
int  tupleOffset 
)
inlinestatic

Definition at line 1916 of file nbtsearch.c.

1918 {
1919  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1920 
1921  currItem->heapTid = *heapTid;
1922  currItem->indexOffset = offnum;
1923 
1924  /*
1925  * Have index-only scans return the same base IndexTuple for every TID
1926  * that originates from the same posting list
1927  */
1928  if (so->currTuples)
1929  currItem->tupleOffset = tupleOffset;
1930 }

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 96 of file nbtsearch.c.

98 {
99  BTStack stack_in = NULL;
100  int page_access = BT_READ;
101 
102  /* heaprel must be set whenever _bt_allocbuf is reachable */
103  Assert(access == BT_READ || access == BT_WRITE);
104  Assert(access == BT_READ || heaprel != NULL);
105 
106  /* Get the root page to start with */
107  *bufP = _bt_getroot(rel, heaprel, access);
108 
109  /* If index is empty and access = BT_READ, no root page is created. */
110  if (!BufferIsValid(*bufP))
111  return (BTStack) NULL;
112 
113  /* Loop iterates once per level descended in the tree */
114  for (;;)
115  {
116  Page page;
117  BTPageOpaque opaque;
118  OffsetNumber offnum;
119  ItemId itemid;
120  IndexTuple itup;
121  BlockNumber child;
122  BTStack new_stack;
123 
124  /*
125  * Race -- the page we just grabbed may have split since we read its
126  * downlink in its parent page (or the metapage). If it has, we may
127  * need to move right to its new sibling. Do that.
128  *
129  * In write-mode, allow _bt_moveright to finish any incomplete splits
130  * along the way. Strictly speaking, we'd only need to finish an
131  * incomplete split on the leaf page we're about to insert to, not on
132  * any of the upper levels (internal pages with incomplete splits are
133  * also taken care of in _bt_getstackbuf). But this is a good
134  * opportunity to finish splits of internal pages too.
135  */
136  *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
137  stack_in, page_access);
138 
139  /* if this is a leaf page, we're done */
140  page = BufferGetPage(*bufP);
141  opaque = BTPageGetOpaque(page);
142  if (P_ISLEAF(opaque))
143  break;
144 
145  /*
146  * Find the appropriate pivot tuple on this page. Its downlink points
147  * to the child page that we're about to descend to.
148  */
149  offnum = _bt_binsrch(rel, key, *bufP);
150  itemid = PageGetItemId(page, offnum);
151  itup = (IndexTuple) PageGetItem(page, itemid);
152  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
153  child = BTreeTupleGetDownLink(itup);
154 
155  /*
156  * We need to save the location of the pivot tuple we chose in a new
157  * stack entry for this page/level. If caller ends up splitting a
158  * page one level down, it usually ends up inserting a new pivot
159  * tuple/downlink immediately after the location recorded here.
160  */
161  new_stack = (BTStack) palloc(sizeof(BTStackData));
162  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
163  new_stack->bts_offset = offnum;
164  new_stack->bts_parent = stack_in;
165 
166  /*
167  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
168  * we're on the level 1 and asked to lock leaf page in write mode,
169  * then lock next page in write mode, because it must be a leaf.
170  */
171  if (opaque->btpo_level == 1 && access == BT_WRITE)
172  page_access = BT_WRITE;
173 
174  /* drop the read lock on the page, then acquire one on its child */
175  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
176 
177  /* okay, all set to move down a level */
178  stack_in = new_stack;
179  }
180 
181  /*
182  * If we're asked to lock leaf in write mode, but didn't manage to, then
183  * relock. This should only happen when the root page is a leaf page (and
184  * the only page in the index other than the metapage).
185  */
186  if (access == BT_WRITE && page_access == BT_READ)
187  {
188  /* trade in our read lock for a write lock */
189  _bt_unlockbuf(rel, *bufP);
190  _bt_lockbuf(rel, *bufP, BT_WRITE);
191 
192  /*
193  * Race -- the leaf page may have split after we dropped the read lock
194  * but before we acquired a write lock. If it has, we may need to
195  * move right to its new sibling. Do that.
196  */
197  *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
198  }
199 
200  return stack_in;
201 }
void * palloc(Size size)
Definition: mcxt.c:1201
BTStackData * BTStack
Definition: nbtree.h:739
Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:235
BlockNumber bts_blkno
Definition: nbtree.h:734
struct BTStackData * bts_parent
Definition: nbtree.h:736
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 1878 of file nbtsearch.c.

1880 {
1881  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1882 
1883  Assert(BTreeTupleIsPosting(itup));
1884 
1885  currItem->heapTid = *heapTid;
1886  currItem->indexOffset = offnum;
1887  if (so->currTuples)
1888  {
1889  /* Save base IndexTuple (truncate posting list) */
1890  IndexTuple base;
1891  Size itupsz = BTreeTupleGetPostingOffset(itup);
1892 
1893  itupsz = MAXALIGN(itupsz);
1894  currItem->tupleOffset = so->currPos.nextTupleOffset;
1895  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1896  memcpy(base, itup, itupsz);
1897  /* Defensively reduce work area index tuple header size */
1898  base->t_info &= ~INDEX_SIZE_MASK;
1899  base->t_info |= itupsz;
1900  so->currPos.nextTupleOffset += itupsz;
1901 
1902  return currItem->tupleOffset;
1903  }
1904 
1905  return 0;
1906 }
#define INDEX_SIZE_MASK
Definition: itup.h:65
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, INDEX_SIZE_MASK, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1944 of file nbtsearch.c.

1945 {
1946  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1948  bool status;
1949 
1951 
1952  /* Before leaving current page, deal with any killed items */
1953  if (so->numKilled > 0)
1954  _bt_killitems(scan);
1955 
1956  /*
1957  * Before we modify currPos, make a copy of the page data if there was a
1958  * mark position that needs it.
1959  */
1960  if (so->markItemIndex >= 0)
1961  {
1962  /* bump pin on current buffer for assignment to mark buffer */
1963  if (BTScanPosIsPinned(so->currPos))
1965  memcpy(&so->markPos, &so->currPos,
1966  offsetof(BTScanPosData, items[1]) +
1967  so->currPos.lastItem * sizeof(BTScanPosItem));
1968  if (so->markTuples)
1969  memcpy(so->markTuples, so->currTuples,
1970  so->currPos.nextTupleOffset);
1971  so->markPos.itemIndex = so->markItemIndex;
1972  so->markItemIndex = -1;
1973  }
1974 
1975  if (ScanDirectionIsForward(dir))
1976  {
1977  /* Walk right to the next page with data */
1978  if (scan->parallel_scan != NULL)
1979  {
1980  /*
1981  * Seize the scan to get the next block number; if the scan has
1982  * ended already, bail out.
1983  */
1984  status = _bt_parallel_seize(scan, &blkno);
1985  if (!status)
1986  {
1987  /* release the previous buffer, if pinned */
1990  return false;
1991  }
1992  }
1993  else
1994  {
1995  /* Not parallel, so use the previously-saved nextPage link. */
1996  blkno = so->currPos.nextPage;
1997  }
1998 
1999  /* Remember we left a page with data */
2000  so->currPos.moreLeft = true;
2001 
2002  /* release the previous buffer, if pinned */
2004  }
2005  else
2006  {
2007  /* Remember we left a page with data */
2008  so->currPos.moreRight = true;
2009 
2010  if (scan->parallel_scan != NULL)
2011  {
2012  /*
2013  * Seize the scan to get the current block number; if the scan has
2014  * ended already, bail out.
2015  */
2016  status = _bt_parallel_seize(scan, &blkno);
2018  if (!status)
2019  {
2021  return false;
2022  }
2023  }
2024  else
2025  {
2026  /* Not parallel, so just use our own notion of the current page */
2027  blkno = so->currPos.currPage;
2028  }
2029  }
2030 
2031  if (!_bt_readnextpage(scan, blkno, dir))
2032  return false;
2033 
2034  /* We have at least one item to return as scan's next item */
2036 
2037  return true;
2038 }
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4593
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1777
char * markTuples
Definition: nbtree.h:1060
BTScanPosData markPos
Definition: nbtree.h:1073

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf 
)
static

Definition at line 2246 of file nbtsearch.c.

2247 {
2248  Page page;
2249  BTPageOpaque opaque;
2250 
2251  page = BufferGetPage(buf);
2252  opaque = BTPageGetOpaque(page);
2253 
2254  for (;;)
2255  {
2256  BlockNumber obknum;
2257  BlockNumber lblkno;
2258  BlockNumber blkno;
2259  int tries;
2260 
2261  /* if we're at end of tree, release buf and return failure */
2262  if (P_LEFTMOST(opaque))
2263  {
2264  _bt_relbuf(rel, buf);
2265  break;
2266  }
2267  /* remember original page we are stepping left from */
2268  obknum = BufferGetBlockNumber(buf);
2269  /* step left */
2270  blkno = lblkno = opaque->btpo_prev;
2271  _bt_relbuf(rel, buf);
2272  /* check for interrupts while we're not holding any buffer lock */
2274  buf = _bt_getbuf(rel, blkno, BT_READ);
2275  page = BufferGetPage(buf);
2276  opaque = BTPageGetOpaque(page);
2277 
2278  /*
2279  * If this isn't the page we want, walk right till we find what we
2280  * want --- but go no more than four hops (an arbitrary limit). If we
2281  * don't find the correct page by then, the most likely bet is that
2282  * the original page got deleted and isn't in the sibling chain at all
2283  * anymore, not that its left sibling got split more than four times.
2284  *
2285  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2286  * because half-dead pages are still in the sibling chain.
2287  */
2288  tries = 0;
2289  for (;;)
2290  {
2291  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2292  {
2293  /* Found desired page, return it */
2294  return buf;
2295  }
2296  if (P_RIGHTMOST(opaque) || ++tries > 4)
2297  break;
2298  blkno = opaque->btpo_next;
2299  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2300  page = BufferGetPage(buf);
2301  opaque = BTPageGetOpaque(page);
2302  }
2303 
2304  /* Return to the original page to see what's up */
2305  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2306  page = BufferGetPage(buf);
2307  opaque = BTPageGetOpaque(page);
2308  if (P_ISDELETED(opaque))
2309  {
2310  /*
2311  * It was deleted. Move right to first nondeleted page (there
2312  * must be one); that is the page that has acquired the deleted
2313  * one's keyspace, so stepping left from it will take us where we
2314  * want to be.
2315  */
2316  for (;;)
2317  {
2318  if (P_RIGHTMOST(opaque))
2319  elog(ERROR, "fell off the end of index \"%s\"",
2321  blkno = opaque->btpo_next;
2322  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2323  page = BufferGetPage(buf);
2324  opaque = BTPageGetOpaque(page);
2325  if (!P_ISDELETED(opaque))
2326  break;
2327  }
2328 
2329  /*
2330  * Now return to top of loop, resetting obknum to point to this
2331  * nondeleted page, and try again.
2332  */
2333  }
2334  else
2335  {
2336  /*
2337  * It wasn't deleted; the explanation had better be that the page
2338  * to the left got split or deleted. Without this check, we'd go
2339  * into an infinite loop if there's anything wrong.
2340  */
2341  if (opaque->btpo_prev == lblkno)
2342  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2343  obknum, RelationGetRelationName(rel));
2344  /* Okay to try again with new lblkno value */
2345  }
2346  }
2347 
2348  return InvalidBuffer;
2349 }
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BlockNumber btpo_prev
Definition: nbtree.h:64

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, elog(), ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().