PostgreSQL Source Code  git master
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 338 of file nbtsearch.c.

341 {
342  Page page;
343  BTPageOpaque opaque;
344  OffsetNumber low,
345  high;
346  int32 result,
347  cmpval;
348 
349  page = BufferGetPage(buf);
350  opaque = BTPageGetOpaque(page);
351 
352  /* Requesting nextkey semantics while using scantid seems nonsensical */
353  Assert(!key->nextkey || key->scantid == NULL);
354  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
355  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
356 
357  low = P_FIRSTDATAKEY(opaque);
358  high = PageGetMaxOffsetNumber(page);
359 
360  /*
361  * If there are no keys on the page, return the first available slot. Note
362  * this covers two cases: the page is really empty (no keys), or it
363  * contains only a high key. The latter case is possible after vacuuming.
364  * This can never happen on an internal page, however, since they are
365  * never empty (an internal page must have children).
366  */
367  if (unlikely(high < low))
368  return low;
369 
370  /*
371  * Binary search to find the first key on the page >= scan key, or first
372  * key > scankey when nextkey is true.
373  *
374  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
375  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
376  *
377  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
378  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
379  *
380  * We can fall out when high == low.
381  */
382  high++; /* establish the loop invariant for high */
383 
384  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
385 
386  while (high > low)
387  {
388  OffsetNumber mid = low + ((high - low) / 2);
389 
390  /* We have low <= mid < high, so mid points at a real slot */
391 
392  result = _bt_compare(rel, key, page, mid);
393 
394  if (result >= cmpval)
395  low = mid + 1;
396  else
397  high = mid;
398  }
399 
400  /*
401  * At this point we have high == low, but be careful: they could point
402  * past the last slot on the page.
403  *
404  * On a leaf page, we always return the first key >= scan key (resp. >
405  * scan key), which could be the last slot + 1.
406  */
407  if (P_ISLEAF(opaque))
408  return low;
409 
410  /*
411  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
412  * There must be one if _bt_compare() is playing by the rules.
413  */
414  Assert(low > P_FIRSTDATAKEY(opaque));
415 
416  return OffsetNumberPrev(low);
417 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
Pointer Page
Definition: bufpage.h:78
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
signed int int32
Definition: c.h:483
#define unlikely(x)
Definition: c.h:300
Assert(fmt[strlen(fmt) - 1] !='\n')
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:656
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:67

References _bt_compare(), Assert(), BTPageGetOpaque, buf, BufferGetPage(), sort-test::key, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 442 of file nbtsearch.c.

443 {
444  BTScanInsert key = insertstate->itup_key;
445  Page page;
446  BTPageOpaque opaque;
447  OffsetNumber low,
448  high,
449  stricthigh;
450  int32 result,
451  cmpval;
452 
453  page = BufferGetPage(insertstate->buf);
454  opaque = BTPageGetOpaque(page);
455 
456  Assert(P_ISLEAF(opaque));
457  Assert(!key->nextkey);
458  Assert(insertstate->postingoff == 0);
459 
460  if (!insertstate->bounds_valid)
461  {
462  /* Start new binary search */
463  low = P_FIRSTDATAKEY(opaque);
464  high = PageGetMaxOffsetNumber(page);
465  }
466  else
467  {
468  /* Restore result of previous binary search against same page */
469  low = insertstate->low;
470  high = insertstate->stricthigh;
471  }
472 
473  /* If there are no keys on the page, return the first available slot */
474  if (unlikely(high < low))
475  {
476  /* Caller can't reuse bounds */
477  insertstate->low = InvalidOffsetNumber;
478  insertstate->stricthigh = InvalidOffsetNumber;
479  insertstate->bounds_valid = false;
480  return low;
481  }
482 
483  /*
484  * Binary search to find the first key on the page >= scan key. (nextkey
485  * is always false when inserting).
486  *
487  * The loop invariant is: all slots before 'low' are < scan key, all slots
488  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
489  * maintained to save additional search effort for caller.
490  *
491  * We can fall out when high == low.
492  */
493  if (!insertstate->bounds_valid)
494  high++; /* establish the loop invariant for high */
495  stricthigh = high; /* high initially strictly higher */
496 
497  cmpval = 1; /* !nextkey comparison value */
498 
499  while (high > low)
500  {
501  OffsetNumber mid = low + ((high - low) / 2);
502 
503  /* We have low <= mid < high, so mid points at a real slot */
504 
505  result = _bt_compare(rel, key, page, mid);
506 
507  if (result >= cmpval)
508  low = mid + 1;
509  else
510  {
511  high = mid;
512  if (result != 0)
513  stricthigh = high;
514  }
515 
516  /*
517  * If tuple at offset located by binary search is a posting list whose
518  * TID range overlaps with caller's scantid, perform posting list
519  * binary search to set postingoff for caller. Caller must split the
520  * posting list when postingoff is set. This should happen
521  * infrequently.
522  */
523  if (unlikely(result == 0 && key->scantid != NULL))
524  {
525  /*
526  * postingoff should never be set more than once per leaf page
527  * binary search. That would mean that there are duplicate table
528  * TIDs in the index, which is never okay. Check for that here.
529  */
530  if (insertstate->postingoff != 0)
531  ereport(ERROR,
532  (errcode(ERRCODE_INDEX_CORRUPTED),
533  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
534  ItemPointerGetBlockNumber(key->scantid),
536  low, stricthigh,
537  BufferGetBlockNumber(insertstate->buf),
538  RelationGetRelationName(rel))));
539 
540  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
541  }
542  }
543 
544  /*
545  * On a leaf page, a binary search always returns the first key >= scan
546  * key (at least in !nextkey case), which could be the last slot + 1. This
547  * is also the lower bound of cached search.
548  *
549  * stricthigh may also be the last slot + 1, which prevents caller from
550  * using bounds directly, but is still useful to us if we're called a
551  * second time with cached bounds (cached low will be < stricthigh when
552  * that happens).
553  */
554  insertstate->low = low;
555  insertstate->stricthigh = stricthigh;
556  insertstate->bounds_valid = true;
557 
558  return low;
559 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3290
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errcode(int sqlerrcode)
Definition: elog.c:858
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:570
#define InvalidOffsetNumber
Definition: off.h:26
#define RelationGetRelationName(relation)
Definition: rel.h:538
OffsetNumber stricthigh
Definition: nbtree.h:830
bool bounds_valid
Definition: nbtree.h:828
OffsetNumber low
Definition: nbtree.h:829
BTScanInsert itup_key
Definition: nbtree.h:818

References _bt_binsrch_posting(), _bt_compare(), Assert(), BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 570 of file nbtsearch.c.

571 {
572  IndexTuple itup;
573  ItemId itemid;
574  int low,
575  high,
576  mid,
577  res;
578 
579  /*
580  * If this isn't a posting tuple, then the index must be corrupt (if it is
581  * an ordinary non-pivot tuple then there must be an existing tuple with a
582  * heap TID that equals inserter's new heap TID/scantid). Defensively
583  * check that tuple is a posting list tuple whose posting list range
584  * includes caller's scantid.
585  *
586  * (This is also needed because contrib/amcheck's rootdescend option needs
587  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
588  */
589  itemid = PageGetItemId(page, offnum);
590  itup = (IndexTuple) PageGetItem(page, itemid);
591  if (!BTreeTupleIsPosting(itup))
592  return 0;
593 
594  Assert(key->heapkeyspace && key->allequalimage);
595 
596  /*
597  * In the event that posting list tuple has LP_DEAD bit set, indicate this
598  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
599  * second call to _bt_binsrch_insert() can take place when its caller has
600  * removed the dead item.
601  */
602  if (ItemIdIsDead(itemid))
603  return -1;
604 
605  /* "high" is past end of posting list for loop invariant */
606  low = 0;
607  high = BTreeTupleGetNPosting(itup);
608  Assert(high >= 2);
609 
610  while (high > low)
611  {
612  mid = low + ((high - low) / 2);
613  res = ItemPointerCompare(key->scantid,
614  BTreeTupleGetPostingN(itup, mid));
615 
616  if (res > 0)
617  low = mid + 1;
618  else if (res < 0)
619  high = mid;
620  else
621  return mid;
622  }
623 
624  /* Exact match not found */
625  return low;
626 }
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), sort-test::key, PageGetItem(), PageGetItemId(), and res.

Referenced by _bt_binsrch_insert().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 656 of file nbtsearch.c.

660 {
661  TupleDesc itupdesc = RelationGetDescr(rel);
662  BTPageOpaque opaque = BTPageGetOpaque(page);
663  IndexTuple itup;
664  ItemPointer heapTid;
665  ScanKey scankey;
666  int ncmpkey;
667  int ntupatts;
668  int32 result;
669 
670  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
672  Assert(key->heapkeyspace || key->scantid == NULL);
673 
674  /*
675  * Force result ">" if target item is first data item on an internal page
676  * --- see NOTE above.
677  */
678  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
679  return 1;
680 
681  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
682  ntupatts = BTreeTupleGetNAtts(itup, rel);
683 
684  /*
685  * The scan key is set up with the attribute number associated with each
686  * term in the key. It is important that, if the index is multi-key, the
687  * scan contain the first k key attributes, and that they be in order. If
688  * you think about how multi-key ordering works, you'll understand why
689  * this is.
690  *
691  * We don't test for violation of this condition here, however. The
692  * initial setup for the index scan had better have gotten it right (see
693  * _bt_first).
694  */
695 
696  ncmpkey = Min(ntupatts, key->keysz);
697  Assert(key->heapkeyspace || ncmpkey == key->keysz);
698  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
699  scankey = key->scankeys;
700  for (int i = 1; i <= ncmpkey; i++)
701  {
702  Datum datum;
703  bool isNull;
704 
705  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
706 
707  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
708  {
709  if (isNull)
710  result = 0; /* NULL "=" NULL */
711  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
712  result = -1; /* NULL "<" NOT_NULL */
713  else
714  result = 1; /* NULL ">" NOT_NULL */
715  }
716  else if (isNull) /* key is NOT_NULL and item is NULL */
717  {
718  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
719  result = 1; /* NOT_NULL ">" NULL */
720  else
721  result = -1; /* NOT_NULL "<" NULL */
722  }
723  else
724  {
725  /*
726  * The sk_func needs to be passed the index value as left arg and
727  * the sk_argument as right arg (they might be of different
728  * types). Since it is convenient for callers to think of
729  * _bt_compare as comparing the scankey to the index item, we have
730  * to flip the sign of the comparison result. (Unless it's a DESC
731  * column, in which case we *don't* flip the sign.)
732  */
733  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
734  scankey->sk_collation,
735  datum,
736  scankey->sk_argument));
737 
738  if (!(scankey->sk_flags & SK_BT_DESC))
739  INVERT_COMPARE_RESULT(result);
740  }
741 
742  /* if the keys are unequal, return the difference */
743  if (result != 0)
744  return result;
745 
746  scankey++;
747  }
748 
749  /*
750  * All non-truncated attributes (other than heap TID) were found to be
751  * equal. Treat truncated attributes as minus infinity when scankey has a
752  * key attribute value that would otherwise be compared directly.
753  *
754  * Note: it doesn't matter if ntupatts includes non-key attributes;
755  * scankey won't, so explicitly excluding non-key attributes isn't
756  * necessary.
757  */
758  if (key->keysz > ntupatts)
759  return 1;
760 
761  /*
762  * Use the heap TID attribute and scantid to try to break the tie. The
763  * rules are the same as any other key attribute -- only the
764  * representation differs.
765  */
766  heapTid = BTreeTupleGetHeapTID(itup);
767  if (key->scantid == NULL)
768  {
769  /*
770  * Most searches have a scankey that is considered greater than a
771  * truncated pivot tuple if and when the scankey has equal values for
772  * attributes up to and including the least significant untruncated
773  * attribute in tuple.
774  *
775  * For example, if an index has the minimum two attributes (single
776  * user key attribute, plus heap TID attribute), and a page's high key
777  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
778  * will not descend to the page to the left. The search will descend
779  * right instead. The truncated attribute in pivot tuple means that
780  * all non-pivot tuples on the page to the left are strictly < 'foo',
781  * so it isn't necessary to descend left. In other words, search
782  * doesn't have to descend left because it isn't interested in a match
783  * that has a heap TID value of -inf.
784  *
785  * However, some searches (pivotsearch searches) actually require that
786  * we descend left when this happens. -inf is treated as a possible
787  * match for omitted scankey attribute(s). This is needed by page
788  * deletion, which must re-find leaf pages that are targets for
789  * deletion using their high keys.
790  *
791  * Note: the heap TID part of the test ensures that scankey is being
792  * compared to a pivot tuple with one or more truncated key
793  * attributes.
794  *
795  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
796  * left here, since they have no heap TID attribute (and cannot have
797  * any -inf key values in any case, since truncation can only remove
798  * non-key attributes). !heapkeyspace searches must always be
799  * prepared to deal with matches on both sides of the pivot once the
800  * leaf level is reached.
801  */
802  if (key->heapkeyspace && !key->pivotsearch &&
803  key->keysz == ntupatts && heapTid == NULL)
804  return 1;
805 
806  /* All provided scankey arguments found to be equal */
807  return 0;
808  }
809 
810  /*
811  * Treat truncated heap TID as minus infinity, since scankey has a key
812  * attribute value (scantid) that would otherwise be compared directly
813  */
815  if (heapTid == NULL)
816  return 1;
817 
818  /*
819  * Scankey must be treated as equal to a posting list tuple if its scantid
820  * value falls within the range of the posting list. In all other cases
821  * there can only be a single heap TID value, which is compared directly
822  * with scantid.
823  */
825  result = ItemPointerCompare(key->scantid, heapTid);
826  if (result <= 0 || !BTreeTupleIsPosting(itup))
827  return result;
828  else
829  {
830  result = ItemPointerCompare(key->scantid,
832  if (result > 0)
833  return 1;
834  }
835 
836  return 0;
837 }
#define Min(x, y)
Definition: c.h:993
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1119
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1132
int i
Definition: isn.c:73
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1090
#define SK_BT_DESC
Definition: nbtree.h:1089
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2471
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define RelationGetDescr(relation)
Definition: rel.h:530
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:523
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 61 of file nbtsearch.c.

62 {
63  _bt_unlockbuf(scan->indexRelation, sp->buf);
64 
65  if (IsMVCCSnapshot(scan->xs_snapshot) &&
67  !scan->xs_want_itup)
68  {
69  ReleaseBuffer(sp->buf);
70  sp->buf = InvalidBuffer;
71  }
72 }
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4480
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define RelationNeedsWAL(relation)
Definition: rel.h:629
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
Buffer buf
Definition: nbtree.h:958
Relation indexRelation
Definition: relscan.h:118
struct SnapshotData * xs_snapshot
Definition: relscan.h:119

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2392 of file nbtsearch.c.

2393 {
2394  Relation rel = scan->indexRelation;
2395  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2396  Buffer buf;
2397  Page page;
2398  BTPageOpaque opaque;
2399  OffsetNumber start;
2400  BTScanPosItem *currItem;
2401 
2402  /*
2403  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2404  * version of _bt_search(). We don't maintain a stack since we know we
2405  * won't need it.
2406  */
2408 
2409  if (!BufferIsValid(buf))
2410  {
2411  /*
2412  * Empty index. Lock the whole relation, as nothing finer to lock
2413  * exists.
2414  */
2415  PredicateLockRelation(rel, scan->xs_snapshot);
2417  return false;
2418  }
2419 
2421  page = BufferGetPage(buf);
2422  opaque = BTPageGetOpaque(page);
2423  Assert(P_ISLEAF(opaque));
2424 
2425  if (ScanDirectionIsForward(dir))
2426  {
2427  /* There could be dead pages to the left, so not this: */
2428  /* Assert(P_LEFTMOST(opaque)); */
2429 
2430  start = P_FIRSTDATAKEY(opaque);
2431  }
2432  else if (ScanDirectionIsBackward(dir))
2433  {
2434  Assert(P_RIGHTMOST(opaque));
2435 
2436  start = PageGetMaxOffsetNumber(page);
2437  }
2438  else
2439  {
2440  elog(ERROR, "invalid scan direction: %d", (int) dir);
2441  start = 0; /* keep compiler quiet */
2442  }
2443 
2444  /* remember which buffer we have pinned */
2445  so->currPos.buf = buf;
2446 
2447  _bt_initialize_more_data(so, dir);
2448 
2449  /*
2450  * Now load data from the first page of the scan.
2451  */
2452  if (!_bt_readpage(scan, dir, start))
2453  {
2454  /*
2455  * There's no actually-matching data on this page. Try to advance to
2456  * the next page. Return false if there's no matching data at all.
2457  */
2458  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2459  if (!_bt_steppage(scan, dir))
2460  return false;
2461  }
2462  else
2463  {
2464  /* Drop the lock, and maybe the pin, on the current page */
2466  }
2467 
2468  /* OK, itemIndex says what to return */
2469  currItem = &so->currPos.items[so->currPos.itemIndex];
2470  scan->xs_heaptid = currItem->heapTid;
2471  if (scan->xs_want_itup)
2472  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2473 
2474  return true;
2475 }
int Buffer
Definition: buf.h:23
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1018
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1079
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2311
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1532
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:61
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1884
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2482
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2533
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2510
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
BTScanPosData currPos
Definition: nbtree.h:1075
char * currTuples
Definition: nbtree.h:1062
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:990
int itemIndex
Definition: nbtree.h:988
IndexTuple xs_itup
Definition: relscan.h:142
ItemPointerData xs_heaptid
Definition: relscan.h:147

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert(), BTPageGetOpaque, BTScanPosInvalidate, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog(), ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 860 of file nbtsearch.c.

861 {
862  Relation rel = scan->indexRelation;
863  BTScanOpaque so = (BTScanOpaque) scan->opaque;
864  Buffer buf;
865  BTStack stack;
866  OffsetNumber offnum;
867  StrategyNumber strat;
868  bool nextkey;
869  bool goback;
870  BTScanInsertData inskey;
871  ScanKey startKeys[INDEX_MAX_KEYS];
872  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
873  int keysCount = 0;
874  int i;
875  bool status;
876  StrategyNumber strat_total;
877  BTScanPosItem *currItem;
878  BlockNumber blkno;
879 
881 
883 
884  /*
885  * Examine the scan keys and eliminate any redundant keys; also mark the
886  * keys that must be matched to continue the scan.
887  */
888  _bt_preprocess_keys(scan);
889 
890  /*
891  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
892  * never be satisfied (eg, x == 1 AND x > 2).
893  */
894  if (!so->qual_ok)
895  {
896  /* Notify any other workers that we're done with this scan key. */
897  _bt_parallel_done(scan);
898  return false;
899  }
900 
901  /*
902  * For parallel scans, get the starting page from shared state. If the
903  * scan has not started, proceed to find out first leaf page in the usual
904  * way while keeping other participating processes waiting. If the scan
905  * has already begun, use the page number from the shared structure.
906  */
907  if (scan->parallel_scan != NULL)
908  {
909  status = _bt_parallel_seize(scan, &blkno);
910  if (!status)
911  return false;
912  else if (blkno == P_NONE)
913  {
914  _bt_parallel_done(scan);
915  return false;
916  }
917  else if (blkno != InvalidBlockNumber)
918  {
919  if (!_bt_parallel_readpage(scan, blkno, dir))
920  return false;
921  goto readcomplete;
922  }
923  }
924 
925  /*----------
926  * Examine the scan keys to discover where we need to start the scan.
927  *
928  * We want to identify the keys that can be used as starting boundaries;
929  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
930  * a backwards scan. We can use keys for multiple attributes so long as
931  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
932  * a > or < boundary or find an attribute with no boundary (which can be
933  * thought of as the same as "> -infinity"), we can't use keys for any
934  * attributes to its right, because it would break our simplistic notion
935  * of what initial positioning strategy to use.
936  *
937  * When the scan keys include cross-type operators, _bt_preprocess_keys
938  * may not be able to eliminate redundant keys; in such cases we will
939  * arbitrarily pick a usable one for each attribute. This is correct
940  * but possibly not optimal behavior. (For example, with keys like
941  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
942  * x=5 would be more efficient.) Since the situation only arises given
943  * a poorly-worded query plus an incomplete opfamily, live with it.
944  *
945  * When both equality and inequality keys appear for a single attribute
946  * (again, only possible when cross-type operators appear), we *must*
947  * select one of the equality keys for the starting point, because
948  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
949  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
950  * start at x=4, we will fail and stop before reaching x=10. If multiple
951  * equality quals survive preprocessing, however, it doesn't matter which
952  * one we use --- by definition, they are either redundant or
953  * contradictory.
954  *
955  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
956  * If the index stores nulls at the end of the index we'll be starting
957  * from, and we have no boundary key for the column (which means the key
958  * we deduced NOT NULL from is an inequality key that constrains the other
959  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
960  * use as a boundary key. If we didn't do this, we might find ourselves
961  * traversing a lot of null entries at the start of the scan.
962  *
963  * In this loop, row-comparison keys are treated the same as keys on their
964  * first (leftmost) columns. We'll add on lower-order columns of the row
965  * comparison below, if possible.
966  *
967  * The selected scan keys (at most one per index column) are remembered by
968  * storing their addresses into the local startKeys[] array.
969  *----------
970  */
971  strat_total = BTEqualStrategyNumber;
972  if (so->numberOfKeys > 0)
973  {
974  AttrNumber curattr;
975  ScanKey chosen;
976  ScanKey impliesNN;
977  ScanKey cur;
978 
979  /*
980  * chosen is the so-far-chosen key for the current attribute, if any.
981  * We don't cast the decision in stone until we reach keys for the
982  * next attribute.
983  */
984  curattr = 1;
985  chosen = NULL;
986  /* Also remember any scankey that implies a NOT NULL constraint */
987  impliesNN = NULL;
988 
989  /*
990  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
991  * pass to handle after-last-key processing. Actual exit from the
992  * loop is at one of the "break" statements below.
993  */
994  for (cur = so->keyData, i = 0;; cur++, i++)
995  {
996  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
997  {
998  /*
999  * Done looking at keys for curattr. If we didn't find a
1000  * usable boundary key, see if we can deduce a NOT NULL key.
1001  */
1002  if (chosen == NULL && impliesNN != NULL &&
1003  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1004  ScanDirectionIsForward(dir) :
1006  {
1007  /* Yes, so build the key in notnullkeys[keysCount] */
1008  chosen = &notnullkeys[keysCount];
1009  ScanKeyEntryInitialize(chosen,
1011  (impliesNN->sk_flags &
1013  curattr,
1014  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1017  InvalidOid,
1018  InvalidOid,
1019  InvalidOid,
1020  (Datum) 0);
1021  }
1022 
1023  /*
1024  * If we still didn't find a usable boundary key, quit; else
1025  * save the boundary key pointer in startKeys.
1026  */
1027  if (chosen == NULL)
1028  break;
1029  startKeys[keysCount++] = chosen;
1030 
1031  /*
1032  * Adjust strat_total, and quit if we have stored a > or <
1033  * key.
1034  */
1035  strat = chosen->sk_strategy;
1036  if (strat != BTEqualStrategyNumber)
1037  {
1038  strat_total = strat;
1039  if (strat == BTGreaterStrategyNumber ||
1040  strat == BTLessStrategyNumber)
1041  break;
1042  }
1043 
1044  /*
1045  * Done if that was the last attribute, or if next key is not
1046  * in sequence (implying no boundary key is available for the
1047  * next attribute).
1048  */
1049  if (i >= so->numberOfKeys ||
1050  cur->sk_attno != curattr + 1)
1051  break;
1052 
1053  /*
1054  * Reset for next attr.
1055  */
1056  curattr = cur->sk_attno;
1057  chosen = NULL;
1058  impliesNN = NULL;
1059  }
1060 
1061  /*
1062  * Can we use this key as a starting boundary for this attr?
1063  *
1064  * If not, does it imply a NOT NULL constraint? (Because
1065  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1066  * *any* inequality key works for that; we need not test.)
1067  */
1068  switch (cur->sk_strategy)
1069  {
1070  case BTLessStrategyNumber:
1072  if (chosen == NULL)
1073  {
1074  if (ScanDirectionIsBackward(dir))
1075  chosen = cur;
1076  else
1077  impliesNN = cur;
1078  }
1079  break;
1080  case BTEqualStrategyNumber:
1081  /* override any non-equality choice */
1082  chosen = cur;
1083  break;
1086  if (chosen == NULL)
1087  {
1088  if (ScanDirectionIsForward(dir))
1089  chosen = cur;
1090  else
1091  impliesNN = cur;
1092  }
1093  break;
1094  }
1095  }
1096  }
1097 
1098  /*
1099  * If we found no usable boundary keys, we have to start from one end of
1100  * the tree. Walk down that edge to the first or last key, and scan from
1101  * there.
1102  */
1103  if (keysCount == 0)
1104  {
1105  bool match;
1106 
1107  match = _bt_endpoint(scan, dir);
1108 
1109  if (!match)
1110  {
1111  /* No match, so mark (parallel) scan finished */
1112  _bt_parallel_done(scan);
1113  }
1114 
1115  return match;
1116  }
1117 
1118  /*
1119  * We want to start the scan somewhere within the index. Set up an
1120  * insertion scankey we can use to search for the boundary point we
1121  * identified above. The insertion scankey is built using the keys
1122  * identified by startKeys[]. (Remaining insertion scankey fields are
1123  * initialized after initial-positioning strategy is finalized.)
1124  */
1125  Assert(keysCount <= INDEX_MAX_KEYS);
1126  for (i = 0; i < keysCount; i++)
1127  {
1128  ScanKey cur = startKeys[i];
1129 
1130  Assert(cur->sk_attno == i + 1);
1131 
1132  if (cur->sk_flags & SK_ROW_HEADER)
1133  {
1134  /*
1135  * Row comparison header: look to the first row member instead.
1136  *
1137  * The member scankeys are already in insertion format (ie, they
1138  * have sk_func = 3-way-comparison function), but we have to watch
1139  * out for nulls, which _bt_preprocess_keys didn't check. A null
1140  * in the first row member makes the condition unmatchable, just
1141  * like qual_ok = false.
1142  */
1143  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1144 
1145  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1146  if (subkey->sk_flags & SK_ISNULL)
1147  {
1148  _bt_parallel_done(scan);
1149  return false;
1150  }
1151  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1152 
1153  /*
1154  * If the row comparison is the last positioning key we accepted,
1155  * try to add additional keys from the lower-order row members.
1156  * (If we accepted independent conditions on additional index
1157  * columns, we use those instead --- doesn't seem worth trying to
1158  * determine which is more restrictive.) Note that this is OK
1159  * even if the row comparison is of ">" or "<" type, because the
1160  * condition applied to all but the last row member is effectively
1161  * ">=" or "<=", and so the extra keys don't break the positioning
1162  * scheme. But, by the same token, if we aren't able to use all
1163  * the row members, then the part of the row comparison that we
1164  * did use has to be treated as just a ">=" or "<=" condition, and
1165  * so we'd better adjust strat_total accordingly.
1166  */
1167  if (i == keysCount - 1)
1168  {
1169  bool used_all_subkeys = false;
1170 
1171  Assert(!(subkey->sk_flags & SK_ROW_END));
1172  for (;;)
1173  {
1174  subkey++;
1175  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1176  if (subkey->sk_attno != keysCount + 1)
1177  break; /* out-of-sequence, can't use it */
1178  if (subkey->sk_strategy != cur->sk_strategy)
1179  break; /* wrong direction, can't use it */
1180  if (subkey->sk_flags & SK_ISNULL)
1181  break; /* can't use null keys */
1182  Assert(keysCount < INDEX_MAX_KEYS);
1183  memcpy(inskey.scankeys + keysCount, subkey,
1184  sizeof(ScanKeyData));
1185  keysCount++;
1186  if (subkey->sk_flags & SK_ROW_END)
1187  {
1188  used_all_subkeys = true;
1189  break;
1190  }
1191  }
1192  if (!used_all_subkeys)
1193  {
1194  switch (strat_total)
1195  {
1196  case BTLessStrategyNumber:
1197  strat_total = BTLessEqualStrategyNumber;
1198  break;
1200  strat_total = BTGreaterEqualStrategyNumber;
1201  break;
1202  }
1203  }
1204  break; /* done with outer loop */
1205  }
1206  }
1207  else
1208  {
1209  /*
1210  * Ordinary comparison key. Transform the search-style scan key
1211  * to an insertion scan key by replacing the sk_func with the
1212  * appropriate btree comparison function.
1213  *
1214  * If scankey operator is not a cross-type comparison, we can use
1215  * the cached comparison function; otherwise gotta look it up in
1216  * the catalogs. (That can't lead to infinite recursion, since no
1217  * indexscan initiated by syscache lookup will use cross-data-type
1218  * operators.)
1219  *
1220  * We support the convention that sk_subtype == InvalidOid means
1221  * the opclass input type; this is a hack to simplify life for
1222  * ScanKeyInit().
1223  */
1224  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1225  cur->sk_subtype == InvalidOid)
1226  {
1227  FmgrInfo *procinfo;
1228 
1229  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1230  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1231  cur->sk_flags,
1232  cur->sk_attno,
1234  cur->sk_subtype,
1235  cur->sk_collation,
1236  procinfo,
1237  cur->sk_argument);
1238  }
1239  else
1240  {
1241  RegProcedure cmp_proc;
1242 
1243  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1244  rel->rd_opcintype[i],
1245  cur->sk_subtype,
1246  BTORDER_PROC);
1247  if (!RegProcedureIsValid(cmp_proc))
1248  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1249  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1250  cur->sk_attno, RelationGetRelationName(rel));
1251  ScanKeyEntryInitialize(inskey.scankeys + i,
1252  cur->sk_flags,
1253  cur->sk_attno,
1255  cur->sk_subtype,
1256  cur->sk_collation,
1257  cmp_proc,
1258  cur->sk_argument);
1259  }
1260  }
1261  }
1262 
1263  /*----------
1264  * Examine the selected initial-positioning strategy to determine exactly
1265  * where we need to start the scan, and set flag variables to control the
1266  * code below.
1267  *
1268  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1269  * item >= scan key. If nextkey = true, they will locate the first
1270  * item > scan key.
1271  *
1272  * If goback = true, we will then step back one item, while if
1273  * goback = false, we will start the scan on the located item.
1274  *----------
1275  */
1276  switch (strat_total)
1277  {
1278  case BTLessStrategyNumber:
1279 
1280  /*
1281  * Find first item >= scankey, then back up one to arrive at last
1282  * item < scankey. (Note: this positioning strategy is only used
1283  * for a backward scan, so that is always the correct starting
1284  * position.)
1285  */
1286  nextkey = false;
1287  goback = true;
1288  break;
1289 
1291 
1292  /*
1293  * Find first item > scankey, then back up one to arrive at last
1294  * item <= scankey. (Note: this positioning strategy is only used
1295  * for a backward scan, so that is always the correct starting
1296  * position.)
1297  */
1298  nextkey = true;
1299  goback = true;
1300  break;
1301 
1302  case BTEqualStrategyNumber:
1303 
1304  /*
1305  * If a backward scan was specified, need to start with last equal
1306  * item not first one.
1307  */
1308  if (ScanDirectionIsBackward(dir))
1309  {
1310  /*
1311  * This is the same as the <= strategy. We will check at the
1312  * end whether the found item is actually =.
1313  */
1314  nextkey = true;
1315  goback = true;
1316  }
1317  else
1318  {
1319  /*
1320  * This is the same as the >= strategy. We will check at the
1321  * end whether the found item is actually =.
1322  */
1323  nextkey = false;
1324  goback = false;
1325  }
1326  break;
1327 
1329 
1330  /*
1331  * Find first item >= scankey. (This is only used for forward
1332  * scans.)
1333  */
1334  nextkey = false;
1335  goback = false;
1336  break;
1337 
1339 
1340  /*
1341  * Find first item > scankey. (This is only used for forward
1342  * scans.)
1343  */
1344  nextkey = true;
1345  goback = false;
1346  break;
1347 
1348  default:
1349  /* can't get here, but keep compiler quiet */
1350  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1351  return false;
1352  }
1353 
1354  /* Initialize remaining insertion scan key fields */
1355  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1356  inskey.anynullkeys = false; /* unused */
1357  inskey.nextkey = nextkey;
1358  inskey.pivotsearch = false;
1359  inskey.scantid = NULL;
1360  inskey.keysz = keysCount;
1361 
1362  /*
1363  * Use the manufactured insertion scan key to descend the tree and
1364  * position ourselves on the target leaf page.
1365  */
1366  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1367 
1368  /* don't need to keep the stack around... */
1369  _bt_freestack(stack);
1370 
1371  if (!BufferIsValid(buf))
1372  {
1373  /*
1374  * We only get here if the index is completely empty. Lock relation
1375  * because nothing finer to lock exists. Without a buffer lock, it's
1376  * possible for another transaction to insert data between
1377  * _bt_search() and PredicateLockRelation(). We have to try again
1378  * after taking the relation-level predicate lock, to close a narrow
1379  * window where we wouldn't scan concurrently inserted tuples, but the
1380  * writer wouldn't see our predicate lock.
1381  */
1383  {
1384  PredicateLockRelation(rel, scan->xs_snapshot);
1385  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1386  _bt_freestack(stack);
1387  }
1388 
1389  if (!BufferIsValid(buf))
1390  {
1391  /*
1392  * Mark parallel scan as done, so that all the workers can finish
1393  * their scan.
1394  */
1395  _bt_parallel_done(scan);
1397  return false;
1398  }
1399  }
1400 
1402 
1403  _bt_initialize_more_data(so, dir);
1404 
1405  /* position to the precise item on the page */
1406  offnum = _bt_binsrch(rel, &inskey, buf);
1407 
1408  /*
1409  * If nextkey = false, we are positioned at the first item >= scan key, or
1410  * possibly at the end of a page on which all the existing items are less
1411  * than the scan key and we know that everything on later pages is greater
1412  * than or equal to scan key.
1413  *
1414  * If nextkey = true, we are positioned at the first item > scan key, or
1415  * possibly at the end of a page on which all the existing items are less
1416  * than or equal to the scan key and we know that everything on later
1417  * pages is greater than scan key.
1418  *
1419  * The actually desired starting point is either this item or the prior
1420  * one, or in the end-of-page case it's the first item on the next page or
1421  * the last item on this page. Adjust the starting offset if needed. (If
1422  * this results in an offset before the first item or after the last one,
1423  * _bt_readpage will report no items found, and then we'll step to the
1424  * next page as needed.)
1425  */
1426  if (goback)
1427  offnum = OffsetNumberPrev(offnum);
1428 
1429  /* remember which buffer we have pinned, if any */
1431  so->currPos.buf = buf;
1432 
1433  /*
1434  * Now load data from the first page of the scan.
1435  */
1436  if (!_bt_readpage(scan, dir, offnum))
1437  {
1438  /*
1439  * There's no actually-matching data on this page. Try to advance to
1440  * the next page. Return false if there's no matching data at all.
1441  */
1442  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1443  if (!_bt_steppage(scan, dir))
1444  return false;
1445  }
1446  else
1447  {
1448  /* Drop the lock, and maybe the pin, on the current page */
1450  }
1451 
1452 readcomplete:
1453  /* OK, itemIndex says what to return */
1454  currItem = &so->currPos.items[so->currPos.itemIndex];
1455  scan->xs_heaptid = currItem->heapTid;
1456  if (scan->xs_want_itup)
1457  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1458 
1459  return true;
1460 }
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define RegProcedureIsValid(p)
Definition: c.h:766
regproc RegProcedure
Definition: c.h:639
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:811
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:795
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:719
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:638
#define BTORDER_PROC
Definition: nbtree.h:707
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1012
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2166
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:338
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2392
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:96
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:182
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:749
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:623
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
ScanKey keyData
Definition: nbtree.h:1042
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
Oid * rd_opcintype
Definition: rel.h:207
Oid * rd_opfamily
Definition: rel.h:206
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), _bt_unlockbuf(), Assert(), BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid(), cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer(), elog(), ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2311 of file nbtsearch.c.

2312 {
2313  Buffer buf;
2314  Page page;
2315  BTPageOpaque opaque;
2316  OffsetNumber offnum;
2317  BlockNumber blkno;
2318  IndexTuple itup;
2319 
2320  /*
2321  * If we are looking for a leaf page, okay to descend from fast root;
2322  * otherwise better descend from true root. (There is no point in being
2323  * smarter about intermediate levels.)
2324  */
2325  if (level == 0)
2326  buf = _bt_getroot(rel, NULL, BT_READ);
2327  else
2328  buf = _bt_gettrueroot(rel);
2329 
2330  if (!BufferIsValid(buf))
2331  return InvalidBuffer;
2332 
2333  page = BufferGetPage(buf);
2334  opaque = BTPageGetOpaque(page);
2335 
2336  for (;;)
2337  {
2338  /*
2339  * If we landed on a deleted page, step right to find a live page
2340  * (there must be one). Also, if we want the rightmost page, step
2341  * right if needed to get to it (this could happen if the page split
2342  * since we obtained a pointer to it).
2343  */
2344  while (P_IGNORE(opaque) ||
2345  (rightmost && !P_RIGHTMOST(opaque)))
2346  {
2347  blkno = opaque->btpo_next;
2348  if (blkno == P_NONE)
2349  elog(ERROR, "fell off the end of index \"%s\"",
2351  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2352  page = BufferGetPage(buf);
2353  opaque = BTPageGetOpaque(page);
2354  }
2355 
2356  /* Done? */
2357  if (opaque->btpo_level == level)
2358  break;
2359  if (opaque->btpo_level < level)
2360  ereport(ERROR,
2361  (errcode(ERRCODE_INDEX_CORRUPTED),
2362  errmsg_internal("btree level %u not found in index \"%s\"",
2363  level, RelationGetRelationName(rel))));
2364 
2365  /* Descend to leftmost or rightmost child page */
2366  if (rightmost)
2367  offnum = PageGetMaxOffsetNumber(page);
2368  else
2369  offnum = P_FIRSTDATAKEY(opaque);
2370 
2371  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2372  blkno = BTreeTupleGetDownLink(itup);
2373 
2374  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2375  page = BufferGetPage(buf);
2376  opaque = BTPageGetOpaque(page);
2377  }
2378 
2379  return buf;
2380 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
BlockNumber btpo_next
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog(), ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inlinestatic

Definition at line 2482 of file nbtsearch.c.

2483 {
2484  /* initialize moreLeft/moreRight appropriately for scan direction */
2485  if (ScanDirectionIsForward(dir))
2486  {
2487  so->currPos.moreLeft = false;
2488  so->currPos.moreRight = true;
2489  }
2490  else
2491  {
2492  so->currPos.moreLeft = true;
2493  so->currPos.moreRight = false;
2494  }
2495  so->numKilled = 0; /* just paranoia */
2496  so->markItemIndex = -1; /* ditto */
2497 }
bool moreRight
Definition: nbtree.h:971
bool moreLeft
Definition: nbtree.h:970

References BTScanOpaqueData::currPos, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

◆ _bt_moveright()

Buffer _bt_moveright ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access 
)

Definition at line 235 of file nbtsearch.c.

242 {
243  Page page;
244  BTPageOpaque opaque;
245  int32 cmpval;
246 
247  Assert(!forupdate || heaprel != NULL);
248 
249  /*
250  * When nextkey = false (normal case): if the scan key that brought us to
251  * this page is > the high key stored on the page, then the page has split
252  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
253  * have some duplicates to the right as well as the left, but that's
254  * something that's only ever dealt with on the leaf level, after
255  * _bt_search has found an initial leaf page.)
256  *
257  * When nextkey = true: move right if the scan key is >= page's high key.
258  * (Note that key.scantid cannot be set in this case.)
259  *
260  * The page could even have split more than once, so scan as far as
261  * needed.
262  *
263  * We also have to move right if we followed a link that brought us to a
264  * dead page.
265  */
266  cmpval = key->nextkey ? 0 : 1;
267 
268  for (;;)
269  {
270  page = BufferGetPage(buf);
271  opaque = BTPageGetOpaque(page);
272 
273  if (P_RIGHTMOST(opaque))
274  break;
275 
276  /*
277  * Finish any incomplete splits we encounter along the way.
278  */
279  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
280  {
282 
283  /* upgrade our lock if necessary */
284  if (access == BT_READ)
285  {
286  _bt_unlockbuf(rel, buf);
287  _bt_lockbuf(rel, buf, BT_WRITE);
288  }
289 
290  if (P_INCOMPLETE_SPLIT(opaque))
291  _bt_finish_split(rel, heaprel, buf, stack);
292  else
293  _bt_relbuf(rel, buf);
294 
295  /* re-acquire the lock in the right mode, and re-check */
296  buf = _bt_getbuf(rel, blkno, access);
297  continue;
298  }
299 
300  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
301  {
302  /* step right one page */
303  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
304  continue;
305  }
306  else
307  break;
308  }
309 
310  if (P_IGNORE(opaque))
311  elog(ERROR, "fell off the end of index \"%s\"",
313 
314  return buf;
315 }
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
#define P_HIKEY
Definition: nbtree.h:367
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BT_WRITE
Definition: nbtree.h:720
short access
Definition: preproc-type.c:36

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog(), ERROR, sort-test::key, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1477 of file nbtsearch.c.

1478 {
1479  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1480  BTScanPosItem *currItem;
1481 
1482  /*
1483  * Advance to next tuple on current page; or if there's no more, try to
1484  * step to the next page with data.
1485  */
1486  if (ScanDirectionIsForward(dir))
1487  {
1488  if (++so->currPos.itemIndex > so->currPos.lastItem)
1489  {
1490  if (!_bt_steppage(scan, dir))
1491  return false;
1492  }
1493  }
1494  else
1495  {
1496  if (--so->currPos.itemIndex < so->currPos.firstItem)
1497  {
1498  if (!_bt_steppage(scan, dir))
1499  return false;
1500  }
1501  }
1502 
1503  /* OK, itemIndex says what to return */
1504  currItem = &so->currPos.items[so->currPos.itemIndex];
1505  scan->xs_heaptid = currItem->heapTid;
1506  if (scan->xs_want_itup)
1507  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1508 
1509  return true;
1510 }
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
int firstItem
Definition: nbtree.h:986
int lastItem
Definition: nbtree.h:987

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 2166 of file nbtsearch.c.

2167 {
2168  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2169 
2170  _bt_initialize_more_data(so, dir);
2171 
2172  if (!_bt_readnextpage(scan, blkno, dir))
2173  return false;
2174 
2175  /* Drop the lock, and maybe the pin, on the current page */
2177 
2178  return true;
2179 }
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1991

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), BTScanOpaqueData::currPos, and IndexScanDescData::opaque.

Referenced by _bt_first().

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 1991 of file nbtsearch.c.

1992 {
1993  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1994  Relation rel;
1995  Page page;
1996  BTPageOpaque opaque;
1997  bool status;
1998 
1999  rel = scan->indexRelation;
2000 
2001  if (ScanDirectionIsForward(dir))
2002  {
2003  for (;;)
2004  {
2005  /*
2006  * if we're at end of scan, give up and mark parallel scan as
2007  * done, so that all the workers can finish their scan
2008  */
2009  if (blkno == P_NONE || !so->currPos.moreRight)
2010  {
2011  _bt_parallel_done(scan);
2013  return false;
2014  }
2015  /* check for interrupts while we're not holding any buffer lock */
2017  /* step right one page */
2018  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2019  page = BufferGetPage(so->currPos.buf);
2020  opaque = BTPageGetOpaque(page);
2021  /* check for deleted page */
2022  if (!P_IGNORE(opaque))
2023  {
2024  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2025  /* see if there are any matches on this page */
2026  /* note that this will clear moreRight if we can stop */
2027  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
2028  break;
2029  }
2030  else if (scan->parallel_scan != NULL)
2031  {
2032  /* allow next page be processed by parallel worker */
2033  _bt_parallel_release(scan, opaque->btpo_next);
2034  }
2035 
2036  /* nope, keep going */
2037  if (scan->parallel_scan != NULL)
2038  {
2039  _bt_relbuf(rel, so->currPos.buf);
2040  status = _bt_parallel_seize(scan, &blkno);
2041  if (!status)
2042  {
2044  return false;
2045  }
2046  }
2047  else
2048  {
2049  blkno = opaque->btpo_next;
2050  _bt_relbuf(rel, so->currPos.buf);
2051  }
2052  }
2053  }
2054  else
2055  {
2056  /*
2057  * Should only happen in parallel cases, when some other backend
2058  * advanced the scan.
2059  */
2060  if (so->currPos.currPage != blkno)
2061  {
2063  so->currPos.currPage = blkno;
2064  }
2065 
2066  /*
2067  * Walk left to the next page with data. This is much more complex
2068  * than the walk-right case because of the possibility that the page
2069  * to our left splits while we are in flight to it, plus the
2070  * possibility that the page we were on gets deleted after we leave
2071  * it. See nbtree/README for details.
2072  *
2073  * It might be possible to rearrange this code to have less overhead
2074  * in pinning and locking, but that would require capturing the left
2075  * pointer when the page is initially read, and using it here, along
2076  * with big changes to _bt_walk_left() and the code below. It is not
2077  * clear whether this would be a win, since if the page immediately to
2078  * the left splits after we read this page and before we step left, we
2079  * would need to visit more pages than with the current code.
2080  *
2081  * Note that if we change the code so that we drop the pin for a scan
2082  * which uses a non-MVCC snapshot, we will need to modify the code for
2083  * walking left, to allow for the possibility that a referenced page
2084  * has been deleted. As long as the buffer is pinned or the snapshot
2085  * is MVCC the page cannot move past the half-dead state to fully
2086  * deleted.
2087  */
2088  if (BTScanPosIsPinned(so->currPos))
2089  _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2090  else
2091  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2092 
2093  for (;;)
2094  {
2095  /* Done if we know there are no matching keys to the left */
2096  if (!so->currPos.moreLeft)
2097  {
2098  _bt_relbuf(rel, so->currPos.buf);
2099  _bt_parallel_done(scan);
2101  return false;
2102  }
2103 
2104  /* Step to next physical page */
2105  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2106 
2107  /* if we're physically at end of index, return failure */
2108  if (so->currPos.buf == InvalidBuffer)
2109  {
2110  _bt_parallel_done(scan);
2112  return false;
2113  }
2114 
2115  /*
2116  * Okay, we managed to move left to a non-deleted page. Done if
2117  * it's not half-dead and contains matching tuples. Else loop back
2118  * and do it all again.
2119  */
2120  page = BufferGetPage(so->currPos.buf);
2121  opaque = BTPageGetOpaque(page);
2122  if (!P_IGNORE(opaque))
2123  {
2125  /* see if there are any matches on this page */
2126  /* note that this will clear moreLeft if we can stop */
2127  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
2128  break;
2129  }
2130  else if (scan->parallel_scan != NULL)
2131  {
2132  /* allow next page be processed by parallel worker */
2134  }
2135 
2136  /*
2137  * For parallel scans, get the last page scanned as it is quite
2138  * possible that by the time we try to seize the scan, some other
2139  * worker has already advanced the scan to a different page. We
2140  * must continue based on the latest page scanned by any worker.
2141  */
2142  if (scan->parallel_scan != NULL)
2143  {
2144  _bt_relbuf(rel, so->currPos.buf);
2145  status = _bt_parallel_seize(scan, &blkno);
2146  if (!status)
2147  {
2149  return false;
2150  }
2151  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2152  }
2153  }
2154  }
2155 
2156  return true;
2157 }
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:696
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:995
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1006
static Buffer _bt_walk_left(Relation rel, Buffer buf)
Definition: nbtsearch.c:2196
BlockNumber currPage
Definition: nbtree.h:961

References _bt_getbuf(), _bt_lockbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTPageGetOpaque, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, if(), IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum 
)
static

Definition at line 1532 of file nbtsearch.c.

1533 {
1534  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1535  Page page;
1536  BTPageOpaque opaque;
1537  OffsetNumber minoff;
1538  OffsetNumber maxoff;
1539  int itemIndex;
1540  bool continuescan;
1541  int indnatts;
1542 
1543  /*
1544  * We must have the buffer pinned and locked, but the usual macro can't be
1545  * used here; this function is what makes it good for currPos.
1546  */
1548 
1549  page = BufferGetPage(so->currPos.buf);
1550  opaque = BTPageGetOpaque(page);
1551 
1552  /* allow next page be processed by parallel worker */
1553  if (scan->parallel_scan)
1554  {
1555  if (ScanDirectionIsForward(dir))
1556  _bt_parallel_release(scan, opaque->btpo_next);
1557  else
1559  }
1560 
1561  continuescan = true; /* default assumption */
1563  minoff = P_FIRSTDATAKEY(opaque);
1564  maxoff = PageGetMaxOffsetNumber(page);
1565 
1566  /*
1567  * We note the buffer's block number so that we can release the pin later.
1568  * This allows us to re-read the buffer if it is needed again for hinting.
1569  */
1571 
1572  /*
1573  * We save the LSN of the page as we read it, so that we know whether it
1574  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1575  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1576  */
1578 
1579  /*
1580  * we must save the page's right-link while scanning it; this tells us
1581  * where to step right to after we're done with these items. There is no
1582  * corresponding need for the left-link, since splits always go right.
1583  */
1584  so->currPos.nextPage = opaque->btpo_next;
1585 
1586  /* initialize tuple workspace to empty */
1587  so->currPos.nextTupleOffset = 0;
1588 
1589  /*
1590  * Now that the current page has been made consistent, the macro should be
1591  * good.
1592  */
1594 
1595  if (ScanDirectionIsForward(dir))
1596  {
1597  /* load items[] in ascending order */
1598  itemIndex = 0;
1599 
1600  offnum = Max(offnum, minoff);
1601 
1602  while (offnum <= maxoff)
1603  {
1604  ItemId iid = PageGetItemId(page, offnum);
1605  IndexTuple itup;
1606 
1607  /*
1608  * If the scan specifies not to return killed tuples, then we
1609  * treat a killed tuple as not passing the qual
1610  */
1611  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1612  {
1613  offnum = OffsetNumberNext(offnum);
1614  continue;
1615  }
1616 
1617  itup = (IndexTuple) PageGetItem(page, iid);
1618 
1619  if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1620  {
1621  /* tuple passes all scan key conditions */
1622  if (!BTreeTupleIsPosting(itup))
1623  {
1624  /* Remember it */
1625  _bt_saveitem(so, itemIndex, offnum, itup);
1626  itemIndex++;
1627  }
1628  else
1629  {
1630  int tupleOffset;
1631 
1632  /*
1633  * Set up state to return posting list, and remember first
1634  * TID
1635  */
1636  tupleOffset =
1637  _bt_setuppostingitems(so, itemIndex, offnum,
1638  BTreeTupleGetPostingN(itup, 0),
1639  itup);
1640  itemIndex++;
1641  /* Remember additional TIDs */
1642  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1643  {
1644  _bt_savepostingitem(so, itemIndex, offnum,
1645  BTreeTupleGetPostingN(itup, i),
1646  tupleOffset);
1647  itemIndex++;
1648  }
1649  }
1650  }
1651  /* When !continuescan, there can't be any more matches, so stop */
1652  if (!continuescan)
1653  break;
1654 
1655  offnum = OffsetNumberNext(offnum);
1656  }
1657 
1658  /*
1659  * We don't need to visit page to the right when the high key
1660  * indicates that no more matches will be found there.
1661  *
1662  * Checking the high key like this works out more often than you might
1663  * think. Leaf page splits pick a split point between the two most
1664  * dissimilar tuples (this is weighed against the need to evenly share
1665  * free space). Leaf pages with high key attribute values that can
1666  * only appear on non-pivot tuples on the right sibling page are
1667  * common.
1668  */
1669  if (continuescan && !P_RIGHTMOST(opaque))
1670  {
1671  ItemId iid = PageGetItemId(page, P_HIKEY);
1672  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1673  int truncatt;
1674 
1675  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1676  _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1677  }
1678 
1679  if (!continuescan)
1680  so->currPos.moreRight = false;
1681 
1682  Assert(itemIndex <= MaxTIDsPerBTreePage);
1683  so->currPos.firstItem = 0;
1684  so->currPos.lastItem = itemIndex - 1;
1685  so->currPos.itemIndex = 0;
1686  }
1687  else
1688  {
1689  /* load items[] in descending order */
1690  itemIndex = MaxTIDsPerBTreePage;
1691 
1692  offnum = Min(offnum, maxoff);
1693 
1694  while (offnum >= minoff)
1695  {
1696  ItemId iid = PageGetItemId(page, offnum);
1697  IndexTuple itup;
1698  bool tuple_alive;
1699  bool passes_quals;
1700 
1701  /*
1702  * If the scan specifies not to return killed tuples, then we
1703  * treat a killed tuple as not passing the qual. Most of the
1704  * time, it's a win to not bother examining the tuple's index
1705  * keys, but just skip to the next tuple (previous, actually,
1706  * since we're scanning backwards). However, if this is the first
1707  * tuple on the page, we do check the index keys, to prevent
1708  * uselessly advancing to the page to the left. This is similar
1709  * to the high key optimization used by forward scans.
1710  */
1711  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1712  {
1713  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1714  if (offnum > P_FIRSTDATAKEY(opaque))
1715  {
1716  offnum = OffsetNumberPrev(offnum);
1717  continue;
1718  }
1719 
1720  tuple_alive = false;
1721  }
1722  else
1723  tuple_alive = true;
1724 
1725  itup = (IndexTuple) PageGetItem(page, iid);
1726 
1727  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1728  &continuescan);
1729  if (passes_quals && tuple_alive)
1730  {
1731  /* tuple passes all scan key conditions */
1732  if (!BTreeTupleIsPosting(itup))
1733  {
1734  /* Remember it */
1735  itemIndex--;
1736  _bt_saveitem(so, itemIndex, offnum, itup);
1737  }
1738  else
1739  {
1740  int tupleOffset;
1741 
1742  /*
1743  * Set up state to return posting list, and remember first
1744  * TID.
1745  *
1746  * Note that we deliberately save/return items from
1747  * posting lists in ascending heap TID order for backwards
1748  * scans. This allows _bt_killitems() to make a
1749  * consistent assumption about the order of items
1750  * associated with the same posting list tuple.
1751  */
1752  itemIndex--;
1753  tupleOffset =
1754  _bt_setuppostingitems(so, itemIndex, offnum,
1755  BTreeTupleGetPostingN(itup, 0),
1756  itup);
1757  /* Remember additional TIDs */
1758  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1759  {
1760  itemIndex--;
1761  _bt_savepostingitem(so, itemIndex, offnum,
1762  BTreeTupleGetPostingN(itup, i),
1763  tupleOffset);
1764  }
1765  }
1766  }
1767  if (!continuescan)
1768  {
1769  /* there can't be any more matches, so stop */
1770  so->currPos.moreLeft = false;
1771  break;
1772  }
1773 
1774  offnum = OffsetNumberPrev(offnum);
1775  }
1776 
1777  Assert(itemIndex >= 0);
1778  so->currPos.firstItem = itemIndex;
1781  }
1782 
1783  return (so->currPos.firstItem <= so->currPos.lastItem);
1784 }
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3551
#define Max(x, y)
Definition: c.h:987
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1788
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1818
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:1856
bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1362
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:516
int nextTupleOffset
Definition: nbtree.h:977
BlockNumber nextPage
Definition: nbtree.h:962
XLogRecPtr lsn
Definition: nbtree.h:960
bool ignore_killed_tuples
Definition: relscan.h:129

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_setuppostingitems(), Assert(), BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage(), BufferIsValid(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1788 of file nbtsearch.c.

1790 {
1791  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1792 
1793  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1794 
1795  currItem->heapTid = itup->t_tid;
1796  currItem->indexOffset = offnum;
1797  if (so->currTuples)
1798  {
1799  Size itupsz = IndexTupleSize(itup);
1800 
1801  currItem->tupleOffset = so->currPos.nextTupleOffset;
1802  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1803  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1804  }
1805 }
#define MAXALIGN(LEN)
Definition: c.h:800
size_t Size
Definition: c.h:594
#define IndexTupleSize(itup)
Definition: itup.h:70
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
ItemPointerData heapTid
Definition: nbtree.h:951
LocationIndex tupleOffset
Definition: nbtree.h:953
OffsetNumber indexOffset
Definition: nbtree.h:952
ItemPointerData t_tid
Definition: itup.h:37

References Assert(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_savepostingitem()

static void _bt_savepostingitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
int  tupleOffset 
)
inlinestatic

Definition at line 1856 of file nbtsearch.c.

1858 {
1859  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1860 
1861  currItem->heapTid = *heapTid;
1862  currItem->indexOffset = offnum;
1863 
1864  /*
1865  * Have index-only scans return the same base IndexTuple for every TID
1866  * that originates from the same posting list
1867  */
1868  if (so->currTuples)
1869  currItem->tupleOffset = tupleOffset;
1870 }

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 96 of file nbtsearch.c.

98 {
99  BTStack stack_in = NULL;
100  int page_access = BT_READ;
101 
102  /* heaprel must be set whenever _bt_allocbuf is reachable */
103  Assert(access == BT_READ || access == BT_WRITE);
104  Assert(access == BT_READ || heaprel != NULL);
105 
106  /* Get the root page to start with */
107  *bufP = _bt_getroot(rel, heaprel, access);
108 
109  /* If index is empty and access = BT_READ, no root page is created. */
110  if (!BufferIsValid(*bufP))
111  return (BTStack) NULL;
112 
113  /* Loop iterates once per level descended in the tree */
114  for (;;)
115  {
116  Page page;
117  BTPageOpaque opaque;
118  OffsetNumber offnum;
119  ItemId itemid;
120  IndexTuple itup;
121  BlockNumber child;
122  BTStack new_stack;
123 
124  /*
125  * Race -- the page we just grabbed may have split since we read its
126  * downlink in its parent page (or the metapage). If it has, we may
127  * need to move right to its new sibling. Do that.
128  *
129  * In write-mode, allow _bt_moveright to finish any incomplete splits
130  * along the way. Strictly speaking, we'd only need to finish an
131  * incomplete split on the leaf page we're about to insert to, not on
132  * any of the upper levels (internal pages with incomplete splits are
133  * also taken care of in _bt_getstackbuf). But this is a good
134  * opportunity to finish splits of internal pages too.
135  */
136  *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
137  stack_in, page_access);
138 
139  /* if this is a leaf page, we're done */
140  page = BufferGetPage(*bufP);
141  opaque = BTPageGetOpaque(page);
142  if (P_ISLEAF(opaque))
143  break;
144 
145  /*
146  * Find the appropriate pivot tuple on this page. Its downlink points
147  * to the child page that we're about to descend to.
148  */
149  offnum = _bt_binsrch(rel, key, *bufP);
150  itemid = PageGetItemId(page, offnum);
151  itup = (IndexTuple) PageGetItem(page, itemid);
152  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
153  child = BTreeTupleGetDownLink(itup);
154 
155  /*
156  * We need to save the location of the pivot tuple we chose in a new
157  * stack entry for this page/level. If caller ends up splitting a
158  * page one level down, it usually ends up inserting a new pivot
159  * tuple/downlink immediately after the location recorded here.
160  */
161  new_stack = (BTStack) palloc(sizeof(BTStackData));
162  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
163  new_stack->bts_offset = offnum;
164  new_stack->bts_parent = stack_in;
165 
166  /*
167  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
168  * we're on the level 1 and asked to lock leaf page in write mode,
169  * then lock next page in write mode, because it must be a leaf.
170  */
171  if (opaque->btpo_level == 1 && access == BT_WRITE)
172  page_access = BT_WRITE;
173 
174  /* drop the read lock on the page, then acquire one on its child */
175  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
176 
177  /* okay, all set to move down a level */
178  stack_in = new_stack;
179  }
180 
181  /*
182  * If we're asked to lock leaf in write mode, but didn't manage to, then
183  * relock. This should only happen when the root page is a leaf page (and
184  * the only page in the index other than the metapage).
185  */
186  if (access == BT_WRITE && page_access == BT_READ)
187  {
188  /* trade in our read lock for a write lock */
189  _bt_unlockbuf(rel, *bufP);
190  _bt_lockbuf(rel, *bufP, BT_WRITE);
191 
192  /*
193  * Race -- the leaf page may have split after we dropped the read lock
194  * but before we acquired a write lock. If it has, we may need to
195  * move right to its new sibling. Do that.
196  */
197  *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
198  }
199 
200  return stack_in;
201 }
void * palloc(Size size)
Definition: mcxt.c:1226
BTStackData * BTStack
Definition: nbtree.h:739
Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:235
BlockNumber bts_blkno
Definition: nbtree.h:734
struct BTStackData * bts_parent
Definition: nbtree.h:736
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 1818 of file nbtsearch.c.

1820 {
1821  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1822 
1823  Assert(BTreeTupleIsPosting(itup));
1824 
1825  currItem->heapTid = *heapTid;
1826  currItem->indexOffset = offnum;
1827  if (so->currTuples)
1828  {
1829  /* Save base IndexTuple (truncate posting list) */
1830  IndexTuple base;
1831  Size itupsz = BTreeTupleGetPostingOffset(itup);
1832 
1833  itupsz = MAXALIGN(itupsz);
1834  currItem->tupleOffset = so->currPos.nextTupleOffset;
1835  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1836  memcpy(base, itup, itupsz);
1837  /* Defensively reduce work area index tuple header size */
1838  base->t_info &= ~INDEX_SIZE_MASK;
1839  base->t_info |= itupsz;
1840  so->currPos.nextTupleOffset += itupsz;
1841 
1842  return currItem->tupleOffset;
1843  }
1844 
1845  return 0;
1846 }
#define INDEX_SIZE_MASK
Definition: itup.h:65
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, INDEX_SIZE_MASK, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1884 of file nbtsearch.c.

1885 {
1886  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1888  bool status;
1889 
1891 
1892  /* Before leaving current page, deal with any killed items */
1893  if (so->numKilled > 0)
1894  _bt_killitems(scan);
1895 
1896  /*
1897  * Before we modify currPos, make a copy of the page data if there was a
1898  * mark position that needs it.
1899  */
1900  if (so->markItemIndex >= 0)
1901  {
1902  /* bump pin on current buffer for assignment to mark buffer */
1903  if (BTScanPosIsPinned(so->currPos))
1905  memcpy(&so->markPos, &so->currPos,
1906  offsetof(BTScanPosData, items[1]) +
1907  so->currPos.lastItem * sizeof(BTScanPosItem));
1908  if (so->markTuples)
1909  memcpy(so->markTuples, so->currTuples,
1910  so->currPos.nextTupleOffset);
1911  so->markPos.itemIndex = so->markItemIndex;
1912  so->markItemIndex = -1;
1913  }
1914 
1915  if (ScanDirectionIsForward(dir))
1916  {
1917  /* Walk right to the next page with data */
1918  if (scan->parallel_scan != NULL)
1919  {
1920  /*
1921  * Seize the scan to get the next block number; if the scan has
1922  * ended already, bail out.
1923  */
1924  status = _bt_parallel_seize(scan, &blkno);
1925  if (!status)
1926  {
1927  /* release the previous buffer, if pinned */
1930  return false;
1931  }
1932  }
1933  else
1934  {
1935  /* Not parallel, so use the previously-saved nextPage link. */
1936  blkno = so->currPos.nextPage;
1937  }
1938 
1939  /* Remember we left a page with data */
1940  so->currPos.moreLeft = true;
1941 
1942  /* release the previous buffer, if pinned */
1944  }
1945  else
1946  {
1947  /* Remember we left a page with data */
1948  so->currPos.moreRight = true;
1949 
1950  if (scan->parallel_scan != NULL)
1951  {
1952  /*
1953  * Seize the scan to get the current block number; if the scan has
1954  * ended already, bail out.
1955  */
1956  status = _bt_parallel_seize(scan, &blkno);
1958  if (!status)
1959  {
1961  return false;
1962  }
1963  }
1964  else
1965  {
1966  /* Not parallel, so just use our own notion of the current page */
1967  blkno = so->currPos.currPage;
1968  }
1969  }
1970 
1971  if (!_bt_readnextpage(scan, blkno, dir))
1972  return false;
1973 
1974  /* Drop the lock, and maybe the pin, on the current page */
1976 
1977  return true;
1978 }
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4512
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1725
char * markTuples
Definition: nbtree.h:1063
BTScanPosData markPos
Definition: nbtree.h:1076

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf 
)
static

Definition at line 2196 of file nbtsearch.c.

2197 {
2198  Page page;
2199  BTPageOpaque opaque;
2200 
2201  page = BufferGetPage(buf);
2202  opaque = BTPageGetOpaque(page);
2203 
2204  for (;;)
2205  {
2206  BlockNumber obknum;
2207  BlockNumber lblkno;
2208  BlockNumber blkno;
2209  int tries;
2210 
2211  /* if we're at end of tree, release buf and return failure */
2212  if (P_LEFTMOST(opaque))
2213  {
2214  _bt_relbuf(rel, buf);
2215  break;
2216  }
2217  /* remember original page we are stepping left from */
2218  obknum = BufferGetBlockNumber(buf);
2219  /* step left */
2220  blkno = lblkno = opaque->btpo_prev;
2221  _bt_relbuf(rel, buf);
2222  /* check for interrupts while we're not holding any buffer lock */
2224  buf = _bt_getbuf(rel, blkno, BT_READ);
2225  page = BufferGetPage(buf);
2226  opaque = BTPageGetOpaque(page);
2227 
2228  /*
2229  * If this isn't the page we want, walk right till we find what we
2230  * want --- but go no more than four hops (an arbitrary limit). If we
2231  * don't find the correct page by then, the most likely bet is that
2232  * the original page got deleted and isn't in the sibling chain at all
2233  * anymore, not that its left sibling got split more than four times.
2234  *
2235  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2236  * because half-dead pages are still in the sibling chain. Caller
2237  * must reject half-dead pages if wanted.
2238  */
2239  tries = 0;
2240  for (;;)
2241  {
2242  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2243  {
2244  /* Found desired page, return it */
2245  return buf;
2246  }
2247  if (P_RIGHTMOST(opaque) || ++tries > 4)
2248  break;
2249  blkno = opaque->btpo_next;
2250  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2251  page = BufferGetPage(buf);
2252  opaque = BTPageGetOpaque(page);
2253  }
2254 
2255  /* Return to the original page to see what's up */
2256  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2257  page = BufferGetPage(buf);
2258  opaque = BTPageGetOpaque(page);
2259  if (P_ISDELETED(opaque))
2260  {
2261  /*
2262  * It was deleted. Move right to first nondeleted page (there
2263  * must be one); that is the page that has acquired the deleted
2264  * one's keyspace, so stepping left from it will take us where we
2265  * want to be.
2266  */
2267  for (;;)
2268  {
2269  if (P_RIGHTMOST(opaque))
2270  elog(ERROR, "fell off the end of index \"%s\"",
2272  blkno = opaque->btpo_next;
2273  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2274  page = BufferGetPage(buf);
2275  opaque = BTPageGetOpaque(page);
2276  if (!P_ISDELETED(opaque))
2277  break;
2278  }
2279 
2280  /*
2281  * Now return to top of loop, resetting obknum to point to this
2282  * nondeleted page, and try again.
2283  */
2284  }
2285  else
2286  {
2287  /*
2288  * It wasn't deleted; the explanation had better be that the page
2289  * to the left got split or deleted. Without this check, we'd go
2290  * into an infinite loop if there's anything wrong.
2291  */
2292  if (opaque->btpo_prev == lblkno)
2293  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2294  obknum, RelationGetRelationName(rel));
2295  /* Okay to try again with new lblkno value */
2296  }
2297  }
2298 
2299  return InvalidBuffer;
2300 }
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
BlockNumber btpo_prev
Definition: nbtree.h:64

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, elog(), ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().