PostgreSQL Source Code  git master
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf, Snapshot snapshot)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 345 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, BTScanInsertData::nextkey, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTScanInsertData::scantid, and unlikely.

Referenced by _bt_first(), and _bt_search().

348 {
349  Page page;
350  BTPageOpaque opaque;
351  OffsetNumber low,
352  high;
353  int32 result,
354  cmpval;
355 
356  /* Requesting nextkey semantics while using scantid seems nonsensical */
357  Assert(!key->nextkey || key->scantid == NULL);
358 
359  page = BufferGetPage(buf);
360  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
361 
362  low = P_FIRSTDATAKEY(opaque);
363  high = PageGetMaxOffsetNumber(page);
364 
365  /*
366  * If there are no keys on the page, return the first available slot. Note
367  * this covers two cases: the page is really empty (no keys), or it
368  * contains only a high key. The latter case is possible after vacuuming.
369  * This can never happen on an internal page, however, since they are
370  * never empty (an internal page must have children).
371  */
372  if (unlikely(high < low))
373  return low;
374 
375  /*
376  * Binary search to find the first key on the page >= scan key, or first
377  * key > scankey when nextkey is true.
378  *
379  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
380  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
381  *
382  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
383  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
384  *
385  * We can fall out when high == low.
386  */
387  high++; /* establish the loop invariant for high */
388 
389  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
390 
391  while (high > low)
392  {
393  OffsetNumber mid = low + ((high - low) / 2);
394 
395  /* We have low <= mid < high, so mid points at a real slot */
396 
397  result = _bt_compare(rel, key, page, mid);
398 
399  if (result >= cmpval)
400  low = mid + 1;
401  else
402  high = mid;
403  }
404 
405  /*
406  * At this point we have high == low, but be careful: they could point
407  * past the last slot on the page.
408  *
409  * On a leaf page, we always return the first key >= scan key (resp. >
410  * scan key), which could be the last slot + 1.
411  */
412  if (P_ISLEAF(opaque))
413  return low;
414 
415  /*
416  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
417  * There must be one if _bt_compare() is playing by the rules.
418  */
419  Assert(low > P_FIRSTDATAKEY(opaque));
420 
421  return OffsetNumberPrev(low);
422 }
ItemPointer scantid
Definition: nbtree.h:467
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:67
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:558
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
#define unlikely(x)
Definition: c.h:208
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 444 of file nbtsearch.c.

References _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

445 {
446  BTScanInsert key = insertstate->itup_key;
447  Page page;
448  BTPageOpaque opaque;
449  OffsetNumber low,
450  high,
451  stricthigh;
452  int32 result,
453  cmpval;
454 
455  page = BufferGetPage(insertstate->buf);
456  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
457 
458  Assert(P_ISLEAF(opaque));
459  Assert(!key->nextkey);
460 
461  if (!insertstate->bounds_valid)
462  {
463  /* Start new binary search */
464  low = P_FIRSTDATAKEY(opaque);
465  high = PageGetMaxOffsetNumber(page);
466  }
467  else
468  {
469  /* Restore result of previous binary search against same page */
470  low = insertstate->low;
471  high = insertstate->stricthigh;
472  }
473 
474  /* If there are no keys on the page, return the first available slot */
475  if (unlikely(high < low))
476  {
477  /* Caller can't reuse bounds */
478  insertstate->low = InvalidOffsetNumber;
479  insertstate->stricthigh = InvalidOffsetNumber;
480  insertstate->bounds_valid = false;
481  return low;
482  }
483 
484  /*
485  * Binary search to find the first key on the page >= scan key. (nextkey
486  * is always false when inserting).
487  *
488  * The loop invariant is: all slots before 'low' are < scan key, all slots
489  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
490  * maintained to save additional search effort for caller.
491  *
492  * We can fall out when high == low.
493  */
494  if (!insertstate->bounds_valid)
495  high++; /* establish the loop invariant for high */
496  stricthigh = high; /* high initially strictly higher */
497 
498  cmpval = 1; /* !nextkey comparison value */
499 
500  while (high > low)
501  {
502  OffsetNumber mid = low + ((high - low) / 2);
503 
504  /* We have low <= mid < high, so mid points at a real slot */
505 
506  result = _bt_compare(rel, key, page, mid);
507 
508  if (result >= cmpval)
509  low = mid + 1;
510  else
511  {
512  high = mid;
513  if (result != 0)
514  stricthigh = high;
515  }
516  }
517 
518  /*
519  * On a leaf page, a binary search always returns the first key >= scan
520  * key (at least in !nextkey case), which could be the last slot + 1. This
521  * is also the lower bound of cached search.
522  *
523  * stricthigh may also be the last slot + 1, which prevents caller from
524  * using bounds directly, but is still useful to us if we're called a
525  * second time with cached bounds (cached low will be < stricthigh when
526  * that happens).
527  */
528  insertstate->low = low;
529  insertstate->stricthigh = stricthigh;
530  insertstate->bounds_valid = true;
531 
532  return low;
533 }
bool bounds_valid
Definition: nbtree.h:499
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
OffsetNumber stricthigh
Definition: nbtree.h:501
OffsetNumber low
Definition: nbtree.h:500
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
uint16 OffsetNumber
Definition: off.h:24
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:558
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:489
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define unlikely(x)
Definition: c.h:208
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 558 of file nbtsearch.c.

References _bt_check_natts(), Assert, BTreeTupleGetHeapTID, BTreeTupleGetNAtts, DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_doinsert(), _bt_findinsertloc(), _bt_moveright(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

562 {
563  TupleDesc itupdesc = RelationGetDescr(rel);
565  IndexTuple itup;
566  ItemPointer heapTid;
567  ScanKey scankey;
568  int ncmpkey;
569  int ntupatts;
570 
571  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
573  Assert(key->heapkeyspace || key->scantid == NULL);
574 
575  /*
576  * Force result ">" if target item is first data item on an internal page
577  * --- see NOTE above.
578  */
579  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
580  return 1;
581 
582  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
583  ntupatts = BTreeTupleGetNAtts(itup, rel);
584 
585  /*
586  * The scan key is set up with the attribute number associated with each
587  * term in the key. It is important that, if the index is multi-key, the
588  * scan contain the first k key attributes, and that they be in order. If
589  * you think about how multi-key ordering works, you'll understand why
590  * this is.
591  *
592  * We don't test for violation of this condition here, however. The
593  * initial setup for the index scan had better have gotten it right (see
594  * _bt_first).
595  */
596 
597  ncmpkey = Min(ntupatts, key->keysz);
598  Assert(key->heapkeyspace || ncmpkey == key->keysz);
599  scankey = key->scankeys;
600  for (int i = 1; i <= ncmpkey; i++)
601  {
602  Datum datum;
603  bool isNull;
604  int32 result;
605 
606  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
607 
608  /* see comments about NULLs handling in btbuild */
609  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
610  {
611  if (isNull)
612  result = 0; /* NULL "=" NULL */
613  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
614  result = -1; /* NULL "<" NOT_NULL */
615  else
616  result = 1; /* NULL ">" NOT_NULL */
617  }
618  else if (isNull) /* key is NOT_NULL and item is NULL */
619  {
620  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
621  result = 1; /* NOT_NULL ">" NULL */
622  else
623  result = -1; /* NOT_NULL "<" NULL */
624  }
625  else
626  {
627  /*
628  * The sk_func needs to be passed the index value as left arg and
629  * the sk_argument as right arg (they might be of different
630  * types). Since it is convenient for callers to think of
631  * _bt_compare as comparing the scankey to the index item, we have
632  * to flip the sign of the comparison result. (Unless it's a DESC
633  * column, in which case we *don't* flip the sign.)
634  */
635  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
636  scankey->sk_collation,
637  datum,
638  scankey->sk_argument));
639 
640  if (!(scankey->sk_flags & SK_BT_DESC))
641  INVERT_COMPARE_RESULT(result);
642  }
643 
644  /* if the keys are unequal, return the difference */
645  if (result != 0)
646  return result;
647 
648  scankey++;
649  }
650 
651  /*
652  * All non-truncated attributes (other than heap TID) were found to be
653  * equal. Treat truncated attributes as minus infinity when scankey has a
654  * key attribute value that would otherwise be compared directly.
655  *
656  * Note: it doesn't matter if ntupatts includes non-key attributes;
657  * scankey won't, so explicitly excluding non-key attributes isn't
658  * necessary.
659  */
660  if (key->keysz > ntupatts)
661  return 1;
662 
663  /*
664  * Use the heap TID attribute and scantid to try to break the tie. The
665  * rules are the same as any other key attribute -- only the
666  * representation differs.
667  */
668  heapTid = BTreeTupleGetHeapTID(itup);
669  if (key->scantid == NULL)
670  {
671  /*
672  * Most searches have a scankey that is considered greater than a
673  * truncated pivot tuple if and when the scankey has equal values for
674  * attributes up to and including the least significant untruncated
675  * attribute in tuple.
676  *
677  * For example, if an index has the minimum two attributes (single
678  * user key attribute, plus heap TID attribute), and a page's high key
679  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
680  * will not descend to the page to the left. The search will descend
681  * right instead. The truncated attribute in pivot tuple means that
682  * all non-pivot tuples on the page to the left are strictly < 'foo',
683  * so it isn't necessary to descend left. In other words, search
684  * doesn't have to descend left because it isn't interested in a match
685  * that has a heap TID value of -inf.
686  *
687  * However, some searches (pivotsearch searches) actually require that
688  * we descend left when this happens. -inf is treated as a possible
689  * match for omitted scankey attribute(s). This is needed by page
690  * deletion, which must re-find leaf pages that are targets for
691  * deletion using their high keys.
692  *
693  * Note: the heap TID part of the test ensures that scankey is being
694  * compared to a pivot tuple with one or more truncated key
695  * attributes.
696  *
697  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
698  * left here, since they have no heap TID attribute (and cannot have
699  * any -inf key values in any case, since truncation can only remove
700  * non-key attributes). !heapkeyspace searches must always be
701  * prepared to deal with matches on both sides of the pivot once the
702  * leaf level is reached.
703  */
704  if (key->heapkeyspace && !key->pivotsearch &&
705  key->keysz == ntupatts && heapTid == NULL)
706  return 1;
707 
708  /* All provided scankey arguments found to be equal */
709  return 0;
710  }
711 
712  /*
713  * Treat truncated heap TID as minus infinity, since scankey has a key
714  * attribute value (scantid) that would otherwise be compared directly
715  */
717  if (heapTid == NULL)
718  return 1;
719 
721  return ItemPointerCompare(key->scantid, heapTid);
722 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:436
ItemPointer scantid
Definition: nbtree.h:467
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:890
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:347
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:328
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:429
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:466
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:673
uintptr_t Datum
Definition: postgres.h:367
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2368
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:672
bool heapkeyspace
Definition: nbtree.h:464
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:469
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1030
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 57 of file nbtsearch.c.

References BTScanPosData::buf, BUFFER_LOCK_UNLOCK, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, LockBuffer(), RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

58 {
60 
61  if (IsMVCCSnapshot(scan->xs_snapshot) &&
63  !scan->xs_want_itup)
64  {
65  ReleaseBuffer(sp->buf);
66  sp->buf = InvalidBuffer;
67  }
68 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define InvalidBuffer
Definition: buf.h:25
struct SnapshotData * xs_snapshot
Definition: relscan.h:105
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3315
Relation indexRelation
Definition: relscan.h:104
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3552
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:98
#define RelationNeedsWAL(relation)
Definition: rel.h:512
Buffer buf
Definition: nbtree.h:541

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2075 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), Assert, BTScanPosInvalidate, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, LockBuffer(), IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

2076 {
2077  Relation rel = scan->indexRelation;
2078  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2079  Buffer buf;
2080  Page page;
2081  BTPageOpaque opaque;
2082  OffsetNumber start;
2083  BTScanPosItem *currItem;
2084 
2085  /*
2086  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2087  * version of _bt_search(). We don't maintain a stack since we know we
2088  * won't need it.
2089  */
2091 
2092  if (!BufferIsValid(buf))
2093  {
2094  /*
2095  * Empty index. Lock the whole relation, as nothing finer to lock
2096  * exists.
2097  */
2098  PredicateLockRelation(rel, scan->xs_snapshot);
2100  return false;
2101  }
2102 
2104  page = BufferGetPage(buf);
2105  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2106  Assert(P_ISLEAF(opaque));
2107 
2108  if (ScanDirectionIsForward(dir))
2109  {
2110  /* There could be dead pages to the left, so not this: */
2111  /* Assert(P_LEFTMOST(opaque)); */
2112 
2113  start = P_FIRSTDATAKEY(opaque);
2114  }
2115  else if (ScanDirectionIsBackward(dir))
2116  {
2117  Assert(P_RIGHTMOST(opaque));
2118 
2119  start = PageGetMaxOffsetNumber(page);
2120  }
2121  else
2122  {
2123  elog(ERROR, "invalid scan direction: %d", (int) dir);
2124  start = 0; /* keep compiler quiet */
2125  }
2126 
2127  /* remember which buffer we have pinned */
2128  so->currPos.buf = buf;
2129 
2130  _bt_initialize_more_data(so, dir);
2131 
2132  /*
2133  * Now load data from the first page of the scan.
2134  */
2135  if (!_bt_readpage(scan, dir, start))
2136  {
2137  /*
2138  * There's no actually-matching data on this page. Try to advance to
2139  * the next page. Return false if there's no matching data at all.
2140  */
2142  if (!_bt_steppage(scan, dir))
2143  return false;
2144  }
2145  else
2146  {
2147  /* Drop the lock, and maybe the pin, on the current page */
2149  }
2150 
2151  /* OK, itemIndex says what to return */
2152  currItem = &so->currPos.items[so->currPos.itemIndex];
2153  scan->xs_heaptid = currItem->heapTid;
2154  if (scan->xs_want_itup)
2155  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2156 
2157  return true;
2158 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:128
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:573
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:571
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:645
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
struct SnapshotData * xs_snapshot
Definition: relscan.h:105
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:1993
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Relation indexRelation
Definition: relscan.h:104
uint16 OffsetNumber
Definition: off.h:24
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
ItemPointerData xs_heaptid
Definition: relscan.h:133
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2165
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3552
#define Assert(condition)
Definition: c.h:732
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:601
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
BTScanPosData currPos
Definition: nbtree.h:658
#define elog(elevel,...)
Definition: elog.h:226
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1559
Buffer buf
Definition: nbtree.h:541
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1401
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 745 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_heapkeyspace(), _bt_initialize_more_data(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

746 {
747  Relation rel = scan->indexRelation;
748  BTScanOpaque so = (BTScanOpaque) scan->opaque;
749  Buffer buf;
750  BTStack stack;
751  OffsetNumber offnum;
752  StrategyNumber strat;
753  bool nextkey;
754  bool goback;
755  BTScanInsertData inskey;
756  ScanKey startKeys[INDEX_MAX_KEYS];
757  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
758  int keysCount = 0;
759  int i;
760  bool status = true;
761  StrategyNumber strat_total;
762  BTScanPosItem *currItem;
763  BlockNumber blkno;
764 
766 
768 
769  /*
770  * Examine the scan keys and eliminate any redundant keys; also mark the
771  * keys that must be matched to continue the scan.
772  */
773  _bt_preprocess_keys(scan);
774 
775  /*
776  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
777  * never be satisfied (eg, x == 1 AND x > 2).
778  */
779  if (!so->qual_ok)
780  return false;
781 
782  /*
783  * For parallel scans, get the starting page from shared state. If the
784  * scan has not started, proceed to find out first leaf page in the usual
785  * way while keeping other participating processes waiting. If the scan
786  * has already begun, use the page number from the shared structure.
787  */
788  if (scan->parallel_scan != NULL)
789  {
790  status = _bt_parallel_seize(scan, &blkno);
791  if (!status)
792  return false;
793  else if (blkno == P_NONE)
794  {
795  _bt_parallel_done(scan);
796  return false;
797  }
798  else if (blkno != InvalidBlockNumber)
799  {
800  if (!_bt_parallel_readpage(scan, blkno, dir))
801  return false;
802  goto readcomplete;
803  }
804  }
805 
806  /*----------
807  * Examine the scan keys to discover where we need to start the scan.
808  *
809  * We want to identify the keys that can be used as starting boundaries;
810  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
811  * a backwards scan. We can use keys for multiple attributes so long as
812  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
813  * a > or < boundary or find an attribute with no boundary (which can be
814  * thought of as the same as "> -infinity"), we can't use keys for any
815  * attributes to its right, because it would break our simplistic notion
816  * of what initial positioning strategy to use.
817  *
818  * When the scan keys include cross-type operators, _bt_preprocess_keys
819  * may not be able to eliminate redundant keys; in such cases we will
820  * arbitrarily pick a usable one for each attribute. This is correct
821  * but possibly not optimal behavior. (For example, with keys like
822  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
823  * x=5 would be more efficient.) Since the situation only arises given
824  * a poorly-worded query plus an incomplete opfamily, live with it.
825  *
826  * When both equality and inequality keys appear for a single attribute
827  * (again, only possible when cross-type operators appear), we *must*
828  * select one of the equality keys for the starting point, because
829  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
830  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
831  * start at x=4, we will fail and stop before reaching x=10. If multiple
832  * equality quals survive preprocessing, however, it doesn't matter which
833  * one we use --- by definition, they are either redundant or
834  * contradictory.
835  *
836  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
837  * If the index stores nulls at the end of the index we'll be starting
838  * from, and we have no boundary key for the column (which means the key
839  * we deduced NOT NULL from is an inequality key that constrains the other
840  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
841  * use as a boundary key. If we didn't do this, we might find ourselves
842  * traversing a lot of null entries at the start of the scan.
843  *
844  * In this loop, row-comparison keys are treated the same as keys on their
845  * first (leftmost) columns. We'll add on lower-order columns of the row
846  * comparison below, if possible.
847  *
848  * The selected scan keys (at most one per index column) are remembered by
849  * storing their addresses into the local startKeys[] array.
850  *----------
851  */
852  strat_total = BTEqualStrategyNumber;
853  if (so->numberOfKeys > 0)
854  {
855  AttrNumber curattr;
856  ScanKey chosen;
857  ScanKey impliesNN;
858  ScanKey cur;
859 
860  /*
861  * chosen is the so-far-chosen key for the current attribute, if any.
862  * We don't cast the decision in stone until we reach keys for the
863  * next attribute.
864  */
865  curattr = 1;
866  chosen = NULL;
867  /* Also remember any scankey that implies a NOT NULL constraint */
868  impliesNN = NULL;
869 
870  /*
871  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
872  * pass to handle after-last-key processing. Actual exit from the
873  * loop is at one of the "break" statements below.
874  */
875  for (cur = so->keyData, i = 0;; cur++, i++)
876  {
877  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
878  {
879  /*
880  * Done looking at keys for curattr. If we didn't find a
881  * usable boundary key, see if we can deduce a NOT NULL key.
882  */
883  if (chosen == NULL && impliesNN != NULL &&
884  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
887  {
888  /* Yes, so build the key in notnullkeys[keysCount] */
889  chosen = &notnullkeys[keysCount];
890  ScanKeyEntryInitialize(chosen,
892  (impliesNN->sk_flags &
894  curattr,
895  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
898  InvalidOid,
899  InvalidOid,
900  InvalidOid,
901  (Datum) 0);
902  }
903 
904  /*
905  * If we still didn't find a usable boundary key, quit; else
906  * save the boundary key pointer in startKeys.
907  */
908  if (chosen == NULL)
909  break;
910  startKeys[keysCount++] = chosen;
911 
912  /*
913  * Adjust strat_total, and quit if we have stored a > or <
914  * key.
915  */
916  strat = chosen->sk_strategy;
917  if (strat != BTEqualStrategyNumber)
918  {
919  strat_total = strat;
920  if (strat == BTGreaterStrategyNumber ||
921  strat == BTLessStrategyNumber)
922  break;
923  }
924 
925  /*
926  * Done if that was the last attribute, or if next key is not
927  * in sequence (implying no boundary key is available for the
928  * next attribute).
929  */
930  if (i >= so->numberOfKeys ||
931  cur->sk_attno != curattr + 1)
932  break;
933 
934  /*
935  * Reset for next attr.
936  */
937  curattr = cur->sk_attno;
938  chosen = NULL;
939  impliesNN = NULL;
940  }
941 
942  /*
943  * Can we use this key as a starting boundary for this attr?
944  *
945  * If not, does it imply a NOT NULL constraint? (Because
946  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
947  * *any* inequality key works for that; we need not test.)
948  */
949  switch (cur->sk_strategy)
950  {
953  if (chosen == NULL)
954  {
955  if (ScanDirectionIsBackward(dir))
956  chosen = cur;
957  else
958  impliesNN = cur;
959  }
960  break;
962  /* override any non-equality choice */
963  chosen = cur;
964  break;
967  if (chosen == NULL)
968  {
969  if (ScanDirectionIsForward(dir))
970  chosen = cur;
971  else
972  impliesNN = cur;
973  }
974  break;
975  }
976  }
977  }
978 
979  /*
980  * If we found no usable boundary keys, we have to start from one end of
981  * the tree. Walk down that edge to the first or last key, and scan from
982  * there.
983  */
984  if (keysCount == 0)
985  {
986  bool match;
987 
988  match = _bt_endpoint(scan, dir);
989 
990  if (!match)
991  {
992  /* No match, so mark (parallel) scan finished */
993  _bt_parallel_done(scan);
994  }
995 
996  return match;
997  }
998 
999  /*
1000  * We want to start the scan somewhere within the index. Set up an
1001  * insertion scankey we can use to search for the boundary point we
1002  * identified above. The insertion scankey is built using the keys
1003  * identified by startKeys[]. (Remaining insertion scankey fields are
1004  * initialized after initial-positioning strategy is finalized.)
1005  */
1006  Assert(keysCount <= INDEX_MAX_KEYS);
1007  for (i = 0; i < keysCount; i++)
1008  {
1009  ScanKey cur = startKeys[i];
1010 
1011  Assert(cur->sk_attno == i + 1);
1012 
1013  if (cur->sk_flags & SK_ROW_HEADER)
1014  {
1015  /*
1016  * Row comparison header: look to the first row member instead.
1017  *
1018  * The member scankeys are already in insertion format (ie, they
1019  * have sk_func = 3-way-comparison function), but we have to watch
1020  * out for nulls, which _bt_preprocess_keys didn't check. A null
1021  * in the first row member makes the condition unmatchable, just
1022  * like qual_ok = false.
1023  */
1024  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1025 
1026  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1027  if (subkey->sk_flags & SK_ISNULL)
1028  {
1029  _bt_parallel_done(scan);
1030  return false;
1031  }
1032  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1033 
1034  /*
1035  * If the row comparison is the last positioning key we accepted,
1036  * try to add additional keys from the lower-order row members.
1037  * (If we accepted independent conditions on additional index
1038  * columns, we use those instead --- doesn't seem worth trying to
1039  * determine which is more restrictive.) Note that this is OK
1040  * even if the row comparison is of ">" or "<" type, because the
1041  * condition applied to all but the last row member is effectively
1042  * ">=" or "<=", and so the extra keys don't break the positioning
1043  * scheme. But, by the same token, if we aren't able to use all
1044  * the row members, then the part of the row comparison that we
1045  * did use has to be treated as just a ">=" or "<=" condition, and
1046  * so we'd better adjust strat_total accordingly.
1047  */
1048  if (i == keysCount - 1)
1049  {
1050  bool used_all_subkeys = false;
1051 
1052  Assert(!(subkey->sk_flags & SK_ROW_END));
1053  for (;;)
1054  {
1055  subkey++;
1056  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1057  if (subkey->sk_attno != keysCount + 1)
1058  break; /* out-of-sequence, can't use it */
1059  if (subkey->sk_strategy != cur->sk_strategy)
1060  break; /* wrong direction, can't use it */
1061  if (subkey->sk_flags & SK_ISNULL)
1062  break; /* can't use null keys */
1063  Assert(keysCount < INDEX_MAX_KEYS);
1064  memcpy(inskey.scankeys + keysCount, subkey,
1065  sizeof(ScanKeyData));
1066  keysCount++;
1067  if (subkey->sk_flags & SK_ROW_END)
1068  {
1069  used_all_subkeys = true;
1070  break;
1071  }
1072  }
1073  if (!used_all_subkeys)
1074  {
1075  switch (strat_total)
1076  {
1077  case BTLessStrategyNumber:
1078  strat_total = BTLessEqualStrategyNumber;
1079  break;
1081  strat_total = BTGreaterEqualStrategyNumber;
1082  break;
1083  }
1084  }
1085  break; /* done with outer loop */
1086  }
1087  }
1088  else
1089  {
1090  /*
1091  * Ordinary comparison key. Transform the search-style scan key
1092  * to an insertion scan key by replacing the sk_func with the
1093  * appropriate btree comparison function.
1094  *
1095  * If scankey operator is not a cross-type comparison, we can use
1096  * the cached comparison function; otherwise gotta look it up in
1097  * the catalogs. (That can't lead to infinite recursion, since no
1098  * indexscan initiated by syscache lookup will use cross-data-type
1099  * operators.)
1100  *
1101  * We support the convention that sk_subtype == InvalidOid means
1102  * the opclass input type; this is a hack to simplify life for
1103  * ScanKeyInit().
1104  */
1105  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1106  cur->sk_subtype == InvalidOid)
1107  {
1108  FmgrInfo *procinfo;
1109 
1110  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1111  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1112  cur->sk_flags,
1113  cur->sk_attno,
1115  cur->sk_subtype,
1116  cur->sk_collation,
1117  procinfo,
1118  cur->sk_argument);
1119  }
1120  else
1121  {
1122  RegProcedure cmp_proc;
1123 
1124  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1125  rel->rd_opcintype[i],
1126  cur->sk_subtype,
1127  BTORDER_PROC);
1128  if (!RegProcedureIsValid(cmp_proc))
1129  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1130  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1131  cur->sk_attno, RelationGetRelationName(rel));
1132  ScanKeyEntryInitialize(inskey.scankeys + i,
1133  cur->sk_flags,
1134  cur->sk_attno,
1136  cur->sk_subtype,
1137  cur->sk_collation,
1138  cmp_proc,
1139  cur->sk_argument);
1140  }
1141  }
1142  }
1143 
1144  /*----------
1145  * Examine the selected initial-positioning strategy to determine exactly
1146  * where we need to start the scan, and set flag variables to control the
1147  * code below.
1148  *
1149  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1150  * item >= scan key. If nextkey = true, they will locate the first
1151  * item > scan key.
1152  *
1153  * If goback = true, we will then step back one item, while if
1154  * goback = false, we will start the scan on the located item.
1155  *----------
1156  */
1157  switch (strat_total)
1158  {
1159  case BTLessStrategyNumber:
1160 
1161  /*
1162  * Find first item >= scankey, then back up one to arrive at last
1163  * item < scankey. (Note: this positioning strategy is only used
1164  * for a backward scan, so that is always the correct starting
1165  * position.)
1166  */
1167  nextkey = false;
1168  goback = true;
1169  break;
1170 
1172 
1173  /*
1174  * Find first item > scankey, then back up one to arrive at last
1175  * item <= scankey. (Note: this positioning strategy is only used
1176  * for a backward scan, so that is always the correct starting
1177  * position.)
1178  */
1179  nextkey = true;
1180  goback = true;
1181  break;
1182 
1183  case BTEqualStrategyNumber:
1184 
1185  /*
1186  * If a backward scan was specified, need to start with last equal
1187  * item not first one.
1188  */
1189  if (ScanDirectionIsBackward(dir))
1190  {
1191  /*
1192  * This is the same as the <= strategy. We will check at the
1193  * end whether the found item is actually =.
1194  */
1195  nextkey = true;
1196  goback = true;
1197  }
1198  else
1199  {
1200  /*
1201  * This is the same as the >= strategy. We will check at the
1202  * end whether the found item is actually =.
1203  */
1204  nextkey = false;
1205  goback = false;
1206  }
1207  break;
1208 
1210 
1211  /*
1212  * Find first item >= scankey. (This is only used for forward
1213  * scans.)
1214  */
1215  nextkey = false;
1216  goback = false;
1217  break;
1218 
1220 
1221  /*
1222  * Find first item > scankey. (This is only used for forward
1223  * scans.)
1224  */
1225  nextkey = true;
1226  goback = false;
1227  break;
1228 
1229  default:
1230  /* can't get here, but keep compiler quiet */
1231  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1232  return false;
1233  }
1234 
1235  /* Initialize remaining insertion scan key fields */
1236  inskey.heapkeyspace = _bt_heapkeyspace(rel);
1237  inskey.nextkey = nextkey;
1238  inskey.pivotsearch = false;
1239  inskey.scantid = NULL;
1240  inskey.keysz = keysCount;
1241 
1242  /*
1243  * Use the manufactured insertion scan key to descend the tree and
1244  * position ourselves on the target leaf page.
1245  */
1246  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1247 
1248  /* don't need to keep the stack around... */
1249  _bt_freestack(stack);
1250 
1251  if (!BufferIsValid(buf))
1252  {
1253  /*
1254  * We only get here if the index is completely empty. Lock relation
1255  * because nothing finer to lock exists.
1256  */
1257  PredicateLockRelation(rel, scan->xs_snapshot);
1258 
1259  /*
1260  * mark parallel scan as done, so that all the workers can finish
1261  * their scan
1262  */
1263  _bt_parallel_done(scan);
1265 
1266  return false;
1267  }
1268  else
1270  scan->xs_snapshot);
1271 
1272  _bt_initialize_more_data(so, dir);
1273 
1274  /* position to the precise item on the page */
1275  offnum = _bt_binsrch(rel, &inskey, buf);
1276 
1277  /*
1278  * If nextkey = false, we are positioned at the first item >= scan key, or
1279  * possibly at the end of a page on which all the existing items are less
1280  * than the scan key and we know that everything on later pages is greater
1281  * than or equal to scan key.
1282  *
1283  * If nextkey = true, we are positioned at the first item > scan key, or
1284  * possibly at the end of a page on which all the existing items are less
1285  * than or equal to the scan key and we know that everything on later
1286  * pages is greater than scan key.
1287  *
1288  * The actually desired starting point is either this item or the prior
1289  * one, or in the end-of-page case it's the first item on the next page or
1290  * the last item on this page. Adjust the starting offset if needed. (If
1291  * this results in an offset before the first item or after the last one,
1292  * _bt_readpage will report no items found, and then we'll step to the
1293  * next page as needed.)
1294  */
1295  if (goback)
1296  offnum = OffsetNumberPrev(offnum);
1297 
1298  /* remember which buffer we have pinned, if any */
1300  so->currPos.buf = buf;
1301 
1302  /*
1303  * Now load data from the first page of the scan.
1304  */
1305  if (!_bt_readpage(scan, dir, offnum))
1306  {
1307  /*
1308  * There's no actually-matching data on this page. Try to advance to
1309  * the next page. Return false if there's no matching data at all.
1310  */
1312  if (!_bt_steppage(scan, dir))
1313  return false;
1314  }
1315  else
1316  {
1317  /* Drop the lock, and maybe the pin, on the current page */
1319  }
1320 
1321 readcomplete:
1322  /* OK, itemIndex says what to return */
1323  currItem = &so->currPos.items[so->currPos.itemIndex];
1324  scan->xs_heaptid = currItem->heapTid;
1325  if (scan->xs_want_itup)
1326  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1327 
1328  return true;
1329 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:158
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
#define BTORDER_PROC
Definition: nbtree.h:393
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:820
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:128
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:573
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:720
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1844
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:571
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:645
regproc RegProcedure
Definition: c.h:505
#define P_NONE
Definition: nbtree.h:181
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:595
struct SnapshotData * xs_snapshot
Definition: relscan.h:105
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:345
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:104
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:403
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:133
static char * buf
Definition: pg_test_fsync.c:67
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2075
#define RegProcedureIsValid(p)
Definition: c.h:640
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:444
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1321
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:639
Oid * rd_opfamily
Definition: rel.h:164
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:739
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:673
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2165
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:152
uintptr_t Datum
Definition: postgres.h:367
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3552
bool _bt_heapkeyspace(Relation rel)
Definition: nbtpage.c:686
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:672
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:601
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int numberOfKeys
Definition: nbtree.h:624
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define DatumGetPointer(X)
Definition: postgres.h:549
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
BTScanPosData currPos
Definition: nbtree.h:658
ScanKey keyData
Definition: nbtree.h:625
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:226
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1559
Buffer buf
Definition: nbtree.h:541
Oid * rd_opcintype
Definition: rel.h:165
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:226
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1401
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 1993 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, buf, BufferGetPage, BufferIsValid, elog, ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

1995 {
1996  Buffer buf;
1997  Page page;
1998  BTPageOpaque opaque;
1999  OffsetNumber offnum;
2000  BlockNumber blkno;
2001  IndexTuple itup;
2002 
2003  /*
2004  * If we are looking for a leaf page, okay to descend from fast root;
2005  * otherwise better descend from true root. (There is no point in being
2006  * smarter about intermediate levels.)
2007  */
2008  if (level == 0)
2009  buf = _bt_getroot(rel, BT_READ);
2010  else
2011  buf = _bt_gettrueroot(rel);
2012 
2013  if (!BufferIsValid(buf))
2014  return InvalidBuffer;
2015 
2016  page = BufferGetPage(buf);
2017  TestForOldSnapshot(snapshot, rel, page);
2018  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2019 
2020  for (;;)
2021  {
2022  /*
2023  * If we landed on a deleted page, step right to find a live page
2024  * (there must be one). Also, if we want the rightmost page, step
2025  * right if needed to get to it (this could happen if the page split
2026  * since we obtained a pointer to it).
2027  */
2028  while (P_IGNORE(opaque) ||
2029  (rightmost && !P_RIGHTMOST(opaque)))
2030  {
2031  blkno = opaque->btpo_next;
2032  if (blkno == P_NONE)
2033  elog(ERROR, "fell off the end of index \"%s\"",
2035  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2036  page = BufferGetPage(buf);
2037  TestForOldSnapshot(snapshot, rel, page);
2038  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2039  }
2040 
2041  /* Done? */
2042  if (opaque->btpo.level == level)
2043  break;
2044  if (opaque->btpo.level < level)
2045  elog(ERROR, "btree level %u not found in index \"%s\"",
2046  level, RelationGetRelationName(rel));
2047 
2048  /* Descend to leftmost or rightmost child page */
2049  if (rightmost)
2050  offnum = PageGetMaxOffsetNumber(page);
2051  else
2052  offnum = P_FIRSTDATAKEY(opaque);
2053 
2054  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2055  blkno = BTreeInnerTupleGetDownLink(itup);
2056 
2057  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2058  page = BufferGetPage(buf);
2059  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2060  }
2061 
2062  return buf;
2063 }
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:302
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:934
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:403
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:444
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:293
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
uint32 level
Definition: nbtree.h:61
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:541
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define elog(elevel,...)
Definition: elog.h:226
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inlinestatic

Definition at line 2165 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

2166 {
2167  /* initialize moreLeft/moreRight appropriately for scan direction */
2168  if (ScanDirectionIsForward(dir))
2169  {
2170  so->currPos.moreLeft = false;
2171  so->currPos.moreRight = true;
2172  }
2173  else
2174  {
2175  so->currPos.moreLeft = true;
2176  so->currPos.moreRight = false;
2177  }
2178  so->numKilled = 0; /* just paranoia */
2179  so->markItemIndex = -1; /* ditto */
2180 }
bool moreRight
Definition: nbtree.h:554
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
bool moreLeft
Definition: nbtree.h:553
int markItemIndex
Definition: nbtree.h:655
BTScanPosData currPos
Definition: nbtree.h:658

◆ _bt_moveright()

Buffer _bt_moveright ( Relation  rel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access,
Snapshot  snapshot 
)

Definition at line 243 of file nbtsearch.c.

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo_next, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, elog, ERROR, LockBuffer(), BTScanInsertData::nextkey, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_search().

250 {
251  Page page;
252  BTPageOpaque opaque;
253  int32 cmpval;
254 
255  /*
256  * When nextkey = false (normal case): if the scan key that brought us to
257  * this page is > the high key stored on the page, then the page has split
258  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
259  * have some duplicates to the right as well as the left, but that's
260  * something that's only ever dealt with on the leaf level, after
261  * _bt_search has found an initial leaf page.)
262  *
263  * When nextkey = true: move right if the scan key is >= page's high key.
264  * (Note that key.scantid cannot be set in this case.)
265  *
266  * The page could even have split more than once, so scan as far as
267  * needed.
268  *
269  * We also have to move right if we followed a link that brought us to a
270  * dead page.
271  */
272  cmpval = key->nextkey ? 0 : 1;
273 
274  for (;;)
275  {
276  page = BufferGetPage(buf);
277  TestForOldSnapshot(snapshot, rel, page);
278  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
279 
280  if (P_RIGHTMOST(opaque))
281  break;
282 
283  /*
284  * Finish any incomplete splits we encounter along the way.
285  */
286  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
287  {
289 
290  /* upgrade our lock if necessary */
291  if (access == BT_READ)
292  {
295  }
296 
297  if (P_INCOMPLETE_SPLIT(opaque))
298  _bt_finish_split(rel, buf, stack);
299  else
300  _bt_relbuf(rel, buf);
301 
302  /* re-acquire the lock in the right mode, and re-check */
303  buf = _bt_getbuf(rel, blkno, access);
304  continue;
305  }
306 
307  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
308  {
309  /* step right one page */
310  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
311  continue;
312  }
313  else
314  break;
315  }
316 
317  if (P_IGNORE(opaque))
318  elog(ERROR, "fell off the end of index \"%s\"",
320 
321  return buf;
322 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:934
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:798
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
#define BT_READ
Definition: nbtree.h:403
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1789
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:67
#define RelationGetRelationName(relation)
Definition: rel.h:444
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:558
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3552
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:953
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
#define elog(elevel,...)
Definition: elog.h:226
#define BT_WRITE
Definition: nbtree.h:404
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:74

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1346 of file nbtsearch.c.

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

1347 {
1348  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1349  BTScanPosItem *currItem;
1350 
1351  /*
1352  * Advance to next tuple on current page; or if there's no more, try to
1353  * step to the next page with data.
1354  */
1355  if (ScanDirectionIsForward(dir))
1356  {
1357  if (++so->currPos.itemIndex > so->currPos.lastItem)
1358  {
1359  if (!_bt_steppage(scan, dir))
1360  return false;
1361  }
1362  }
1363  else
1364  {
1365  if (--so->currPos.itemIndex < so->currPos.firstItem)
1366  {
1367  if (!_bt_steppage(scan, dir))
1368  return false;
1369  }
1370  }
1371 
1372  /* OK, itemIndex says what to return */
1373  currItem = &so->currPos.items[so->currPos.itemIndex];
1374  scan->xs_heaptid = currItem->heapTid;
1375  if (scan->xs_want_itup)
1376  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1377 
1378  return true;
1379 }
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:128
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:573
int itemIndex
Definition: nbtree.h:571
char * currTuples
Definition: nbtree.h:645
int lastItem
Definition: nbtree.h:570
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
ItemPointerData xs_heaptid
Definition: relscan.h:133
IndexTupleData * IndexTuple
Definition: itup.h:53
int firstItem
Definition: nbtree.h:569
BTScanPosData currPos
Definition: nbtree.h:658
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1559

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 1844 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), BTScanOpaqueData::currPos, and IndexScanDescData::opaque.

Referenced by _bt_first().

1845 {
1846  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1847 
1848  _bt_initialize_more_data(so, dir);
1849 
1850  if (!_bt_readnextpage(scan, blkno, dir))
1851  return false;
1852 
1853  /* Drop the lock, and maybe the pin, on the current page */
1855 
1856  return true;
1857 }
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2165
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1666
BTScanPosData currPos
Definition: nbtree.h:658

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 1666 of file nbtsearch.c.

References _bt_getbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBuffer, LockBuffer(), BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber, PageGetSpecialPointer, IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, status(), TestForOldSnapshot(), and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().

1667 {
1668  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1669  Relation rel;
1670  Page page;
1671  BTPageOpaque opaque;
1672  bool status = true;
1673 
1674  rel = scan->indexRelation;
1675 
1676  if (ScanDirectionIsForward(dir))
1677  {
1678  for (;;)
1679  {
1680  /*
1681  * if we're at end of scan, give up and mark parallel scan as
1682  * done, so that all the workers can finish their scan
1683  */
1684  if (blkno == P_NONE || !so->currPos.moreRight)
1685  {
1686  _bt_parallel_done(scan);
1688  return false;
1689  }
1690  /* check for interrupts while we're not holding any buffer lock */
1692  /* step right one page */
1693  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1694  page = BufferGetPage(so->currPos.buf);
1695  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1696  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1697  /* check for deleted page */
1698  if (!P_IGNORE(opaque))
1699  {
1700  PredicateLockPage(rel, blkno, scan->xs_snapshot);
1701  /* see if there are any matches on this page */
1702  /* note that this will clear moreRight if we can stop */
1703  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1704  break;
1705  }
1706  else if (scan->parallel_scan != NULL)
1707  {
1708  /* allow next page be processed by parallel worker */
1709  _bt_parallel_release(scan, opaque->btpo_next);
1710  }
1711 
1712  /* nope, keep going */
1713  if (scan->parallel_scan != NULL)
1714  {
1715  _bt_relbuf(rel, so->currPos.buf);
1716  status = _bt_parallel_seize(scan, &blkno);
1717  if (!status)
1718  {
1720  return false;
1721  }
1722  }
1723  else
1724  {
1725  blkno = opaque->btpo_next;
1726  _bt_relbuf(rel, so->currPos.buf);
1727  }
1728  }
1729  }
1730  else
1731  {
1732  /*
1733  * Should only happen in parallel cases, when some other backend
1734  * advanced the scan.
1735  */
1736  if (so->currPos.currPage != blkno)
1737  {
1739  so->currPos.currPage = blkno;
1740  }
1741 
1742  /*
1743  * Walk left to the next page with data. This is much more complex
1744  * than the walk-right case because of the possibility that the page
1745  * to our left splits while we are in flight to it, plus the
1746  * possibility that the page we were on gets deleted after we leave
1747  * it. See nbtree/README for details.
1748  *
1749  * It might be possible to rearrange this code to have less overhead
1750  * in pinning and locking, but that would require capturing the left
1751  * pointer when the page is initially read, and using it here, along
1752  * with big changes to _bt_walk_left() and the code below. It is not
1753  * clear whether this would be a win, since if the page immediately to
1754  * the left splits after we read this page and before we step left, we
1755  * would need to visit more pages than with the current code.
1756  *
1757  * Note that if we change the code so that we drop the pin for a scan
1758  * which uses a non-MVCC snapshot, we will need to modify the code for
1759  * walking left, to allow for the possibility that a referenced page
1760  * has been deleted. As long as the buffer is pinned or the snapshot
1761  * is MVCC the page cannot move past the half-dead state to fully
1762  * deleted.
1763  */
1764  if (BTScanPosIsPinned(so->currPos))
1765  LockBuffer(so->currPos.buf, BT_READ);
1766  else
1767  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1768 
1769  for (;;)
1770  {
1771  /* Done if we know there are no matching keys to the left */
1772  if (!so->currPos.moreLeft)
1773  {
1774  _bt_relbuf(rel, so->currPos.buf);
1775  _bt_parallel_done(scan);
1777  return false;
1778  }
1779 
1780  /* Step to next physical page */
1781  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1782  scan->xs_snapshot);
1783 
1784  /* if we're physically at end of index, return failure */
1785  if (so->currPos.buf == InvalidBuffer)
1786  {
1787  _bt_parallel_done(scan);
1789  return false;
1790  }
1791 
1792  /*
1793  * Okay, we managed to move left to a non-deleted page. Done if
1794  * it's not half-dead and contains matching tuples. Else loop back
1795  * and do it all again.
1796  */
1797  page = BufferGetPage(so->currPos.buf);
1798  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1799  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1800  if (!P_IGNORE(opaque))
1801  {
1803  /* see if there are any matches on this page */
1804  /* note that this will clear moreLeft if we can stop */
1805  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1806  break;
1807  }
1808  else if (scan->parallel_scan != NULL)
1809  {
1810  /* allow next page be processed by parallel worker */
1812  }
1813 
1814  /*
1815  * For parallel scans, get the last page scanned as it is quite
1816  * possible that by the time we try to seize the scan, some other
1817  * worker has already advanced the scan to a different page. We
1818  * must continue based on the latest page scanned by any worker.
1819  */
1820  if (scan->parallel_scan != NULL)
1821  {
1822  _bt_relbuf(rel, so->currPos.buf);
1823  status = _bt_parallel_seize(scan, &blkno);
1824  if (!status)
1825  {
1827  return false;
1828  }
1829  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1830  }
1831  }
1832  }
1833 
1834  return true;
1835 }
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool moreRight
Definition: nbtree.h:554
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:720
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:798
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
BlockNumber currPage
Definition: nbtree.h:544
struct SnapshotData * xs_snapshot
Definition: relscan.h:105
bool moreLeft
Definition: nbtree.h:553
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Relation indexRelation
Definition: relscan.h:104
#define BT_READ
Definition: nbtree.h:403
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:578
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:639
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:152
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3552
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:953
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:601
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:697
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
BTScanPosData currPos
Definition: nbtree.h:658
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
Buffer buf
Definition: nbtree.h:541
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:226
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1401
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:589
Pointer Page
Definition: bufpage.h:74
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
Definition: nbtsearch.c:1874

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum 
)
static

Definition at line 1401 of file nbtsearch.c.

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), Assert, BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage, BufferIsValid, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxIndexTuplesPerPage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetMaxOffsetNumber, PageGetSpecialPointer, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().

1402 {
1403  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1404  Page page;
1405  BTPageOpaque opaque;
1406  OffsetNumber minoff;
1407  OffsetNumber maxoff;
1408  int itemIndex;
1409  IndexTuple itup;
1410  bool continuescan;
1411 
1412  /*
1413  * We must have the buffer pinned and locked, but the usual macro can't be
1414  * used here; this function is what makes it good for currPos.
1415  */
1417 
1418  page = BufferGetPage(so->currPos.buf);
1419  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1420 
1421  /* allow next page be processed by parallel worker */
1422  if (scan->parallel_scan)
1423  {
1424  if (ScanDirectionIsForward(dir))
1425  _bt_parallel_release(scan, opaque->btpo_next);
1426  else
1428  }
1429 
1430  minoff = P_FIRSTDATAKEY(opaque);
1431  maxoff = PageGetMaxOffsetNumber(page);
1432 
1433  /*
1434  * We note the buffer's block number so that we can release the pin later.
1435  * This allows us to re-read the buffer if it is needed again for hinting.
1436  */
1438 
1439  /*
1440  * We save the LSN of the page as we read it, so that we know whether it
1441  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1442  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1443  */
1445 
1446  /*
1447  * we must save the page's right-link while scanning it; this tells us
1448  * where to step right to after we're done with these items. There is no
1449  * corresponding need for the left-link, since splits always go right.
1450  */
1451  so->currPos.nextPage = opaque->btpo_next;
1452 
1453  /* initialize tuple workspace to empty */
1454  so->currPos.nextTupleOffset = 0;
1455 
1456  /*
1457  * Now that the current page has been made consistent, the macro should be
1458  * good.
1459  */
1461 
1462  if (ScanDirectionIsForward(dir))
1463  {
1464  /* load items[] in ascending order */
1465  itemIndex = 0;
1466 
1467  offnum = Max(offnum, minoff);
1468 
1469  while (offnum <= maxoff)
1470  {
1471  itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1472  if (itup != NULL)
1473  {
1474  /* tuple passes all scan key conditions, so remember it */
1475  _bt_saveitem(so, itemIndex, offnum, itup);
1476  itemIndex++;
1477  }
1478  if (!continuescan)
1479  {
1480  /* there can't be any more matches, so stop */
1481  so->currPos.moreRight = false;
1482  break;
1483  }
1484 
1485  offnum = OffsetNumberNext(offnum);
1486  }
1487 
1488  Assert(itemIndex <= MaxIndexTuplesPerPage);
1489  so->currPos.firstItem = 0;
1490  so->currPos.lastItem = itemIndex - 1;
1491  so->currPos.itemIndex = 0;
1492  }
1493  else
1494  {
1495  /* load items[] in descending order */
1496  itemIndex = MaxIndexTuplesPerPage;
1497 
1498  offnum = Min(offnum, maxoff);
1499 
1500  while (offnum >= minoff)
1501  {
1502  itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1503  if (itup != NULL)
1504  {
1505  /* tuple passes all scan key conditions, so remember it */
1506  itemIndex--;
1507  _bt_saveitem(so, itemIndex, offnum, itup);
1508  }
1509  if (!continuescan)
1510  {
1511  /* there can't be any more matches, so stop */
1512  so->currPos.moreLeft = false;
1513  break;
1514  }
1515 
1516  offnum = OffsetNumberPrev(offnum);
1517  }
1518 
1519  Assert(itemIndex >= 0);
1520  so->currPos.firstItem = itemIndex;
1523  }
1524 
1525  return (so->currPos.firstItem <= so->currPos.lastItem);
1526 }
bool moreRight
Definition: nbtree.h:554
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1353
int itemIndex
Definition: nbtree.h:571
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:890
BlockNumber currPage
Definition: nbtree.h:544
bool moreLeft
Definition: nbtree.h:553
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
int nextTupleOffset
Definition: nbtree.h:560
int lastItem
Definition: nbtree.h:570
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:2838
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:578
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1530
int firstItem
Definition: nbtree.h:569
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:152
#define Max(x, y)
Definition: c.h:884
#define Assert(condition)
Definition: c.h:732
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:697
BlockNumber nextPage
Definition: nbtree.h:545
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
BTScanPosData currPos
Definition: nbtree.h:658
#define MaxIndexTuplesPerPage
Definition: itup.h:145
XLogRecPtr lsn
Definition: nbtree.h:543
Buffer buf
Definition: nbtree.h:541
Pointer Page
Definition: bufpage.h:74

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1530 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

1532 {
1533  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1534 
1535  currItem->heapTid = itup->t_tid;
1536  currItem->indexOffset = offnum;
1537  if (so->currTuples)
1538  {
1539  Size itupsz = IndexTupleSize(itup);
1540 
1541  currItem->tupleOffset = so->currPos.nextTupleOffset;
1542  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1543  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1544  }
1545 }
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:573
char * currTuples
Definition: nbtree.h:645
ItemPointerData t_tid
Definition: itup.h:37
OffsetNumber indexOffset
Definition: nbtree.h:535
int nextTupleOffset
Definition: nbtree.h:560
size_t Size
Definition: c.h:466
#define MAXALIGN(LEN)
Definition: c.h:685
ItemPointerData heapTid
Definition: nbtree.h:534
BTScanPosData currPos
Definition: nbtree.h:658
LocationIndex tupleOffset
Definition: nbtree.h:536
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
BTScanInsert  key,
Buffer bufP,
int  access,
Snapshot  snapshot 
)

Definition at line 92 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_moveright(), _bt_relandgetbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTPageOpaqueData::level, LockBuffer(), P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, and palloc().

Referenced by _bt_doinsert(), _bt_first(), _bt_pagedel(), and bt_rootdescend().

94 {
95  BTStack stack_in = NULL;
96  int page_access = BT_READ;
97 
98  /* Get the root page to start with */
99  *bufP = _bt_getroot(rel, access);
100 
101  /* If index is empty and access = BT_READ, no root page is created. */
102  if (!BufferIsValid(*bufP))
103  return (BTStack) NULL;
104 
105  /* Loop iterates once per level descended in the tree */
106  for (;;)
107  {
108  Page page;
109  BTPageOpaque opaque;
110  OffsetNumber offnum;
111  ItemId itemid;
112  IndexTuple itup;
113  BlockNumber blkno;
114  BlockNumber par_blkno;
115  BTStack new_stack;
116 
117  /*
118  * Race -- the page we just grabbed may have split since we read its
119  * pointer in the parent (or metapage). If it has, we may need to
120  * move right to its new sibling. Do that.
121  *
122  * In write-mode, allow _bt_moveright to finish any incomplete splits
123  * along the way. Strictly speaking, we'd only need to finish an
124  * incomplete split on the leaf page we're about to insert to, not on
125  * any of the upper levels (they are taken care of in _bt_getstackbuf,
126  * if the leaf page is split and we insert to the parent page). But
127  * this is a good opportunity to finish splits of internal pages too.
128  */
129  *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130  page_access, snapshot);
131 
132  /* if this is a leaf page, we're done */
133  page = BufferGetPage(*bufP);
134  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135  if (P_ISLEAF(opaque))
136  break;
137 
138  /*
139  * Find the appropriate item on the internal page, and get the child
140  * page that it points to.
141  */
142  offnum = _bt_binsrch(rel, key, *bufP);
143  itemid = PageGetItemId(page, offnum);
144  itup = (IndexTuple) PageGetItem(page, itemid);
145  blkno = BTreeInnerTupleGetDownLink(itup);
146  par_blkno = BufferGetBlockNumber(*bufP);
147 
148  /*
149  * We need to save the location of the index entry we chose in the
150  * parent page on a stack. In case we split the tree, we'll use the
151  * stack to work back up to the parent page. We also save the actual
152  * downlink (block) to uniquely identify the index entry, in case it
153  * moves right while we're working lower in the tree. See the paper
154  * by Lehman and Yao for how this is detected and handled. (We use the
155  * child link during the second half of a page split -- if caller ends
156  * up splitting the child it usually ends up inserting a new pivot
157  * tuple for child's new right sibling immediately after the original
158  * bts_offset offset recorded here. The downlink block will be needed
159  * to check if bts_offset remains the position of this same pivot
160  * tuple.)
161  */
162  new_stack = (BTStack) palloc(sizeof(BTStackData));
163  new_stack->bts_blkno = par_blkno;
164  new_stack->bts_offset = offnum;
165  new_stack->bts_btentry = blkno;
166  new_stack->bts_parent = stack_in;
167 
168  /*
169  * Page level 1 is lowest non-leaf page level prior to leaves. So,
170  * if we're on the level 1 and asked to lock leaf page in write mode,
171  * then lock next page in write mode, because it must be a leaf.
172  */
173  if (opaque->btpo.level == 1 && access == BT_WRITE)
174  page_access = BT_WRITE;
175 
176  /* drop the read lock on the parent page, acquire one on the child */
177  *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
178 
179  /* okay, all set to move down a level */
180  stack_in = new_stack;
181  }
182 
183  /*
184  * If we're asked to lock leaf in write mode, but didn't manage to, then
185  * relock. That may happen when the root page appears to be leaf.
186  */
187  if (access == BT_WRITE && page_access == BT_READ)
188  {
189  /* trade in our read lock for a write lock */
191  LockBuffer(*bufP, BT_WRITE);
192 
193  /*
194  * If the page was split between the time that we surrendered our read
195  * lock and acquired our write lock, then this page may no longer be
196  * the right place for the key we want to insert. In this case, we
197  * need to move right in the tree. See Lehman and Yao for an
198  * excruciatingly precise description.
199  */
200  *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
201  snapshot);
202  }
203 
204  return stack_in;
205 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:302
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:934
union BTPageOpaqueData::@46 btpo
BTStackData * BTStack
Definition: nbtree.h:424
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:345
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:403
OffsetNumber bts_offset
Definition: nbtree.h:419
IndexTupleData * IndexTuple
Definition: itup.h:53
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:293
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
uint32 level
Definition: nbtree.h:61
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3552
BlockNumber bts_blkno
Definition: nbtree.h:418
BlockNumber bts_btentry
Definition: nbtree.h:420
Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:243
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
struct BTStackData * bts_parent
Definition: nbtree.h:421
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
void * palloc(Size size)
Definition: mcxt.c:924
#define BT_WRITE
Definition: nbtree.h:404
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1559 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, ScanDirectionIsForward, and status().

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

1560 {
1561  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1563  bool status = true;
1564 
1566 
1567  /* Before leaving current page, deal with any killed items */
1568  if (so->numKilled > 0)
1569  _bt_killitems(scan);
1570 
1571  /*
1572  * Before we modify currPos, make a copy of the page data if there was a
1573  * mark position that needs it.
1574  */
1575  if (so->markItemIndex >= 0)
1576  {
1577  /* bump pin on current buffer for assignment to mark buffer */
1578  if (BTScanPosIsPinned(so->currPos))
1580  memcpy(&so->markPos, &so->currPos,
1581  offsetof(BTScanPosData, items[1]) +
1582  so->currPos.lastItem * sizeof(BTScanPosItem));
1583  if (so->markTuples)
1584  memcpy(so->markTuples, so->currTuples,
1585  so->currPos.nextTupleOffset);
1586  so->markPos.itemIndex = so->markItemIndex;
1587  so->markItemIndex = -1;
1588  }
1589 
1590  if (ScanDirectionIsForward(dir))
1591  {
1592  /* Walk right to the next page with data */
1593  if (scan->parallel_scan != NULL)
1594  {
1595  /*
1596  * Seize the scan to get the next block number; if the scan has
1597  * ended already, bail out.
1598  */
1599  status = _bt_parallel_seize(scan, &blkno);
1600  if (!status)
1601  {
1602  /* release the previous buffer, if pinned */
1605  return false;
1606  }
1607  }
1608  else
1609  {
1610  /* Not parallel, so use the previously-saved nextPage link. */
1611  blkno = so->currPos.nextPage;
1612  }
1613 
1614  /* Remember we left a page with data */
1615  so->currPos.moreLeft = true;
1616 
1617  /* release the previous buffer, if pinned */
1619  }
1620  else
1621  {
1622  /* Remember we left a page with data */
1623  so->currPos.moreRight = true;
1624 
1625  if (scan->parallel_scan != NULL)
1626  {
1627  /*
1628  * Seize the scan to get the current block number; if the scan has
1629  * ended already, bail out.
1630  */
1631  status = _bt_parallel_seize(scan, &blkno);
1633  if (!status)
1634  {
1636  return false;
1637  }
1638  }
1639  else
1640  {
1641  /* Not parallel, so just use our own notion of the current page */
1642  blkno = so->currPos.currPage;
1643  }
1644  }
1645 
1646  if (!_bt_readnextpage(scan, blkno, dir))
1647  return false;
1648 
1649  /* Drop the lock, and maybe the pin, on the current page */
1651 
1652  return true;
1653 }
bool moreRight
Definition: nbtree.h:554
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:571
char * currTuples
Definition: nbtree.h:645
BlockNumber currPage
Definition: nbtree.h:544
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:595
uint32 BlockNumber
Definition: block.h:31
bool moreLeft
Definition: nbtree.h:553
int nextTupleOffset
Definition: nbtree.h:560
int lastItem
Definition: nbtree.h:570
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1727
BTScanPosData markPos
Definition: nbtree.h:659
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:662
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:578
char * markTuples
Definition: nbtree.h:646
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:639
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:152
int markItemIndex
Definition: nbtree.h:655
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1666
#define Assert(condition)
Definition: c.h:732
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:601
#define InvalidBlockNumber
Definition: block.h:33
BlockNumber nextPage
Definition: nbtree.h:545
BTScanPosData currPos
Definition: nbtree.h:658
Buffer buf
Definition: nbtree.h:541
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:226
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3353
#define offsetof(type, field)
Definition: c.h:655
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:589

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf,
Snapshot  snapshot 
)
static

Definition at line 1874 of file nbtsearch.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_readnextpage().

1875 {
1876  Page page;
1877  BTPageOpaque opaque;
1878 
1879  page = BufferGetPage(buf);
1880  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1881 
1882  for (;;)
1883  {
1884  BlockNumber obknum;
1885  BlockNumber lblkno;
1886  BlockNumber blkno;
1887  int tries;
1888 
1889  /* if we're at end of tree, release buf and return failure */
1890  if (P_LEFTMOST(opaque))
1891  {
1892  _bt_relbuf(rel, buf);
1893  break;
1894  }
1895  /* remember original page we are stepping left from */
1896  obknum = BufferGetBlockNumber(buf);
1897  /* step left */
1898  blkno = lblkno = opaque->btpo_prev;
1899  _bt_relbuf(rel, buf);
1900  /* check for interrupts while we're not holding any buffer lock */
1902  buf = _bt_getbuf(rel, blkno, BT_READ);
1903  page = BufferGetPage(buf);
1904  TestForOldSnapshot(snapshot, rel, page);
1905  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1906 
1907  /*
1908  * If this isn't the page we want, walk right till we find what we
1909  * want --- but go no more than four hops (an arbitrary limit). If we
1910  * don't find the correct page by then, the most likely bet is that
1911  * the original page got deleted and isn't in the sibling chain at all
1912  * anymore, not that its left sibling got split more than four times.
1913  *
1914  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1915  * because half-dead pages are still in the sibling chain. Caller
1916  * must reject half-dead pages if wanted.
1917  */
1918  tries = 0;
1919  for (;;)
1920  {
1921  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1922  {
1923  /* Found desired page, return it */
1924  return buf;
1925  }
1926  if (P_RIGHTMOST(opaque) || ++tries > 4)
1927  break;
1928  blkno = opaque->btpo_next;
1929  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1930  page = BufferGetPage(buf);
1931  TestForOldSnapshot(snapshot, rel, page);
1932  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1933  }
1934 
1935  /* Return to the original page to see what's up */
1936  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1937  page = BufferGetPage(buf);
1938  TestForOldSnapshot(snapshot, rel, page);
1939  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1940  if (P_ISDELETED(opaque))
1941  {
1942  /*
1943  * It was deleted. Move right to first nondeleted page (there
1944  * must be one); that is the page that has acquired the deleted
1945  * one's keyspace, so stepping left from it will take us where we
1946  * want to be.
1947  */
1948  for (;;)
1949  {
1950  if (P_RIGHTMOST(opaque))
1951  elog(ERROR, "fell off the end of index \"%s\"",
1953  blkno = opaque->btpo_next;
1954  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1955  page = BufferGetPage(buf);
1956  TestForOldSnapshot(snapshot, rel, page);
1957  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1958  if (!P_ISDELETED(opaque))
1959  break;
1960  }
1961 
1962  /*
1963  * Now return to top of loop, resetting obknum to point to this
1964  * nondeleted page, and try again.
1965  */
1966  }
1967  else
1968  {
1969  /*
1970  * It wasn't deleted; the explanation had better be that the page
1971  * to the left got split or deleted. Without this check, we'd go
1972  * into an infinite loop if there's anything wrong.
1973  */
1974  if (opaque->btpo_prev == lblkno)
1975  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1976  obknum, RelationGetRelationName(rel));
1977  /* Okay to try again with new lblkno value */
1978  }
1979  }
1980 
1981  return InvalidBuffer;
1982 }
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:934
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:798
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BT_READ
Definition: nbtree.h:403
#define ERROR
Definition: elog.h:43
BlockNumber btpo_prev
Definition: nbtree.h:57
static char * buf
Definition: pg_test_fsync.c:67
#define RelationGetRelationName(relation)
Definition: rel.h:444
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define P_ISDELETED(opaque)
Definition: nbtree.h:191
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:953
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2611
#define elog(elevel,...)
Definition: elog.h:226
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:74