PostgreSQL Source Code  git master
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static bool _bt_parallel_readpage (IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf, Snapshot snapshot)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_initialize_more_data (BTScanOpaque so, ScanDirection dir)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 339 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, BTScanInsertData::nextkey, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTScanInsertData::scantid, and unlikely.

Referenced by _bt_first(), and _bt_search().

342 {
343  Page page;
344  BTPageOpaque opaque;
345  OffsetNumber low,
346  high;
347  int32 result,
348  cmpval;
349 
350  page = BufferGetPage(buf);
351  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
352 
353  /* Requesting nextkey semantics while using scantid seems nonsensical */
354  Assert(!key->nextkey || key->scantid == NULL);
355  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
356  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
357 
358  low = P_FIRSTDATAKEY(opaque);
359  high = PageGetMaxOffsetNumber(page);
360 
361  /*
362  * If there are no keys on the page, return the first available slot. Note
363  * this covers two cases: the page is really empty (no keys), or it
364  * contains only a high key. The latter case is possible after vacuuming.
365  * This can never happen on an internal page, however, since they are
366  * never empty (an internal page must have children).
367  */
368  if (unlikely(high < low))
369  return low;
370 
371  /*
372  * Binary search to find the first key on the page >= scan key, or first
373  * key > scankey when nextkey is true.
374  *
375  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
376  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
377  *
378  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
379  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
380  *
381  * We can fall out when high == low.
382  */
383  high++; /* establish the loop invariant for high */
384 
385  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
386 
387  while (high > low)
388  {
389  OffsetNumber mid = low + ((high - low) / 2);
390 
391  /* We have low <= mid < high, so mid points at a real slot */
392 
393  result = _bt_compare(rel, key, page, mid);
394 
395  if (result >= cmpval)
396  low = mid + 1;
397  else
398  high = mid;
399  }
400 
401  /*
402  * At this point we have high == low, but be careful: they could point
403  * past the last slot on the page.
404  *
405  * On a leaf page, we always return the first key >= scan key (resp. >
406  * scan key), which could be the last slot + 1.
407  */
408  if (P_ISLEAF(opaque))
409  return low;
410 
411  /*
412  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
413  * There must be one if _bt_compare() is playing by the rules.
414  */
415  Assert(low > P_FIRSTDATAKEY(opaque));
416 
417  return OffsetNumberPrev(low);
418 }
ItemPointer scantid
Definition: nbtree.h:475
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:68
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:554
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define unlikely(x)
Definition: c.h:208
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 440 of file nbtsearch.c.

References _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

441 {
442  BTScanInsert key = insertstate->itup_key;
443  Page page;
444  BTPageOpaque opaque;
445  OffsetNumber low,
446  high,
447  stricthigh;
448  int32 result,
449  cmpval;
450 
451  page = BufferGetPage(insertstate->buf);
452  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
453 
454  Assert(P_ISLEAF(opaque));
455  Assert(!key->nextkey);
456 
457  if (!insertstate->bounds_valid)
458  {
459  /* Start new binary search */
460  low = P_FIRSTDATAKEY(opaque);
461  high = PageGetMaxOffsetNumber(page);
462  }
463  else
464  {
465  /* Restore result of previous binary search against same page */
466  low = insertstate->low;
467  high = insertstate->stricthigh;
468  }
469 
470  /* If there are no keys on the page, return the first available slot */
471  if (unlikely(high < low))
472  {
473  /* Caller can't reuse bounds */
474  insertstate->low = InvalidOffsetNumber;
475  insertstate->stricthigh = InvalidOffsetNumber;
476  insertstate->bounds_valid = false;
477  return low;
478  }
479 
480  /*
481  * Binary search to find the first key on the page >= scan key. (nextkey
482  * is always false when inserting).
483  *
484  * The loop invariant is: all slots before 'low' are < scan key, all slots
485  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
486  * maintained to save additional search effort for caller.
487  *
488  * We can fall out when high == low.
489  */
490  if (!insertstate->bounds_valid)
491  high++; /* establish the loop invariant for high */
492  stricthigh = high; /* high initially strictly higher */
493 
494  cmpval = 1; /* !nextkey comparison value */
495 
496  while (high > low)
497  {
498  OffsetNumber mid = low + ((high - low) / 2);
499 
500  /* We have low <= mid < high, so mid points at a real slot */
501 
502  result = _bt_compare(rel, key, page, mid);
503 
504  if (result >= cmpval)
505  low = mid + 1;
506  else
507  {
508  high = mid;
509  if (result != 0)
510  stricthigh = high;
511  }
512  }
513 
514  /*
515  * On a leaf page, a binary search always returns the first key >= scan
516  * key (at least in !nextkey case), which could be the last slot + 1. This
517  * is also the lower bound of cached search.
518  *
519  * stricthigh may also be the last slot + 1, which prevents caller from
520  * using bounds directly, but is still useful to us if we're called a
521  * second time with cached bounds (cached low will be < stricthigh when
522  * that happens).
523  */
524  insertstate->low = low;
525  insertstate->stricthigh = stricthigh;
526  insertstate->bounds_valid = true;
527 
528  return low;
529 }
bool bounds_valid
Definition: nbtree.h:507
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
OffsetNumber stricthigh
Definition: nbtree.h:509
OffsetNumber low
Definition: nbtree.h:508
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
uint16 OffsetNumber
Definition: off.h:24
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:554
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:497
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define unlikely(x)
Definition: c.h:208
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 554 of file nbtsearch.c.

References _bt_check_natts(), Assert, BTreeTupleGetHeapTID, BTreeTupleGetNAtts, DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_doinsert(), _bt_findinsertloc(), _bt_moveright(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

558 {
559  TupleDesc itupdesc = RelationGetDescr(rel);
561  IndexTuple itup;
562  ItemPointer heapTid;
563  ScanKey scankey;
564  int ncmpkey;
565  int ntupatts;
566 
567  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
569  Assert(key->heapkeyspace || key->scantid == NULL);
570 
571  /*
572  * Force result ">" if target item is first data item on an internal page
573  * --- see NOTE above.
574  */
575  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
576  return 1;
577 
578  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
579  ntupatts = BTreeTupleGetNAtts(itup, rel);
580 
581  /*
582  * The scan key is set up with the attribute number associated with each
583  * term in the key. It is important that, if the index is multi-key, the
584  * scan contain the first k key attributes, and that they be in order. If
585  * you think about how multi-key ordering works, you'll understand why
586  * this is.
587  *
588  * We don't test for violation of this condition here, however. The
589  * initial setup for the index scan had better have gotten it right (see
590  * _bt_first).
591  */
592 
593  ncmpkey = Min(ntupatts, key->keysz);
594  Assert(key->heapkeyspace || ncmpkey == key->keysz);
595  scankey = key->scankeys;
596  for (int i = 1; i <= ncmpkey; i++)
597  {
598  Datum datum;
599  bool isNull;
600  int32 result;
601 
602  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
603 
604  /* see comments about NULLs handling in btbuild */
605  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
606  {
607  if (isNull)
608  result = 0; /* NULL "=" NULL */
609  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
610  result = -1; /* NULL "<" NOT_NULL */
611  else
612  result = 1; /* NULL ">" NOT_NULL */
613  }
614  else if (isNull) /* key is NOT_NULL and item is NULL */
615  {
616  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
617  result = 1; /* NOT_NULL ">" NULL */
618  else
619  result = -1; /* NOT_NULL "<" NULL */
620  }
621  else
622  {
623  /*
624  * The sk_func needs to be passed the index value as left arg and
625  * the sk_argument as right arg (they might be of different
626  * types). Since it is convenient for callers to think of
627  * _bt_compare as comparing the scankey to the index item, we have
628  * to flip the sign of the comparison result. (Unless it's a DESC
629  * column, in which case we *don't* flip the sign.)
630  */
631  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
632  scankey->sk_collation,
633  datum,
634  scankey->sk_argument));
635 
636  if (!(scankey->sk_flags & SK_BT_DESC))
637  INVERT_COMPARE_RESULT(result);
638  }
639 
640  /* if the keys are unequal, return the difference */
641  if (result != 0)
642  return result;
643 
644  scankey++;
645  }
646 
647  /*
648  * All non-truncated attributes (other than heap TID) were found to be
649  * equal. Treat truncated attributes as minus infinity when scankey has a
650  * key attribute value that would otherwise be compared directly.
651  *
652  * Note: it doesn't matter if ntupatts includes non-key attributes;
653  * scankey won't, so explicitly excluding non-key attributes isn't
654  * necessary.
655  */
656  if (key->keysz > ntupatts)
657  return 1;
658 
659  /*
660  * Use the heap TID attribute and scantid to try to break the tie. The
661  * rules are the same as any other key attribute -- only the
662  * representation differs.
663  */
664  heapTid = BTreeTupleGetHeapTID(itup);
665  if (key->scantid == NULL)
666  {
667  /*
668  * Most searches have a scankey that is considered greater than a
669  * truncated pivot tuple if and when the scankey has equal values for
670  * attributes up to and including the least significant untruncated
671  * attribute in tuple.
672  *
673  * For example, if an index has the minimum two attributes (single
674  * user key attribute, plus heap TID attribute), and a page's high key
675  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
676  * will not descend to the page to the left. The search will descend
677  * right instead. The truncated attribute in pivot tuple means that
678  * all non-pivot tuples on the page to the left are strictly < 'foo',
679  * so it isn't necessary to descend left. In other words, search
680  * doesn't have to descend left because it isn't interested in a match
681  * that has a heap TID value of -inf.
682  *
683  * However, some searches (pivotsearch searches) actually require that
684  * we descend left when this happens. -inf is treated as a possible
685  * match for omitted scankey attribute(s). This is needed by page
686  * deletion, which must re-find leaf pages that are targets for
687  * deletion using their high keys.
688  *
689  * Note: the heap TID part of the test ensures that scankey is being
690  * compared to a pivot tuple with one or more truncated key
691  * attributes.
692  *
693  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
694  * left here, since they have no heap TID attribute (and cannot have
695  * any -inf key values in any case, since truncation can only remove
696  * non-key attributes). !heapkeyspace searches must always be
697  * prepared to deal with matches on both sides of the pivot once the
698  * leaf level is reached.
699  */
700  if (key->heapkeyspace && !key->pivotsearch &&
701  key->keysz == ntupatts && heapTid == NULL)
702  return 1;
703 
704  /* All provided scankey arguments found to be equal */
705  return 0;
706  }
707 
708  /*
709  * Treat truncated heap TID as minus infinity, since scankey has a key
710  * attribute value (scantid) that would otherwise be compared directly
711  */
713  if (heapTid == NULL)
714  return 1;
715 
717  return ItemPointerCompare(key->scantid, heapTid);
718 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:445
ItemPointer scantid
Definition: nbtree.h:475
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:904
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:346
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:438
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:474
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:681
uintptr_t Datum
Definition: postgres.h:367
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2377
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:680
bool heapkeyspace
Definition: nbtree.h:471
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:477
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1044
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 57 of file nbtsearch.c.

References BTScanPosData::buf, BUFFER_LOCK_UNLOCK, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, LockBuffer(), RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_readpage(), and _bt_steppage().

58 {
60 
61  if (IsMVCCSnapshot(scan->xs_snapshot) &&
63  !scan->xs_want_itup)
64  {
65  ReleaseBuffer(sp->buf);
66  sp->buf = InvalidBuffer;
67  }
68 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define InvalidBuffer
Definition: buf.h:25
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3365
Relation indexRelation
Definition: relscan.h:103
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:97
#define RelationNeedsWAL(relation)
Definition: rel.h:521
Buffer buf
Definition: nbtree.h:549

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2143 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_initialize_more_data(), _bt_readpage(), _bt_steppage(), Assert, BTScanPosInvalidate, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, LockBuffer(), IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

2144 {
2145  Relation rel = scan->indexRelation;
2146  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2147  Buffer buf;
2148  Page page;
2149  BTPageOpaque opaque;
2150  OffsetNumber start;
2151  BTScanPosItem *currItem;
2152 
2153  /*
2154  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2155  * version of _bt_search(). We don't maintain a stack since we know we
2156  * won't need it.
2157  */
2159 
2160  if (!BufferIsValid(buf))
2161  {
2162  /*
2163  * Empty index. Lock the whole relation, as nothing finer to lock
2164  * exists.
2165  */
2166  PredicateLockRelation(rel, scan->xs_snapshot);
2168  return false;
2169  }
2170 
2172  page = BufferGetPage(buf);
2173  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2174  Assert(P_ISLEAF(opaque));
2175 
2176  if (ScanDirectionIsForward(dir))
2177  {
2178  /* There could be dead pages to the left, so not this: */
2179  /* Assert(P_LEFTMOST(opaque)); */
2180 
2181  start = P_FIRSTDATAKEY(opaque);
2182  }
2183  else if (ScanDirectionIsBackward(dir))
2184  {
2185  Assert(P_RIGHTMOST(opaque));
2186 
2187  start = PageGetMaxOffsetNumber(page);
2188  }
2189  else
2190  {
2191  elog(ERROR, "invalid scan direction: %d", (int) dir);
2192  start = 0; /* keep compiler quiet */
2193  }
2194 
2195  /* remember which buffer we have pinned */
2196  so->currPos.buf = buf;
2197 
2198  _bt_initialize_more_data(so, dir);
2199 
2200  /*
2201  * Now load data from the first page of the scan.
2202  */
2203  if (!_bt_readpage(scan, dir, start))
2204  {
2205  /*
2206  * There's no actually-matching data on this page. Try to advance to
2207  * the next page. Return false if there's no matching data at all.
2208  */
2210  if (!_bt_steppage(scan, dir))
2211  return false;
2212  }
2213  else
2214  {
2215  /* Drop the lock, and maybe the pin, on the current page */
2217  }
2218 
2219  /* OK, itemIndex says what to return */
2220  currItem = &so->currPos.items[so->currPos.itemIndex];
2221  scan->xs_heaptid = currItem->heapTid;
2222  if (scan->xs_want_itup)
2223  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2224 
2225  return true;
2226 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:127
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:581
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:579
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:653
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:2059
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
ItemPointerData xs_heaptid
Definition: relscan.h:132
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2233
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
#define Assert(condition)
Definition: c.h:732
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:609
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:666
#define elog(elevel,...)
Definition: elog.h:226
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1625
Buffer buf
Definition: nbtree.h:549
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1398
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 741 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_heapkeyspace(), _bt_initialize_more_data(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

742 {
743  Relation rel = scan->indexRelation;
744  BTScanOpaque so = (BTScanOpaque) scan->opaque;
745  Buffer buf;
746  BTStack stack;
747  OffsetNumber offnum;
748  StrategyNumber strat;
749  bool nextkey;
750  bool goback;
751  BTScanInsertData inskey;
752  ScanKey startKeys[INDEX_MAX_KEYS];
753  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
754  int keysCount = 0;
755  int i;
756  bool status = true;
757  StrategyNumber strat_total;
758  BTScanPosItem *currItem;
759  BlockNumber blkno;
760 
762 
764 
765  /*
766  * Examine the scan keys and eliminate any redundant keys; also mark the
767  * keys that must be matched to continue the scan.
768  */
769  _bt_preprocess_keys(scan);
770 
771  /*
772  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
773  * never be satisfied (eg, x == 1 AND x > 2).
774  */
775  if (!so->qual_ok)
776  return false;
777 
778  /*
779  * For parallel scans, get the starting page from shared state. If the
780  * scan has not started, proceed to find out first leaf page in the usual
781  * way while keeping other participating processes waiting. If the scan
782  * has already begun, use the page number from the shared structure.
783  */
784  if (scan->parallel_scan != NULL)
785  {
786  status = _bt_parallel_seize(scan, &blkno);
787  if (!status)
788  return false;
789  else if (blkno == P_NONE)
790  {
791  _bt_parallel_done(scan);
792  return false;
793  }
794  else if (blkno != InvalidBlockNumber)
795  {
796  if (!_bt_parallel_readpage(scan, blkno, dir))
797  return false;
798  goto readcomplete;
799  }
800  }
801 
802  /*----------
803  * Examine the scan keys to discover where we need to start the scan.
804  *
805  * We want to identify the keys that can be used as starting boundaries;
806  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
807  * a backwards scan. We can use keys for multiple attributes so long as
808  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
809  * a > or < boundary or find an attribute with no boundary (which can be
810  * thought of as the same as "> -infinity"), we can't use keys for any
811  * attributes to its right, because it would break our simplistic notion
812  * of what initial positioning strategy to use.
813  *
814  * When the scan keys include cross-type operators, _bt_preprocess_keys
815  * may not be able to eliminate redundant keys; in such cases we will
816  * arbitrarily pick a usable one for each attribute. This is correct
817  * but possibly not optimal behavior. (For example, with keys like
818  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
819  * x=5 would be more efficient.) Since the situation only arises given
820  * a poorly-worded query plus an incomplete opfamily, live with it.
821  *
822  * When both equality and inequality keys appear for a single attribute
823  * (again, only possible when cross-type operators appear), we *must*
824  * select one of the equality keys for the starting point, because
825  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
826  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
827  * start at x=4, we will fail and stop before reaching x=10. If multiple
828  * equality quals survive preprocessing, however, it doesn't matter which
829  * one we use --- by definition, they are either redundant or
830  * contradictory.
831  *
832  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
833  * If the index stores nulls at the end of the index we'll be starting
834  * from, and we have no boundary key for the column (which means the key
835  * we deduced NOT NULL from is an inequality key that constrains the other
836  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
837  * use as a boundary key. If we didn't do this, we might find ourselves
838  * traversing a lot of null entries at the start of the scan.
839  *
840  * In this loop, row-comparison keys are treated the same as keys on their
841  * first (leftmost) columns. We'll add on lower-order columns of the row
842  * comparison below, if possible.
843  *
844  * The selected scan keys (at most one per index column) are remembered by
845  * storing their addresses into the local startKeys[] array.
846  *----------
847  */
848  strat_total = BTEqualStrategyNumber;
849  if (so->numberOfKeys > 0)
850  {
851  AttrNumber curattr;
852  ScanKey chosen;
853  ScanKey impliesNN;
854  ScanKey cur;
855 
856  /*
857  * chosen is the so-far-chosen key for the current attribute, if any.
858  * We don't cast the decision in stone until we reach keys for the
859  * next attribute.
860  */
861  curattr = 1;
862  chosen = NULL;
863  /* Also remember any scankey that implies a NOT NULL constraint */
864  impliesNN = NULL;
865 
866  /*
867  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
868  * pass to handle after-last-key processing. Actual exit from the
869  * loop is at one of the "break" statements below.
870  */
871  for (cur = so->keyData, i = 0;; cur++, i++)
872  {
873  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
874  {
875  /*
876  * Done looking at keys for curattr. If we didn't find a
877  * usable boundary key, see if we can deduce a NOT NULL key.
878  */
879  if (chosen == NULL && impliesNN != NULL &&
880  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
883  {
884  /* Yes, so build the key in notnullkeys[keysCount] */
885  chosen = &notnullkeys[keysCount];
886  ScanKeyEntryInitialize(chosen,
888  (impliesNN->sk_flags &
890  curattr,
891  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
894  InvalidOid,
895  InvalidOid,
896  InvalidOid,
897  (Datum) 0);
898  }
899 
900  /*
901  * If we still didn't find a usable boundary key, quit; else
902  * save the boundary key pointer in startKeys.
903  */
904  if (chosen == NULL)
905  break;
906  startKeys[keysCount++] = chosen;
907 
908  /*
909  * Adjust strat_total, and quit if we have stored a > or <
910  * key.
911  */
912  strat = chosen->sk_strategy;
913  if (strat != BTEqualStrategyNumber)
914  {
915  strat_total = strat;
916  if (strat == BTGreaterStrategyNumber ||
917  strat == BTLessStrategyNumber)
918  break;
919  }
920 
921  /*
922  * Done if that was the last attribute, or if next key is not
923  * in sequence (implying no boundary key is available for the
924  * next attribute).
925  */
926  if (i >= so->numberOfKeys ||
927  cur->sk_attno != curattr + 1)
928  break;
929 
930  /*
931  * Reset for next attr.
932  */
933  curattr = cur->sk_attno;
934  chosen = NULL;
935  impliesNN = NULL;
936  }
937 
938  /*
939  * Can we use this key as a starting boundary for this attr?
940  *
941  * If not, does it imply a NOT NULL constraint? (Because
942  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
943  * *any* inequality key works for that; we need not test.)
944  */
945  switch (cur->sk_strategy)
946  {
949  if (chosen == NULL)
950  {
951  if (ScanDirectionIsBackward(dir))
952  chosen = cur;
953  else
954  impliesNN = cur;
955  }
956  break;
958  /* override any non-equality choice */
959  chosen = cur;
960  break;
963  if (chosen == NULL)
964  {
965  if (ScanDirectionIsForward(dir))
966  chosen = cur;
967  else
968  impliesNN = cur;
969  }
970  break;
971  }
972  }
973  }
974 
975  /*
976  * If we found no usable boundary keys, we have to start from one end of
977  * the tree. Walk down that edge to the first or last key, and scan from
978  * there.
979  */
980  if (keysCount == 0)
981  {
982  bool match;
983 
984  match = _bt_endpoint(scan, dir);
985 
986  if (!match)
987  {
988  /* No match, so mark (parallel) scan finished */
989  _bt_parallel_done(scan);
990  }
991 
992  return match;
993  }
994 
995  /*
996  * We want to start the scan somewhere within the index. Set up an
997  * insertion scankey we can use to search for the boundary point we
998  * identified above. The insertion scankey is built using the keys
999  * identified by startKeys[]. (Remaining insertion scankey fields are
1000  * initialized after initial-positioning strategy is finalized.)
1001  */
1002  Assert(keysCount <= INDEX_MAX_KEYS);
1003  for (i = 0; i < keysCount; i++)
1004  {
1005  ScanKey cur = startKeys[i];
1006 
1007  Assert(cur->sk_attno == i + 1);
1008 
1009  if (cur->sk_flags & SK_ROW_HEADER)
1010  {
1011  /*
1012  * Row comparison header: look to the first row member instead.
1013  *
1014  * The member scankeys are already in insertion format (ie, they
1015  * have sk_func = 3-way-comparison function), but we have to watch
1016  * out for nulls, which _bt_preprocess_keys didn't check. A null
1017  * in the first row member makes the condition unmatchable, just
1018  * like qual_ok = false.
1019  */
1020  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1021 
1022  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1023  if (subkey->sk_flags & SK_ISNULL)
1024  {
1025  _bt_parallel_done(scan);
1026  return false;
1027  }
1028  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1029 
1030  /*
1031  * If the row comparison is the last positioning key we accepted,
1032  * try to add additional keys from the lower-order row members.
1033  * (If we accepted independent conditions on additional index
1034  * columns, we use those instead --- doesn't seem worth trying to
1035  * determine which is more restrictive.) Note that this is OK
1036  * even if the row comparison is of ">" or "<" type, because the
1037  * condition applied to all but the last row member is effectively
1038  * ">=" or "<=", and so the extra keys don't break the positioning
1039  * scheme. But, by the same token, if we aren't able to use all
1040  * the row members, then the part of the row comparison that we
1041  * did use has to be treated as just a ">=" or "<=" condition, and
1042  * so we'd better adjust strat_total accordingly.
1043  */
1044  if (i == keysCount - 1)
1045  {
1046  bool used_all_subkeys = false;
1047 
1048  Assert(!(subkey->sk_flags & SK_ROW_END));
1049  for (;;)
1050  {
1051  subkey++;
1052  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1053  if (subkey->sk_attno != keysCount + 1)
1054  break; /* out-of-sequence, can't use it */
1055  if (subkey->sk_strategy != cur->sk_strategy)
1056  break; /* wrong direction, can't use it */
1057  if (subkey->sk_flags & SK_ISNULL)
1058  break; /* can't use null keys */
1059  Assert(keysCount < INDEX_MAX_KEYS);
1060  memcpy(inskey.scankeys + keysCount, subkey,
1061  sizeof(ScanKeyData));
1062  keysCount++;
1063  if (subkey->sk_flags & SK_ROW_END)
1064  {
1065  used_all_subkeys = true;
1066  break;
1067  }
1068  }
1069  if (!used_all_subkeys)
1070  {
1071  switch (strat_total)
1072  {
1073  case BTLessStrategyNumber:
1074  strat_total = BTLessEqualStrategyNumber;
1075  break;
1077  strat_total = BTGreaterEqualStrategyNumber;
1078  break;
1079  }
1080  }
1081  break; /* done with outer loop */
1082  }
1083  }
1084  else
1085  {
1086  /*
1087  * Ordinary comparison key. Transform the search-style scan key
1088  * to an insertion scan key by replacing the sk_func with the
1089  * appropriate btree comparison function.
1090  *
1091  * If scankey operator is not a cross-type comparison, we can use
1092  * the cached comparison function; otherwise gotta look it up in
1093  * the catalogs. (That can't lead to infinite recursion, since no
1094  * indexscan initiated by syscache lookup will use cross-data-type
1095  * operators.)
1096  *
1097  * We support the convention that sk_subtype == InvalidOid means
1098  * the opclass input type; this is a hack to simplify life for
1099  * ScanKeyInit().
1100  */
1101  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1102  cur->sk_subtype == InvalidOid)
1103  {
1104  FmgrInfo *procinfo;
1105 
1106  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1107  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1108  cur->sk_flags,
1109  cur->sk_attno,
1111  cur->sk_subtype,
1112  cur->sk_collation,
1113  procinfo,
1114  cur->sk_argument);
1115  }
1116  else
1117  {
1118  RegProcedure cmp_proc;
1119 
1120  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1121  rel->rd_opcintype[i],
1122  cur->sk_subtype,
1123  BTORDER_PROC);
1124  if (!RegProcedureIsValid(cmp_proc))
1125  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1126  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1127  cur->sk_attno, RelationGetRelationName(rel));
1128  ScanKeyEntryInitialize(inskey.scankeys + i,
1129  cur->sk_flags,
1130  cur->sk_attno,
1132  cur->sk_subtype,
1133  cur->sk_collation,
1134  cmp_proc,
1135  cur->sk_argument);
1136  }
1137  }
1138  }
1139 
1140  /*----------
1141  * Examine the selected initial-positioning strategy to determine exactly
1142  * where we need to start the scan, and set flag variables to control the
1143  * code below.
1144  *
1145  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1146  * item >= scan key. If nextkey = true, they will locate the first
1147  * item > scan key.
1148  *
1149  * If goback = true, we will then step back one item, while if
1150  * goback = false, we will start the scan on the located item.
1151  *----------
1152  */
1153  switch (strat_total)
1154  {
1155  case BTLessStrategyNumber:
1156 
1157  /*
1158  * Find first item >= scankey, then back up one to arrive at last
1159  * item < scankey. (Note: this positioning strategy is only used
1160  * for a backward scan, so that is always the correct starting
1161  * position.)
1162  */
1163  nextkey = false;
1164  goback = true;
1165  break;
1166 
1168 
1169  /*
1170  * Find first item > scankey, then back up one to arrive at last
1171  * item <= scankey. (Note: this positioning strategy is only used
1172  * for a backward scan, so that is always the correct starting
1173  * position.)
1174  */
1175  nextkey = true;
1176  goback = true;
1177  break;
1178 
1179  case BTEqualStrategyNumber:
1180 
1181  /*
1182  * If a backward scan was specified, need to start with last equal
1183  * item not first one.
1184  */
1185  if (ScanDirectionIsBackward(dir))
1186  {
1187  /*
1188  * This is the same as the <= strategy. We will check at the
1189  * end whether the found item is actually =.
1190  */
1191  nextkey = true;
1192  goback = true;
1193  }
1194  else
1195  {
1196  /*
1197  * This is the same as the >= strategy. We will check at the
1198  * end whether the found item is actually =.
1199  */
1200  nextkey = false;
1201  goback = false;
1202  }
1203  break;
1204 
1206 
1207  /*
1208  * Find first item >= scankey. (This is only used for forward
1209  * scans.)
1210  */
1211  nextkey = false;
1212  goback = false;
1213  break;
1214 
1216 
1217  /*
1218  * Find first item > scankey. (This is only used for forward
1219  * scans.)
1220  */
1221  nextkey = true;
1222  goback = false;
1223  break;
1224 
1225  default:
1226  /* can't get here, but keep compiler quiet */
1227  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1228  return false;
1229  }
1230 
1231  /* Initialize remaining insertion scan key fields */
1232  inskey.heapkeyspace = _bt_heapkeyspace(rel);
1233  inskey.anynullkeys = false; /* unused */
1234  inskey.nextkey = nextkey;
1235  inskey.pivotsearch = false;
1236  inskey.scantid = NULL;
1237  inskey.keysz = keysCount;
1238 
1239  /*
1240  * Use the manufactured insertion scan key to descend the tree and
1241  * position ourselves on the target leaf page.
1242  */
1243  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1244 
1245  /* don't need to keep the stack around... */
1246  _bt_freestack(stack);
1247 
1248  if (!BufferIsValid(buf))
1249  {
1250  /*
1251  * We only get here if the index is completely empty. Lock relation
1252  * because nothing finer to lock exists.
1253  */
1254  PredicateLockRelation(rel, scan->xs_snapshot);
1255 
1256  /*
1257  * mark parallel scan as done, so that all the workers can finish
1258  * their scan
1259  */
1260  _bt_parallel_done(scan);
1262 
1263  return false;
1264  }
1265  else
1267  scan->xs_snapshot);
1268 
1269  _bt_initialize_more_data(so, dir);
1270 
1271  /* position to the precise item on the page */
1272  offnum = _bt_binsrch(rel, &inskey, buf);
1273 
1274  /*
1275  * If nextkey = false, we are positioned at the first item >= scan key, or
1276  * possibly at the end of a page on which all the existing items are less
1277  * than the scan key and we know that everything on later pages is greater
1278  * than or equal to scan key.
1279  *
1280  * If nextkey = true, we are positioned at the first item > scan key, or
1281  * possibly at the end of a page on which all the existing items are less
1282  * than or equal to the scan key and we know that everything on later
1283  * pages is greater than scan key.
1284  *
1285  * The actually desired starting point is either this item or the prior
1286  * one, or in the end-of-page case it's the first item on the next page or
1287  * the last item on this page. Adjust the starting offset if needed. (If
1288  * this results in an offset before the first item or after the last one,
1289  * _bt_readpage will report no items found, and then we'll step to the
1290  * next page as needed.)
1291  */
1292  if (goback)
1293  offnum = OffsetNumberPrev(offnum);
1294 
1295  /* remember which buffer we have pinned, if any */
1297  so->currPos.buf = buf;
1298 
1299  /*
1300  * Now load data from the first page of the scan.
1301  */
1302  if (!_bt_readpage(scan, dir, offnum))
1303  {
1304  /*
1305  * There's no actually-matching data on this page. Try to advance to
1306  * the next page. Return false if there's no matching data at all.
1307  */
1309  if (!_bt_steppage(scan, dir))
1310  return false;
1311  }
1312  else
1313  {
1314  /* Drop the lock, and maybe the pin, on the current page */
1316  }
1317 
1318 readcomplete:
1319  /* OK, itemIndex says what to return */
1320  currItem = &so->currPos.items[so->currPos.itemIndex];
1321  scan->xs_heaptid = currItem->heapTid;
1322  if (scan->xs_want_itup)
1323  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1324 
1325  return true;
1326 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
#define BTORDER_PROC
Definition: nbtree.h:392
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:794
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:127
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:581
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:722
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1910
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:579
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:653
regproc RegProcedure
Definition: c.h:505
#define P_NONE
Definition: nbtree.h:181
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:603
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:339
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:132
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2143
#define RegProcedureIsValid(p)
Definition: c.h:640
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:453
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1373
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
Oid * rd_opfamily
Definition: rel.h:158
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:744
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:681
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2233
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
uintptr_t Datum
Definition: postgres.h:367
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
bool _bt_heapkeyspace(Relation rel)
Definition: nbtpage.c:636
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:680
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:609
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
int numberOfKeys
Definition: nbtree.h:632
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define DatumGetPointer(X)
Definition: postgres.h:549
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:666
ScanKey keyData
Definition: nbtree.h:633
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:226
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1625
Buffer buf
Definition: nbtree.h:549
Oid * rd_opcintype
Definition: rel.h:159
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1398
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 2059 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, buf, BufferGetPage, BufferIsValid, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

2061 {
2062  Buffer buf;
2063  Page page;
2064  BTPageOpaque opaque;
2065  OffsetNumber offnum;
2066  BlockNumber blkno;
2067  IndexTuple itup;
2068 
2069  /*
2070  * If we are looking for a leaf page, okay to descend from fast root;
2071  * otherwise better descend from true root. (There is no point in being
2072  * smarter about intermediate levels.)
2073  */
2074  if (level == 0)
2075  buf = _bt_getroot(rel, BT_READ);
2076  else
2077  buf = _bt_gettrueroot(rel);
2078 
2079  if (!BufferIsValid(buf))
2080  return InvalidBuffer;
2081 
2082  page = BufferGetPage(buf);
2083  TestForOldSnapshot(snapshot, rel, page);
2084  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2085 
2086  for (;;)
2087  {
2088  /*
2089  * If we landed on a deleted page, step right to find a live page
2090  * (there must be one). Also, if we want the rightmost page, step
2091  * right if needed to get to it (this could happen if the page split
2092  * since we obtained a pointer to it).
2093  */
2094  while (P_IGNORE(opaque) ||
2095  (rightmost && !P_RIGHTMOST(opaque)))
2096  {
2097  blkno = opaque->btpo_next;
2098  if (blkno == P_NONE)
2099  elog(ERROR, "fell off the end of index \"%s\"",
2101  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2102  page = BufferGetPage(buf);
2103  TestForOldSnapshot(snapshot, rel, page);
2104  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2105  }
2106 
2107  /* Done? */
2108  if (opaque->btpo.level == level)
2109  break;
2110  if (opaque->btpo.level < level)
2111  ereport(ERROR,
2112  (errcode(ERRCODE_INDEX_CORRUPTED),
2113  errmsg_internal("btree level %u not found in index \"%s\"",
2114  level, RelationGetRelationName(rel))));
2115 
2116  /* Descend to leftmost or rightmost child page */
2117  if (rightmost)
2118  offnum = PageGetMaxOffsetNumber(page);
2119  else
2120  offnum = P_FIRSTDATAKEY(opaque);
2121 
2122  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2123  blkno = BTreeInnerTupleGetDownLink(itup);
2124 
2125  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2126  page = BufferGetPage(buf);
2127  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2128  }
2129 
2130  return buf;
2131 }
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:453
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:255
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:61
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:488
int errmsg_internal(const char *fmt,...)
Definition: elog.c:814
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define elog(elevel,...)
Definition: elog.h:226
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_initialize_more_data()

static void _bt_initialize_more_data ( BTScanOpaque  so,
ScanDirection  dir 
)
inlinestatic

Definition at line 2233 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_parallel_readpage().

2234 {
2235  /* initialize moreLeft/moreRight appropriately for scan direction */
2236  if (ScanDirectionIsForward(dir))
2237  {
2238  so->currPos.moreLeft = false;
2239  so->currPos.moreRight = true;
2240  }
2241  else
2242  {
2243  so->currPos.moreLeft = true;
2244  so->currPos.moreRight = false;
2245  }
2246  so->numKilled = 0; /* just paranoia */
2247  so->markItemIndex = -1; /* ditto */
2248 }
bool moreRight
Definition: nbtree.h:562
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
bool moreLeft
Definition: nbtree.h:561
int markItemIndex
Definition: nbtree.h:663
BTScanPosData currPos
Definition: nbtree.h:666

◆ _bt_moveright()

Buffer _bt_moveright ( Relation  rel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access,
Snapshot  snapshot 
)

Definition at line 237 of file nbtsearch.c.

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo_next, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, elog, ERROR, LockBuffer(), BTScanInsertData::nextkey, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_search().

244 {
245  Page page;
246  BTPageOpaque opaque;
247  int32 cmpval;
248 
249  /*
250  * When nextkey = false (normal case): if the scan key that brought us to
251  * this page is > the high key stored on the page, then the page has split
252  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
253  * have some duplicates to the right as well as the left, but that's
254  * something that's only ever dealt with on the leaf level, after
255  * _bt_search has found an initial leaf page.)
256  *
257  * When nextkey = true: move right if the scan key is >= page's high key.
258  * (Note that key.scantid cannot be set in this case.)
259  *
260  * The page could even have split more than once, so scan as far as
261  * needed.
262  *
263  * We also have to move right if we followed a link that brought us to a
264  * dead page.
265  */
266  cmpval = key->nextkey ? 0 : 1;
267 
268  for (;;)
269  {
270  page = BufferGetPage(buf);
271  TestForOldSnapshot(snapshot, rel, page);
272  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
273 
274  if (P_RIGHTMOST(opaque))
275  break;
276 
277  /*
278  * Finish any incomplete splits we encounter along the way.
279  */
280  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
281  {
283 
284  /* upgrade our lock if necessary */
285  if (access == BT_READ)
286  {
289  }
290 
291  if (P_INCOMPLETE_SPLIT(opaque))
292  _bt_finish_split(rel, buf, stack);
293  else
294  _bt_relbuf(rel, buf);
295 
296  /* re-acquire the lock in the right mode, and re-check */
297  buf = _bt_getbuf(rel, blkno, access);
298  continue;
299  }
300 
301  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
302  {
303  /* step right one page */
304  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
305  continue;
306  }
307  else
308  break;
309  }
310 
311  if (P_IGNORE(opaque))
312  elog(ERROR, "fell off the end of index \"%s\"",
314 
315  return buf;
316 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
#define BT_READ
Definition: nbtree.h:402
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1856
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:453
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:554
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
#define BT_WRITE
Definition: nbtree.h:403
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1343 of file nbtsearch.c.

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

1344 {
1345  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1346  BTScanPosItem *currItem;
1347 
1348  /*
1349  * Advance to next tuple on current page; or if there's no more, try to
1350  * step to the next page with data.
1351  */
1352  if (ScanDirectionIsForward(dir))
1353  {
1354  if (++so->currPos.itemIndex > so->currPos.lastItem)
1355  {
1356  if (!_bt_steppage(scan, dir))
1357  return false;
1358  }
1359  }
1360  else
1361  {
1362  if (--so->currPos.itemIndex < so->currPos.firstItem)
1363  {
1364  if (!_bt_steppage(scan, dir))
1365  return false;
1366  }
1367  }
1368 
1369  /* OK, itemIndex says what to return */
1370  currItem = &so->currPos.items[so->currPos.itemIndex];
1371  scan->xs_heaptid = currItem->heapTid;
1372  if (scan->xs_want_itup)
1373  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1374 
1375  return true;
1376 }
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:127
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:581
int itemIndex
Definition: nbtree.h:579
char * currTuples
Definition: nbtree.h:653
int lastItem
Definition: nbtree.h:578
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
ItemPointerData xs_heaptid
Definition: relscan.h:132
IndexTupleData * IndexTuple
Definition: itup.h:53
int firstItem
Definition: nbtree.h:577
BTScanPosData currPos
Definition: nbtree.h:666
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1625

◆ _bt_parallel_readpage()

static bool _bt_parallel_readpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 1910 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_initialize_more_data(), _bt_readnextpage(), BTScanOpaqueData::currPos, and IndexScanDescData::opaque.

Referenced by _bt_first().

1911 {
1912  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1913 
1914  _bt_initialize_more_data(so, dir);
1915 
1916  if (!_bt_readnextpage(scan, blkno, dir))
1917  return false;
1918 
1919  /* Drop the lock, and maybe the pin, on the current page */
1921 
1922  return true;
1923 }
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2233
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1732
BTScanPosData currPos
Definition: nbtree.h:666

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
ScanDirection  dir 
)
static

Definition at line 1732 of file nbtsearch.c.

References _bt_getbuf(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), BT_READ, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBuffer, LockBuffer(), BTScanPosData::moreLeft, BTScanPosData::moreRight, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber, PageGetSpecialPointer, IndexScanDescData::parallel_scan, PredicateLockPage(), ScanDirectionIsForward, status(), TestForOldSnapshot(), and IndexScanDescData::xs_snapshot.

Referenced by _bt_parallel_readpage(), and _bt_steppage().

1733 {
1734  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1735  Relation rel;
1736  Page page;
1737  BTPageOpaque opaque;
1738  bool status = true;
1739 
1740  rel = scan->indexRelation;
1741 
1742  if (ScanDirectionIsForward(dir))
1743  {
1744  for (;;)
1745  {
1746  /*
1747  * if we're at end of scan, give up and mark parallel scan as
1748  * done, so that all the workers can finish their scan
1749  */
1750  if (blkno == P_NONE || !so->currPos.moreRight)
1751  {
1752  _bt_parallel_done(scan);
1754  return false;
1755  }
1756  /* check for interrupts while we're not holding any buffer lock */
1758  /* step right one page */
1759  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1760  page = BufferGetPage(so->currPos.buf);
1761  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1762  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1763  /* check for deleted page */
1764  if (!P_IGNORE(opaque))
1765  {
1766  PredicateLockPage(rel, blkno, scan->xs_snapshot);
1767  /* see if there are any matches on this page */
1768  /* note that this will clear moreRight if we can stop */
1769  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1770  break;
1771  }
1772  else if (scan->parallel_scan != NULL)
1773  {
1774  /* allow next page be processed by parallel worker */
1775  _bt_parallel_release(scan, opaque->btpo_next);
1776  }
1777 
1778  /* nope, keep going */
1779  if (scan->parallel_scan != NULL)
1780  {
1781  _bt_relbuf(rel, so->currPos.buf);
1782  status = _bt_parallel_seize(scan, &blkno);
1783  if (!status)
1784  {
1786  return false;
1787  }
1788  }
1789  else
1790  {
1791  blkno = opaque->btpo_next;
1792  _bt_relbuf(rel, so->currPos.buf);
1793  }
1794  }
1795  }
1796  else
1797  {
1798  /*
1799  * Should only happen in parallel cases, when some other backend
1800  * advanced the scan.
1801  */
1802  if (so->currPos.currPage != blkno)
1803  {
1805  so->currPos.currPage = blkno;
1806  }
1807 
1808  /*
1809  * Walk left to the next page with data. This is much more complex
1810  * than the walk-right case because of the possibility that the page
1811  * to our left splits while we are in flight to it, plus the
1812  * possibility that the page we were on gets deleted after we leave
1813  * it. See nbtree/README for details.
1814  *
1815  * It might be possible to rearrange this code to have less overhead
1816  * in pinning and locking, but that would require capturing the left
1817  * pointer when the page is initially read, and using it here, along
1818  * with big changes to _bt_walk_left() and the code below. It is not
1819  * clear whether this would be a win, since if the page immediately to
1820  * the left splits after we read this page and before we step left, we
1821  * would need to visit more pages than with the current code.
1822  *
1823  * Note that if we change the code so that we drop the pin for a scan
1824  * which uses a non-MVCC snapshot, we will need to modify the code for
1825  * walking left, to allow for the possibility that a referenced page
1826  * has been deleted. As long as the buffer is pinned or the snapshot
1827  * is MVCC the page cannot move past the half-dead state to fully
1828  * deleted.
1829  */
1830  if (BTScanPosIsPinned(so->currPos))
1831  LockBuffer(so->currPos.buf, BT_READ);
1832  else
1833  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1834 
1835  for (;;)
1836  {
1837  /* Done if we know there are no matching keys to the left */
1838  if (!so->currPos.moreLeft)
1839  {
1840  _bt_relbuf(rel, so->currPos.buf);
1841  _bt_parallel_done(scan);
1843  return false;
1844  }
1845 
1846  /* Step to next physical page */
1847  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1848  scan->xs_snapshot);
1849 
1850  /* if we're physically at end of index, return failure */
1851  if (so->currPos.buf == InvalidBuffer)
1852  {
1853  _bt_parallel_done(scan);
1855  return false;
1856  }
1857 
1858  /*
1859  * Okay, we managed to move left to a non-deleted page. Done if
1860  * it's not half-dead and contains matching tuples. Else loop back
1861  * and do it all again.
1862  */
1863  page = BufferGetPage(so->currPos.buf);
1864  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1865  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1866  if (!P_IGNORE(opaque))
1867  {
1869  /* see if there are any matches on this page */
1870  /* note that this will clear moreLeft if we can stop */
1871  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1872  break;
1873  }
1874  else if (scan->parallel_scan != NULL)
1875  {
1876  /* allow next page be processed by parallel worker */
1878  }
1879 
1880  /*
1881  * For parallel scans, get the last page scanned as it is quite
1882  * possible that by the time we try to seize the scan, some other
1883  * worker has already advanced the scan to a different page. We
1884  * must continue based on the latest page scanned by any worker.
1885  */
1886  if (scan->parallel_scan != NULL)
1887  {
1888  _bt_relbuf(rel, so->currPos.buf);
1889  status = _bt_parallel_seize(scan, &blkno);
1890  if (!status)
1891  {
1893  return false;
1894  }
1895  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1896  }
1897  }
1898  }
1899 
1900  return true;
1901 }
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool moreRight
Definition: nbtree.h:562
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:722
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
BlockNumber currPage
Definition: nbtree.h:552
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
bool moreLeft
Definition: nbtree.h:561
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Relation indexRelation
Definition: relscan.h:103
#define BT_READ
Definition: nbtree.h:402
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:586
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:609
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:699
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:666
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
Buffer buf
Definition: nbtree.h:549
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1398
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:597
Pointer Page
Definition: bufpage.h:78
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
Definition: nbtsearch.c:1940

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum 
)
static

Definition at line 1398 of file nbtsearch.c.

References _bt_checkkeys(), _bt_parallel_release(), _bt_saveitem(), Assert, BTreeTupleGetNAtts, BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage, BufferIsValid, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxIndexTuplesPerPage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_readnextpage().

1399 {
1400  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1401  Page page;
1402  BTPageOpaque opaque;
1403  OffsetNumber minoff;
1404  OffsetNumber maxoff;
1405  int itemIndex;
1406  bool continuescan;
1407  int indnatts;
1408 
1409  /*
1410  * We must have the buffer pinned and locked, but the usual macro can't be
1411  * used here; this function is what makes it good for currPos.
1412  */
1414 
1415  page = BufferGetPage(so->currPos.buf);
1416  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1417 
1418  /* allow next page be processed by parallel worker */
1419  if (scan->parallel_scan)
1420  {
1421  if (ScanDirectionIsForward(dir))
1422  _bt_parallel_release(scan, opaque->btpo_next);
1423  else
1425  }
1426 
1427  continuescan = true; /* default assumption */
1429  minoff = P_FIRSTDATAKEY(opaque);
1430  maxoff = PageGetMaxOffsetNumber(page);
1431 
1432  /*
1433  * We note the buffer's block number so that we can release the pin later.
1434  * This allows us to re-read the buffer if it is needed again for hinting.
1435  */
1437 
1438  /*
1439  * We save the LSN of the page as we read it, so that we know whether it
1440  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1441  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1442  */
1444 
1445  /*
1446  * we must save the page's right-link while scanning it; this tells us
1447  * where to step right to after we're done with these items. There is no
1448  * corresponding need for the left-link, since splits always go right.
1449  */
1450  so->currPos.nextPage = opaque->btpo_next;
1451 
1452  /* initialize tuple workspace to empty */
1453  so->currPos.nextTupleOffset = 0;
1454 
1455  /*
1456  * Now that the current page has been made consistent, the macro should be
1457  * good.
1458  */
1460 
1461  if (ScanDirectionIsForward(dir))
1462  {
1463  /* load items[] in ascending order */
1464  itemIndex = 0;
1465 
1466  offnum = Max(offnum, minoff);
1467 
1468  while (offnum <= maxoff)
1469  {
1470  ItemId iid = PageGetItemId(page, offnum);
1471  IndexTuple itup;
1472 
1473  /*
1474  * If the scan specifies not to return killed tuples, then we
1475  * treat a killed tuple as not passing the qual
1476  */
1477  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1478  {
1479  offnum = OffsetNumberNext(offnum);
1480  continue;
1481  }
1482 
1483  itup = (IndexTuple) PageGetItem(page, iid);
1484 
1485  if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1486  {
1487  /* tuple passes all scan key conditions, so remember it */
1488  _bt_saveitem(so, itemIndex, offnum, itup);
1489  itemIndex++;
1490  }
1491  /* When !continuescan, there can't be any more matches, so stop */
1492  if (!continuescan)
1493  break;
1494 
1495  offnum = OffsetNumberNext(offnum);
1496  }
1497 
1498  /*
1499  * We don't need to visit page to the right when the high key
1500  * indicates that no more matches will be found there.
1501  *
1502  * Checking the high key like this works out more often than you might
1503  * think. Leaf page splits pick a split point between the two most
1504  * dissimilar tuples (this is weighed against the need to evenly share
1505  * free space). Leaf pages with high key attribute values that can
1506  * only appear on non-pivot tuples on the right sibling page are
1507  * common.
1508  */
1509  if (continuescan && !P_RIGHTMOST(opaque))
1510  {
1511  ItemId iid = PageGetItemId(page, P_HIKEY);
1512  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1513  int truncatt;
1514 
1515  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1516  _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1517  }
1518 
1519  if (!continuescan)
1520  so->currPos.moreRight = false;
1521 
1522  Assert(itemIndex <= MaxIndexTuplesPerPage);
1523  so->currPos.firstItem = 0;
1524  so->currPos.lastItem = itemIndex - 1;
1525  so->currPos.itemIndex = 0;
1526  }
1527  else
1528  {
1529  /* load items[] in descending order */
1530  itemIndex = MaxIndexTuplesPerPage;
1531 
1532  offnum = Min(offnum, maxoff);
1533 
1534  while (offnum >= minoff)
1535  {
1536  ItemId iid = PageGetItemId(page, offnum);
1537  IndexTuple itup;
1538  bool tuple_alive;
1539  bool passes_quals;
1540 
1541  /*
1542  * If the scan specifies not to return killed tuples, then we
1543  * treat a killed tuple as not passing the qual. Most of the
1544  * time, it's a win to not bother examining the tuple's index
1545  * keys, but just skip to the next tuple (previous, actually,
1546  * since we're scanning backwards). However, if this is the first
1547  * tuple on the page, we do check the index keys, to prevent
1548  * uselessly advancing to the page to the left. This is similar
1549  * to the high key optimization used by forward scans.
1550  */
1551  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1552  {
1553  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1554  if (offnum > P_FIRSTDATAKEY(opaque))
1555  {
1556  offnum = OffsetNumberPrev(offnum);
1557  continue;
1558  }
1559 
1560  tuple_alive = false;
1561  }
1562  else
1563  tuple_alive = true;
1564 
1565  itup = (IndexTuple) PageGetItem(page, iid);
1566 
1567  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1568  &continuescan);
1569  if (passes_quals && tuple_alive)
1570  {
1571  /* tuple passes all scan key conditions, so remember it */
1572  itemIndex--;
1573  _bt_saveitem(so, itemIndex, offnum, itup);
1574  }
1575  if (!continuescan)
1576  {
1577  /* there can't be any more matches, so stop */
1578  so->currPos.moreLeft = false;
1579  break;
1580  }
1581 
1582  offnum = OffsetNumberPrev(offnum);
1583  }
1584 
1585  Assert(itemIndex >= 0);
1586  so->currPos.firstItem = itemIndex;
1589  }
1590 
1591  return (so->currPos.firstItem <= so->currPos.lastItem);
1592 }
bool moreRight
Definition: nbtree.h:562
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
int itemIndex
Definition: nbtree.h:579
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:904
BlockNumber currPage
Definition: nbtree.h:552
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
bool moreLeft
Definition: nbtree.h:561
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
bool ignore_killed_tuples
Definition: relscan.h:114
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
int nextTupleOffset
Definition: nbtree.h:568
int lastItem
Definition: nbtree.h:578
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:2876
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:586
IndexTupleData * IndexTuple
Definition: itup.h:53
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1596
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:431
int firstItem
Definition: nbtree.h:577
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1357
#define Max(x, y)
Definition: c.h:898
#define Assert(condition)
Definition: c.h:732
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:699
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber nextPage
Definition: nbtree.h:553
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:666
#define MaxIndexTuplesPerPage
Definition: itup.h:145
XLogRecPtr lsn
Definition: nbtree.h:551
Buffer buf
Definition: nbtree.h:549
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1596 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

1598 {
1599  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1600 
1601  currItem->heapTid = itup->t_tid;
1602  currItem->indexOffset = offnum;
1603  if (so->currTuples)
1604  {
1605  Size itupsz = IndexTupleSize(itup);
1606 
1607  currItem->tupleOffset = so->currPos.nextTupleOffset;
1608  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1609  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1610  }
1611 }
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:581
char * currTuples
Definition: nbtree.h:653
ItemPointerData t_tid
Definition: itup.h:37
OffsetNumber indexOffset
Definition: nbtree.h:543
int nextTupleOffset
Definition: nbtree.h:568
size_t Size
Definition: c.h:466
#define MAXALIGN(LEN)
Definition: c.h:685
ItemPointerData heapTid
Definition: nbtree.h:542
BTScanPosData currPos
Definition: nbtree.h:666
LocationIndex tupleOffset
Definition: nbtree.h:544
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
BTScanInsert  key,
Buffer bufP,
int  access,
Snapshot  snapshot 
)

Definition at line 92 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_moveright(), _bt_relandgetbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTPageOpaqueData::level, LockBuffer(), P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, and palloc().

Referenced by _bt_doinsert(), _bt_first(), _bt_pagedel(), and bt_rootdescend().

94 {
95  BTStack stack_in = NULL;
96  int page_access = BT_READ;
97 
98  /* Get the root page to start with */
99  *bufP = _bt_getroot(rel, access);
100 
101  /* If index is empty and access = BT_READ, no root page is created. */
102  if (!BufferIsValid(*bufP))
103  return (BTStack) NULL;
104 
105  /* Loop iterates once per level descended in the tree */
106  for (;;)
107  {
108  Page page;
109  BTPageOpaque opaque;
110  OffsetNumber offnum;
111  ItemId itemid;
112  IndexTuple itup;
113  BlockNumber blkno;
114  BlockNumber par_blkno;
115  BTStack new_stack;
116 
117  /*
118  * Race -- the page we just grabbed may have split since we read its
119  * pointer in the parent (or metapage). If it has, we may need to
120  * move right to its new sibling. Do that.
121  *
122  * In write-mode, allow _bt_moveright to finish any incomplete splits
123  * along the way. Strictly speaking, we'd only need to finish an
124  * incomplete split on the leaf page we're about to insert to, not on
125  * any of the upper levels (they are taken care of in _bt_getstackbuf,
126  * if the leaf page is split and we insert to the parent page). But
127  * this is a good opportunity to finish splits of internal pages too.
128  */
129  *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130  page_access, snapshot);
131 
132  /* if this is a leaf page, we're done */
133  page = BufferGetPage(*bufP);
134  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135  if (P_ISLEAF(opaque))
136  break;
137 
138  /*
139  * Find the appropriate item on the internal page, and get the child
140  * page that it points to.
141  */
142  offnum = _bt_binsrch(rel, key, *bufP);
143  itemid = PageGetItemId(page, offnum);
144  itup = (IndexTuple) PageGetItem(page, itemid);
145  blkno = BTreeInnerTupleGetDownLink(itup);
146  par_blkno = BufferGetBlockNumber(*bufP);
147 
148  /*
149  * We need to save the location of the pivot tuple we chose in the
150  * parent page on a stack. If we need to split a page, we'll use
151  * the stack to work back up to its parent page. If caller ends up
152  * splitting a page one level down, it usually ends up inserting a
153  * new pivot tuple/downlink immediately after the location recorded
154  * here.
155  */
156  new_stack = (BTStack) palloc(sizeof(BTStackData));
157  new_stack->bts_blkno = par_blkno;
158  new_stack->bts_offset = offnum;
159  new_stack->bts_parent = stack_in;
160 
161  /*
162  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
163  * we're on the level 1 and asked to lock leaf page in write mode,
164  * then lock next page in write mode, because it must be a leaf.
165  */
166  if (opaque->btpo.level == 1 && access == BT_WRITE)
167  page_access = BT_WRITE;
168 
169  /* drop the read lock on the parent page, acquire one on the child */
170  *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
171 
172  /* okay, all set to move down a level */
173  stack_in = new_stack;
174  }
175 
176  /*
177  * If we're asked to lock leaf in write mode, but didn't manage to, then
178  * relock. This should only happen when the root page is a leaf page (and
179  * the only page in the index other than the metapage).
180  */
181  if (access == BT_WRITE && page_access == BT_READ)
182  {
183  /* trade in our read lock for a write lock */
185  LockBuffer(*bufP, BT_WRITE);
186 
187  /*
188  * If the page was split between the time that we surrendered our read
189  * lock and acquired our write lock, then this page may no longer be
190  * the right place for the key we want to insert. In this case, we
191  * need to move right in the tree. See Lehman and Yao for an
192  * excruciatingly precise description.
193  */
194  *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
195  snapshot);
196  }
197 
198  return stack_in;
199 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
union BTPageOpaqueData::@46 btpo
BTStackData * BTStack
Definition: nbtree.h:422
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:339
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:402
OffsetNumber bts_offset
Definition: nbtree.h:418
IndexTupleData * IndexTuple
Definition: itup.h:53
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:255
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:61
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
BlockNumber bts_blkno
Definition: nbtree.h:417
Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:237
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
struct BTStackData * bts_parent
Definition: nbtree.h:419
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
void * palloc(Size size)
Definition: mcxt.c:949
#define BT_WRITE
Definition: nbtree.h:403
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1625 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_killitems(), _bt_parallel_seize(), _bt_readnextpage(), Assert, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), InvalidBlockNumber, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, ScanDirectionIsForward, and status().

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

1626 {
1627  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1629  bool status = true;
1630 
1632 
1633  /* Before leaving current page, deal with any killed items */
1634  if (so->numKilled > 0)
1635  _bt_killitems(scan);
1636 
1637  /*
1638  * Before we modify currPos, make a copy of the page data if there was a
1639  * mark position that needs it.
1640  */
1641  if (so->markItemIndex >= 0)
1642  {
1643  /* bump pin on current buffer for assignment to mark buffer */
1644  if (BTScanPosIsPinned(so->currPos))
1646  memcpy(&so->markPos, &so->currPos,
1647  offsetof(BTScanPosData, items[1]) +
1648  so->currPos.lastItem * sizeof(BTScanPosItem));
1649  if (so->markTuples)
1650  memcpy(so->markTuples, so->currTuples,
1651  so->currPos.nextTupleOffset);
1652  so->markPos.itemIndex = so->markItemIndex;
1653  so->markItemIndex = -1;
1654  }
1655 
1656  if (ScanDirectionIsForward(dir))
1657  {
1658  /* Walk right to the next page with data */
1659  if (scan->parallel_scan != NULL)
1660  {
1661  /*
1662  * Seize the scan to get the next block number; if the scan has
1663  * ended already, bail out.
1664  */
1665  status = _bt_parallel_seize(scan, &blkno);
1666  if (!status)
1667  {
1668  /* release the previous buffer, if pinned */
1671  return false;
1672  }
1673  }
1674  else
1675  {
1676  /* Not parallel, so use the previously-saved nextPage link. */
1677  blkno = so->currPos.nextPage;
1678  }
1679 
1680  /* Remember we left a page with data */
1681  so->currPos.moreLeft = true;
1682 
1683  /* release the previous buffer, if pinned */
1685  }
1686  else
1687  {
1688  /* Remember we left a page with data */
1689  so->currPos.moreRight = true;
1690 
1691  if (scan->parallel_scan != NULL)
1692  {
1693  /*
1694  * Seize the scan to get the current block number; if the scan has
1695  * ended already, bail out.
1696  */
1697  status = _bt_parallel_seize(scan, &blkno);
1699  if (!status)
1700  {
1702  return false;
1703  }
1704  }
1705  else
1706  {
1707  /* Not parallel, so just use our own notion of the current page */
1708  blkno = so->currPos.currPage;
1709  }
1710  }
1711 
1712  if (!_bt_readnextpage(scan, blkno, dir))
1713  return false;
1714 
1715  /* Drop the lock, and maybe the pin, on the current page */
1717 
1718  return true;
1719 }
bool moreRight
Definition: nbtree.h:562
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:579
char * currTuples
Definition: nbtree.h:653
BlockNumber currPage
Definition: nbtree.h:552
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:603
uint32 BlockNumber
Definition: block.h:31
bool moreLeft
Definition: nbtree.h:561
int nextTupleOffset
Definition: nbtree.h:568
int lastItem
Definition: nbtree.h:578
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1718
BTScanPosData markPos
Definition: nbtree.h:667
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:586
char * markTuples
Definition: nbtree.h:654
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
int markItemIndex
Definition: nbtree.h:663
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1732
#define Assert(condition)
Definition: c.h:732
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:609
#define InvalidBlockNumber
Definition: block.h:33
BlockNumber nextPage
Definition: nbtree.h:553
BTScanPosData currPos
Definition: nbtree.h:666
Buffer buf
Definition: nbtree.h:549
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3403
#define offsetof(type, field)
Definition: c.h:655
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:597

◆ _bt_walk_left()

static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf,
Snapshot  snapshot 
)
static

Definition at line 1940 of file nbtsearch.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_readnextpage().

1941 {
1942  Page page;
1943  BTPageOpaque opaque;
1944 
1945  page = BufferGetPage(buf);
1946  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1947 
1948  for (;;)
1949  {
1950  BlockNumber obknum;
1951  BlockNumber lblkno;
1952  BlockNumber blkno;
1953  int tries;
1954 
1955  /* if we're at end of tree, release buf and return failure */
1956  if (P_LEFTMOST(opaque))
1957  {
1958  _bt_relbuf(rel, buf);
1959  break;
1960  }
1961  /* remember original page we are stepping left from */
1962  obknum = BufferGetBlockNumber(buf);
1963  /* step left */
1964  blkno = lblkno = opaque->btpo_prev;
1965  _bt_relbuf(rel, buf);
1966  /* check for interrupts while we're not holding any buffer lock */
1968  buf = _bt_getbuf(rel, blkno, BT_READ);
1969  page = BufferGetPage(buf);
1970  TestForOldSnapshot(snapshot, rel, page);
1971  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1972 
1973  /*
1974  * If this isn't the page we want, walk right till we find what we
1975  * want --- but go no more than four hops (an arbitrary limit). If we
1976  * don't find the correct page by then, the most likely bet is that
1977  * the original page got deleted and isn't in the sibling chain at all
1978  * anymore, not that its left sibling got split more than four times.
1979  *
1980  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1981  * because half-dead pages are still in the sibling chain. Caller
1982  * must reject half-dead pages if wanted.
1983  */
1984  tries = 0;
1985  for (;;)
1986  {
1987  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1988  {
1989  /* Found desired page, return it */
1990  return buf;
1991  }
1992  if (P_RIGHTMOST(opaque) || ++tries > 4)
1993  break;
1994  blkno = opaque->btpo_next;
1995  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1996  page = BufferGetPage(buf);
1997  TestForOldSnapshot(snapshot, rel, page);
1998  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1999  }
2000 
2001  /* Return to the original page to see what's up */
2002  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2003  page = BufferGetPage(buf);
2004  TestForOldSnapshot(snapshot, rel, page);
2005  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2006  if (P_ISDELETED(opaque))
2007  {
2008  /*
2009  * It was deleted. Move right to first nondeleted page (there
2010  * must be one); that is the page that has acquired the deleted
2011  * one's keyspace, so stepping left from it will take us where we
2012  * want to be.
2013  */
2014  for (;;)
2015  {
2016  if (P_RIGHTMOST(opaque))
2017  elog(ERROR, "fell off the end of index \"%s\"",
2019  blkno = opaque->btpo_next;
2020  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2021  page = BufferGetPage(buf);
2022  TestForOldSnapshot(snapshot, rel, page);
2023  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2024  if (!P_ISDELETED(opaque))
2025  break;
2026  }
2027 
2028  /*
2029  * Now return to top of loop, resetting obknum to point to this
2030  * nondeleted page, and try again.
2031  */
2032  }
2033  else
2034  {
2035  /*
2036  * It wasn't deleted; the explanation had better be that the page
2037  * to the left got split or deleted. Without this check, we'd go
2038  * into an infinite loop if there's anything wrong.
2039  */
2040  if (opaque->btpo_prev == lblkno)
2041  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2042  obknum, RelationGetRelationName(rel));
2043  /* Okay to try again with new lblkno value */
2044  }
2045  }
2046 
2047  return InvalidBuffer;
2048 }
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
BlockNumber btpo_prev
Definition: nbtree.h:57
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:453
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define P_ISDELETED(opaque)
Definition: nbtree.h:191
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78