PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/tqual.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static Buffer _bt_walk_left (Relation rel, Buffer buf, Snapshot snapshot)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
BTStack _bt_search (Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
 
int32 _bt_compare (Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 

Function Documentation

OffsetNumber _bt_binsrch ( Relation  rel,
Buffer  buf,
int  keysz,
ScanKey  scankey,
bool  nextkey 
)

Definition at line 319 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_doinsert(), _bt_findinsertloc(), _bt_first(), and _bt_search().

324 {
325  Page page;
326  BTPageOpaque opaque;
327  OffsetNumber low,
328  high;
329  int32 result,
330  cmpval;
331 
332  page = BufferGetPage(buf);
333  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
334 
335  low = P_FIRSTDATAKEY(opaque);
336  high = PageGetMaxOffsetNumber(page);
337 
338  /*
339  * If there are no keys on the page, return the first available slot. Note
340  * this covers two cases: the page is really empty (no keys), or it
341  * contains only a high key. The latter case is possible after vacuuming.
342  * This can never happen on an internal page, however, since they are
343  * never empty (an internal page must have children).
344  */
345  if (high < low)
346  return low;
347 
348  /*
349  * Binary search to find the first key on the page >= scan key, or first
350  * key > scankey when nextkey is true.
351  *
352  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
353  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
354  *
355  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
356  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
357  *
358  * We can fall out when high == low.
359  */
360  high++; /* establish the loop invariant for high */
361 
362  cmpval = nextkey ? 0 : 1; /* select comparison value */
363 
364  while (high > low)
365  {
366  OffsetNumber mid = low + ((high - low) / 2);
367 
368  /* We have low <= mid < high, so mid points at a real slot */
369 
370  result = _bt_compare(rel, keysz, scankey, page, mid);
371 
372  if (result >= cmpval)
373  low = mid + 1;
374  else
375  high = mid;
376  }
377 
378  /*
379  * At this point we have high == low, but be careful: they could point
380  * past the last slot on the page.
381  *
382  * On a leaf page, we always return the first key >= scan key (resp. >
383  * scan key), which could be the last slot + 1.
384  */
385  if (P_ISLEAF(opaque))
386  return low;
387 
388  /*
389  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
390  * There must be one if _bt_compare() is playing by the rules.
391  */
392  Assert(low > P_FIRSTDATAKEY(opaque));
393 
394  return OffsetNumberPrev(low);
395 }
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:205
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
signed int int32
Definition: c.h:253
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:65
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define Assert(condition)
Definition: c.h:667
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:424
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:177
int32 _bt_compare ( Relation  rel,
int  keysz,
ScanKey  scankey,
Page  page,
OffsetNumber  offnum 
)

Definition at line 424 of file nbtsearch.c.

References DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_findinsertloc(), and _bt_moveright().

429 {
430  TupleDesc itupdesc = RelationGetDescr(rel);
432  IndexTuple itup;
433  int i;
434 
435  /*
436  * Force result ">" if target item is first data item on an internal page
437  * --- see NOTE above.
438  */
439  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
440  return 1;
441 
442  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
443 
444  /*
445  * The scan key is set up with the attribute number associated with each
446  * term in the key. It is important that, if the index is multi-key, the
447  * scan contain the first k key attributes, and that they be in order. If
448  * you think about how multi-key ordering works, you'll understand why
449  * this is.
450  *
451  * We don't test for violation of this condition here, however. The
452  * initial setup for the index scan had better have gotten it right (see
453  * _bt_first).
454  */
455 
456  for (i = 1; i <= keysz; i++)
457  {
458  Datum datum;
459  bool isNull;
460  int32 result;
461 
462  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
463 
464  /* see comments about NULLs handling in btbuild */
465  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
466  {
467  if (isNull)
468  result = 0; /* NULL "=" NULL */
469  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
470  result = -1; /* NULL "<" NOT_NULL */
471  else
472  result = 1; /* NULL ">" NOT_NULL */
473  }
474  else if (isNull) /* key is NOT_NULL and item is NULL */
475  {
476  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
477  result = 1; /* NOT_NULL ">" NULL */
478  else
479  result = -1; /* NOT_NULL "<" NULL */
480  }
481  else
482  {
483  /*
484  * The sk_func needs to be passed the index value as left arg and
485  * the sk_argument as right arg (they might be of different
486  * types). Since it is convenient for callers to think of
487  * _bt_compare as comparing the scankey to the index item, we have
488  * to flip the sign of the comparison result. (Unless it's a DESC
489  * column, in which case we *don't* flip the sign.)
490  */
491  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
492  scankey->sk_collation,
493  datum,
494  scankey->sk_argument));
495 
496  if (!(scankey->sk_flags & SK_BT_DESC))
497  result = -result;
498  }
499 
500  /* if the keys are unequal, return the difference */
501  if (result != 0)
502  return result;
503 
504  scankey++;
505  }
506 
507  /* if we get here, the keys are equal */
508  return 0;
509 }
#define DatumGetInt32(X)
Definition: postgres.h:480
#define RelationGetDescr(relation)
Definition: rel.h:420
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:205
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1306
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
signed int int32
Definition: c.h:253
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define SK_ISNULL
Definition: skey.h:115
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:652
uintptr_t Datum
Definition: postgres.h:374
int sk_flags
Definition: skey.h:66
#define SK_BT_DESC
Definition: nbtree.h:651
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:177
static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 53 of file nbtsearch.c.

References BTScanPosData::buf, BUFFER_LOCK_UNLOCK, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, LockBuffer(), RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_endpoint(), _bt_first(), and _bt_steppage().

54 {
56 
57  if (IsMVCCSnapshot(scan->xs_snapshot) &&
59  !scan->xs_want_itup)
60  {
61  ReleaseBuffer(sp->buf);
62  sp->buf = InvalidBuffer;
63  }
64 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
Snapshot xs_snapshot
Definition: relscan.h:90
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3292
Relation indexRelation
Definition: relscan.h:89
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
bool xs_want_itup
Definition: relscan.h:95
#define IsMVCCSnapshot(snapshot)
Definition: tqual.h:31
#define RelationNeedsWAL(relation)
Definition: rel.h:497
Buffer buf
Definition: nbtree.h:522
static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1660 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_get_endpoint(), _bt_readpage(), _bt_steppage(), Assert, BTScanPosInvalidate, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

1661 {
1662  Relation rel = scan->indexRelation;
1663  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1664  Buffer buf;
1665  Page page;
1666  BTPageOpaque opaque;
1667  OffsetNumber start;
1668  BTScanPosItem *currItem;
1669 
1670  /*
1671  * Scan down to the leftmost or rightmost leaf page. This is a simplified
1672  * version of _bt_search(). We don't maintain a stack since we know we
1673  * won't need it.
1674  */
1676 
1677  if (!BufferIsValid(buf))
1678  {
1679  /*
1680  * Empty index. Lock the whole relation, as nothing finer to lock
1681  * exists.
1682  */
1683  PredicateLockRelation(rel, scan->xs_snapshot);
1685  return false;
1686  }
1687 
1689  page = BufferGetPage(buf);
1690  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1691  Assert(P_ISLEAF(opaque));
1692 
1693  if (ScanDirectionIsForward(dir))
1694  {
1695  /* There could be dead pages to the left, so not this: */
1696  /* Assert(P_LEFTMOST(opaque)); */
1697 
1698  start = P_FIRSTDATAKEY(opaque);
1699  }
1700  else if (ScanDirectionIsBackward(dir))
1701  {
1702  Assert(P_RIGHTMOST(opaque));
1703 
1704  start = PageGetMaxOffsetNumber(page);
1705  }
1706  else
1707  {
1708  elog(ERROR, "invalid scan direction: %d", (int) dir);
1709  start = 0; /* keep compiler quiet */
1710  }
1711 
1712  /* remember which buffer we have pinned */
1713  so->currPos.buf = buf;
1714 
1715  /* initialize moreLeft/moreRight appropriately for scan direction */
1716  if (ScanDirectionIsForward(dir))
1717  {
1718  so->currPos.moreLeft = false;
1719  so->currPos.moreRight = true;
1720  }
1721  else
1722  {
1723  so->currPos.moreLeft = true;
1724  so->currPos.moreRight = false;
1725  }
1726  so->numKilled = 0; /* just paranoia */
1727  so->markItemIndex = -1; /* ditto */
1728 
1729  /*
1730  * Now load data from the first page of the scan.
1731  */
1732  if (!_bt_readpage(scan, dir, start))
1733  {
1734  /*
1735  * There's no actually-matching data on this page. Try to advance to
1736  * the next page. Return false if there's no matching data at all.
1737  */
1739  if (!_bt_steppage(scan, dir))
1740  return false;
1741  }
1742  else
1743  {
1744  /* Drop the lock, and maybe the pin, on the current page */
1746  }
1747 
1748  /* OK, itemIndex says what to return */
1749  currItem = &so->currPos.items[so->currPos.itemIndex];
1750  scan->xs_ctup.t_self = currItem->heapTid;
1751  if (scan->xs_want_itup)
1752  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1753 
1754  return true;
1755 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2438
bool moreRight
Definition: nbtree.h:535
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:107
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:554
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:53
int itemIndex
Definition: nbtree.h:552
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2415
char * currTuples
Definition: nbtree.h:624
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:205
Snapshot xs_snapshot
Definition: relscan.h:90
bool moreLeft
Definition: nbtree.h:534
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:1578
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
Relation indexRelation
Definition: relscan.h:89
uint16 OffsetNumber
Definition: off.h:24
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:641
ItemPointerData t_self
Definition: htup.h:65
static char * buf
Definition: pg_test_fsync.c:65
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
int markItemIndex
Definition: nbtree.h:634
bool xs_want_itup
Definition: relscan.h:95
#define Assert(condition)
Definition: c.h:667
HeapTupleData xs_ctup
Definition: relscan.h:111
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:582
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
BTScanPosData currPos
Definition: nbtree.h:637
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1290
#define elog
Definition: elog.h:219
Buffer buf
Definition: nbtree.h:522
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:176
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1138
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:177
bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 532 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, NULL, BTScanOpaqueData::numberOfKeys, BTScanOpaqueData::numKilled, OffsetNumberPrev, IndexScanDescData::opaque, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

533 {
534  Relation rel = scan->indexRelation;
535  BTScanOpaque so = (BTScanOpaque) scan->opaque;
536  Buffer buf;
537  BTStack stack;
538  OffsetNumber offnum;
539  StrategyNumber strat;
540  bool nextkey;
541  bool goback;
542  ScanKey startKeys[INDEX_MAX_KEYS];
543  ScanKeyData scankeys[INDEX_MAX_KEYS];
544  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
545  int keysCount = 0;
546  int i;
547  StrategyNumber strat_total;
548  BTScanPosItem *currItem;
549 
551 
553 
554  /*
555  * Examine the scan keys and eliminate any redundant keys; also mark the
556  * keys that must be matched to continue the scan.
557  */
558  _bt_preprocess_keys(scan);
559 
560  /*
561  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
562  * never be satisfied (eg, x == 1 AND x > 2).
563  */
564  if (!so->qual_ok)
565  return false;
566 
567  /*----------
568  * Examine the scan keys to discover where we need to start the scan.
569  *
570  * We want to identify the keys that can be used as starting boundaries;
571  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
572  * a backwards scan. We can use keys for multiple attributes so long as
573  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
574  * a > or < boundary or find an attribute with no boundary (which can be
575  * thought of as the same as "> -infinity"), we can't use keys for any
576  * attributes to its right, because it would break our simplistic notion
577  * of what initial positioning strategy to use.
578  *
579  * When the scan keys include cross-type operators, _bt_preprocess_keys
580  * may not be able to eliminate redundant keys; in such cases we will
581  * arbitrarily pick a usable one for each attribute. This is correct
582  * but possibly not optimal behavior. (For example, with keys like
583  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
584  * x=5 would be more efficient.) Since the situation only arises given
585  * a poorly-worded query plus an incomplete opfamily, live with it.
586  *
587  * When both equality and inequality keys appear for a single attribute
588  * (again, only possible when cross-type operators appear), we *must*
589  * select one of the equality keys for the starting point, because
590  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
591  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
592  * start at x=4, we will fail and stop before reaching x=10. If multiple
593  * equality quals survive preprocessing, however, it doesn't matter which
594  * one we use --- by definition, they are either redundant or
595  * contradictory.
596  *
597  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
598  * If the index stores nulls at the end of the index we'll be starting
599  * from, and we have no boundary key for the column (which means the key
600  * we deduced NOT NULL from is an inequality key that constrains the other
601  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
602  * use as a boundary key. If we didn't do this, we might find ourselves
603  * traversing a lot of null entries at the start of the scan.
604  *
605  * In this loop, row-comparison keys are treated the same as keys on their
606  * first (leftmost) columns. We'll add on lower-order columns of the row
607  * comparison below, if possible.
608  *
609  * The selected scan keys (at most one per index column) are remembered by
610  * storing their addresses into the local startKeys[] array.
611  *----------
612  */
613  strat_total = BTEqualStrategyNumber;
614  if (so->numberOfKeys > 0)
615  {
616  AttrNumber curattr;
617  ScanKey chosen;
618  ScanKey impliesNN;
619  ScanKey cur;
620 
621  /*
622  * chosen is the so-far-chosen key for the current attribute, if any.
623  * We don't cast the decision in stone until we reach keys for the
624  * next attribute.
625  */
626  curattr = 1;
627  chosen = NULL;
628  /* Also remember any scankey that implies a NOT NULL constraint */
629  impliesNN = NULL;
630 
631  /*
632  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
633  * pass to handle after-last-key processing. Actual exit from the
634  * loop is at one of the "break" statements below.
635  */
636  for (cur = so->keyData, i = 0;; cur++, i++)
637  {
638  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
639  {
640  /*
641  * Done looking at keys for curattr. If we didn't find a
642  * usable boundary key, see if we can deduce a NOT NULL key.
643  */
644  if (chosen == NULL && impliesNN != NULL &&
645  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
648  {
649  /* Yes, so build the key in notnullkeys[keysCount] */
650  chosen = &notnullkeys[keysCount];
651  ScanKeyEntryInitialize(chosen,
653  (impliesNN->sk_flags &
655  curattr,
656  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
659  InvalidOid,
660  InvalidOid,
661  InvalidOid,
662  (Datum) 0);
663  }
664 
665  /*
666  * If we still didn't find a usable boundary key, quit; else
667  * save the boundary key pointer in startKeys.
668  */
669  if (chosen == NULL)
670  break;
671  startKeys[keysCount++] = chosen;
672 
673  /*
674  * Adjust strat_total, and quit if we have stored a > or <
675  * key.
676  */
677  strat = chosen->sk_strategy;
678  if (strat != BTEqualStrategyNumber)
679  {
680  strat_total = strat;
681  if (strat == BTGreaterStrategyNumber ||
682  strat == BTLessStrategyNumber)
683  break;
684  }
685 
686  /*
687  * Done if that was the last attribute, or if next key is not
688  * in sequence (implying no boundary key is available for the
689  * next attribute).
690  */
691  if (i >= so->numberOfKeys ||
692  cur->sk_attno != curattr + 1)
693  break;
694 
695  /*
696  * Reset for next attr.
697  */
698  curattr = cur->sk_attno;
699  chosen = NULL;
700  impliesNN = NULL;
701  }
702 
703  /*
704  * Can we use this key as a starting boundary for this attr?
705  *
706  * If not, does it imply a NOT NULL constraint? (Because
707  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
708  * *any* inequality key works for that; we need not test.)
709  */
710  switch (cur->sk_strategy)
711  {
714  if (chosen == NULL)
715  {
716  if (ScanDirectionIsBackward(dir))
717  chosen = cur;
718  else
719  impliesNN = cur;
720  }
721  break;
723  /* override any non-equality choice */
724  chosen = cur;
725  break;
728  if (chosen == NULL)
729  {
730  if (ScanDirectionIsForward(dir))
731  chosen = cur;
732  else
733  impliesNN = cur;
734  }
735  break;
736  }
737  }
738  }
739 
740  /*
741  * If we found no usable boundary keys, we have to start from one end of
742  * the tree. Walk down that edge to the first or last key, and scan from
743  * there.
744  */
745  if (keysCount == 0)
746  return _bt_endpoint(scan, dir);
747 
748  /*
749  * We want to start the scan somewhere within the index. Set up an
750  * insertion scankey we can use to search for the boundary point we
751  * identified above. The insertion scankey is built in the local
752  * scankeys[] array, using the keys identified by startKeys[].
753  */
754  Assert(keysCount <= INDEX_MAX_KEYS);
755  for (i = 0; i < keysCount; i++)
756  {
757  ScanKey cur = startKeys[i];
758 
759  Assert(cur->sk_attno == i + 1);
760 
761  if (cur->sk_flags & SK_ROW_HEADER)
762  {
763  /*
764  * Row comparison header: look to the first row member instead.
765  *
766  * The member scankeys are already in insertion format (ie, they
767  * have sk_func = 3-way-comparison function), but we have to watch
768  * out for nulls, which _bt_preprocess_keys didn't check. A null
769  * in the first row member makes the condition unmatchable, just
770  * like qual_ok = false.
771  */
772  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
773 
774  Assert(subkey->sk_flags & SK_ROW_MEMBER);
775  if (subkey->sk_flags & SK_ISNULL)
776  return false;
777  memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
778 
779  /*
780  * If the row comparison is the last positioning key we accepted,
781  * try to add additional keys from the lower-order row members.
782  * (If we accepted independent conditions on additional index
783  * columns, we use those instead --- doesn't seem worth trying to
784  * determine which is more restrictive.) Note that this is OK
785  * even if the row comparison is of ">" or "<" type, because the
786  * condition applied to all but the last row member is effectively
787  * ">=" or "<=", and so the extra keys don't break the positioning
788  * scheme. But, by the same token, if we aren't able to use all
789  * the row members, then the part of the row comparison that we
790  * did use has to be treated as just a ">=" or "<=" condition, and
791  * so we'd better adjust strat_total accordingly.
792  */
793  if (i == keysCount - 1)
794  {
795  bool used_all_subkeys = false;
796 
797  Assert(!(subkey->sk_flags & SK_ROW_END));
798  for (;;)
799  {
800  subkey++;
801  Assert(subkey->sk_flags & SK_ROW_MEMBER);
802  if (subkey->sk_attno != keysCount + 1)
803  break; /* out-of-sequence, can't use it */
804  if (subkey->sk_strategy != cur->sk_strategy)
805  break; /* wrong direction, can't use it */
806  if (subkey->sk_flags & SK_ISNULL)
807  break; /* can't use null keys */
808  Assert(keysCount < INDEX_MAX_KEYS);
809  memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
810  keysCount++;
811  if (subkey->sk_flags & SK_ROW_END)
812  {
813  used_all_subkeys = true;
814  break;
815  }
816  }
817  if (!used_all_subkeys)
818  {
819  switch (strat_total)
820  {
822  strat_total = BTLessEqualStrategyNumber;
823  break;
825  strat_total = BTGreaterEqualStrategyNumber;
826  break;
827  }
828  }
829  break; /* done with outer loop */
830  }
831  }
832  else
833  {
834  /*
835  * Ordinary comparison key. Transform the search-style scan key
836  * to an insertion scan key by replacing the sk_func with the
837  * appropriate btree comparison function.
838  *
839  * If scankey operator is not a cross-type comparison, we can use
840  * the cached comparison function; otherwise gotta look it up in
841  * the catalogs. (That can't lead to infinite recursion, since no
842  * indexscan initiated by syscache lookup will use cross-data-type
843  * operators.)
844  *
845  * We support the convention that sk_subtype == InvalidOid means
846  * the opclass input type; this is a hack to simplify life for
847  * ScanKeyInit().
848  */
849  if (cur->sk_subtype == rel->rd_opcintype[i] ||
850  cur->sk_subtype == InvalidOid)
851  {
852  FmgrInfo *procinfo;
853 
854  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
856  cur->sk_flags,
857  cur->sk_attno,
859  cur->sk_subtype,
860  cur->sk_collation,
861  procinfo,
862  cur->sk_argument);
863  }
864  else
865  {
866  RegProcedure cmp_proc;
867 
868  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
869  rel->rd_opcintype[i],
870  cur->sk_subtype,
871  BTORDER_PROC);
872  if (!RegProcedureIsValid(cmp_proc))
873  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
874  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
875  cur->sk_attno, RelationGetRelationName(rel));
876  ScanKeyEntryInitialize(scankeys + i,
877  cur->sk_flags,
878  cur->sk_attno,
880  cur->sk_subtype,
881  cur->sk_collation,
882  cmp_proc,
883  cur->sk_argument);
884  }
885  }
886  }
887 
888  /*----------
889  * Examine the selected initial-positioning strategy to determine exactly
890  * where we need to start the scan, and set flag variables to control the
891  * code below.
892  *
893  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
894  * item >= scan key. If nextkey = true, they will locate the first
895  * item > scan key.
896  *
897  * If goback = true, we will then step back one item, while if
898  * goback = false, we will start the scan on the located item.
899  *----------
900  */
901  switch (strat_total)
902  {
904 
905  /*
906  * Find first item >= scankey, then back up one to arrive at last
907  * item < scankey. (Note: this positioning strategy is only used
908  * for a backward scan, so that is always the correct starting
909  * position.)
910  */
911  nextkey = false;
912  goback = true;
913  break;
914 
916 
917  /*
918  * Find first item > scankey, then back up one to arrive at last
919  * item <= scankey. (Note: this positioning strategy is only used
920  * for a backward scan, so that is always the correct starting
921  * position.)
922  */
923  nextkey = true;
924  goback = true;
925  break;
926 
928 
929  /*
930  * If a backward scan was specified, need to start with last equal
931  * item not first one.
932  */
933  if (ScanDirectionIsBackward(dir))
934  {
935  /*
936  * This is the same as the <= strategy. We will check at the
937  * end whether the found item is actually =.
938  */
939  nextkey = true;
940  goback = true;
941  }
942  else
943  {
944  /*
945  * This is the same as the >= strategy. We will check at the
946  * end whether the found item is actually =.
947  */
948  nextkey = false;
949  goback = false;
950  }
951  break;
952 
954 
955  /*
956  * Find first item >= scankey. (This is only used for forward
957  * scans.)
958  */
959  nextkey = false;
960  goback = false;
961  break;
962 
964 
965  /*
966  * Find first item > scankey. (This is only used for forward
967  * scans.)
968  */
969  nextkey = true;
970  goback = false;
971  break;
972 
973  default:
974  /* can't get here, but keep compiler quiet */
975  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
976  return false;
977  }
978 
979  /*
980  * Use the manufactured insertion scan key to descend the tree and
981  * position ourselves on the target leaf page.
982  */
983  stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
984  scan->xs_snapshot);
985 
986  /* don't need to keep the stack around... */
987  _bt_freestack(stack);
988 
989  if (!BufferIsValid(buf))
990  {
991  /*
992  * We only get here if the index is completely empty. Lock relation
993  * because nothing finer to lock exists.
994  */
996  return false;
997  }
998  else
1000  scan->xs_snapshot);
1001 
1002  /* initialize moreLeft/moreRight appropriately for scan direction */
1003  if (ScanDirectionIsForward(dir))
1004  {
1005  so->currPos.moreLeft = false;
1006  so->currPos.moreRight = true;
1007  }
1008  else
1009  {
1010  so->currPos.moreLeft = true;
1011  so->currPos.moreRight = false;
1012  }
1013  so->numKilled = 0; /* just paranoia */
1014  Assert(so->markItemIndex == -1);
1015 
1016  /* position to the precise item on the page */
1017  offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1018 
1019  /*
1020  * If nextkey = false, we are positioned at the first item >= scan key, or
1021  * possibly at the end of a page on which all the existing items are less
1022  * than the scan key and we know that everything on later pages is greater
1023  * than or equal to scan key.
1024  *
1025  * If nextkey = true, we are positioned at the first item > scan key, or
1026  * possibly at the end of a page on which all the existing items are less
1027  * than or equal to the scan key and we know that everything on later
1028  * pages is greater than scan key.
1029  *
1030  * The actually desired starting point is either this item or the prior
1031  * one, or in the end-of-page case it's the first item on the next page or
1032  * the last item on this page. Adjust the starting offset if needed. (If
1033  * this results in an offset before the first item or after the last one,
1034  * _bt_readpage will report no items found, and then we'll step to the
1035  * next page as needed.)
1036  */
1037  if (goback)
1038  offnum = OffsetNumberPrev(offnum);
1039 
1040  /* remember which buffer we have pinned, if any */
1042  so->currPos.buf = buf;
1043 
1044  /*
1045  * Now load data from the first page of the scan.
1046  */
1047  if (!_bt_readpage(scan, dir, offnum))
1048  {
1049  /*
1050  * There's no actually-matching data on this page. Try to advance to
1051  * the next page. Return false if there's no matching data at all.
1052  */
1054  if (!_bt_steppage(scan, dir))
1055  return false;
1056  }
1057  else
1058  {
1059  /* Drop the lock, and maybe the pin, on the current page */
1061  }
1062 
1063  /* OK, itemIndex says what to return */
1064  currItem = &so->currPos.items[so->currPos.itemIndex];
1065  scan->xs_ctup.t_self = currItem->heapTid;
1066  if (scan->xs_want_itup)
1067  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1068 
1069  return true;
1070 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
Definition: fmgr.h:53
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:164
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2438
#define BTORDER_PROC
Definition: nbtree.h:455
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:729
bool moreRight
Definition: nbtree.h:535
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:107
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:554
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:53
int itemIndex
Definition: nbtree.h:552
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2415
char * currTuples
Definition: nbtree.h:624
regproc RegProcedure
Definition: c.h:391
Snapshot xs_snapshot
Definition: relscan.h:90
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:576
bool moreLeft
Definition: nbtree.h:534
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:89
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:464
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:641
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData t_self
Definition: htup.h:65
static char * buf
Definition: pg_test_fsync.c:65
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1660
#define RegProcedureIsValid(p)
Definition: c.h:532
IndexTupleData * IndexTuple
Definition: itup.h:53
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:319
#define RelationGetRelationName(relation)
Definition: rel.h:428
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1152
#define SK_SEARCHNOTNULL
Definition: skey.h:122
Oid * rd_opfamily
Definition: rel.h:173
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:741
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:652
uintptr_t Datum
Definition: postgres.h:374
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:93
#define InvalidOid
Definition: postgres_ext.h:36
int markItemIndex
Definition: nbtree.h:634
bool xs_want_itup
Definition: relscan.h:95
int sk_flags
Definition: skey.h:66
#define NULL
Definition: c.h:226
#define Assert(condition)
Definition: c.h:667
#define SK_BT_DESC
Definition: nbtree.h:651
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
HeapTupleData xs_ctup
Definition: relscan.h:111
#define INDEX_MAX_KEYS
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int numberOfKeys
Definition: nbtree.h:605
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define DatumGetPointer(X)
Definition: postgres.h:557
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
BTScanPosData currPos
Definition: nbtree.h:637
ScanKey keyData
Definition: nbtree.h:606
Oid sk_collation
Definition: skey.h:70
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1290
#define elog
Definition: elog.h:219
Buffer buf
Definition: nbtree.h:522
Oid * rd_opcintype
Definition: rel.h:174
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1138
int16 AttrNumber
Definition: attnum.h:21
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67
Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 1578 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, buf, BufferGetPage, BufferIsValid, elog, ERROR, InvalidBuffer, ItemPointerGetBlockNumber, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, IndexTupleData::t_tid, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

1580 {
1581  Buffer buf;
1582  Page page;
1583  BTPageOpaque opaque;
1584  OffsetNumber offnum;
1585  BlockNumber blkno;
1586  IndexTuple itup;
1587 
1588  /*
1589  * If we are looking for a leaf page, okay to descend from fast root;
1590  * otherwise better descend from true root. (There is no point in being
1591  * smarter about intermediate levels.)
1592  */
1593  if (level == 0)
1594  buf = _bt_getroot(rel, BT_READ);
1595  else
1596  buf = _bt_gettrueroot(rel);
1597 
1598  if (!BufferIsValid(buf))
1599  return InvalidBuffer;
1600 
1601  page = BufferGetPage(buf);
1602  TestForOldSnapshot(snapshot, rel, page);
1603  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1604 
1605  for (;;)
1606  {
1607  /*
1608  * If we landed on a deleted page, step right to find a live page
1609  * (there must be one). Also, if we want the rightmost page, step
1610  * right if needed to get to it (this could happen if the page split
1611  * since we obtained a pointer to it).
1612  */
1613  while (P_IGNORE(opaque) ||
1614  (rightmost && !P_RIGHTMOST(opaque)))
1615  {
1616  blkno = opaque->btpo_next;
1617  if (blkno == P_NONE)
1618  elog(ERROR, "fell off the end of index \"%s\"",
1620  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1621  page = BufferGetPage(buf);
1622  TestForOldSnapshot(snapshot, rel, page);
1623  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1624  }
1625 
1626  /* Done? */
1627  if (opaque->btpo.level == level)
1628  break;
1629  if (opaque->btpo.level < level)
1630  elog(ERROR, "btree level %u not found in index \"%s\"",
1631  level, RelationGetRelationName(rel));
1632 
1633  /* Descend to leftmost or rightmost child page */
1634  if (rightmost)
1635  offnum = PageGetMaxOffsetNumber(page);
1636  else
1637  offnum = P_FIRSTDATAKEY(opaque);
1638 
1639  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1640  blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1641 
1642  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1643  page = BufferGetPage(buf);
1644  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1645  }
1646 
1647  return buf;
1648 }
BlockNumber btpo_next
Definition: nbtree.h:57
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
#define P_IGNORE(opaque)
Definition: nbtree.h:181
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:700
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:205
ItemPointerData t_tid
Definition: itup.h:37
#define P_NONE
Definition: nbtree.h:169
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
union BTPageOpaqueData::@39 btpo
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:464
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:65
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:428
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:103
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
uint32 level
Definition: nbtree.h:60
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:340
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define elog
Definition: elog.h:219
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:66
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:176
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
Buffer _bt_moveright ( Relation  rel,
Buffer  buf,
int  keysz,
ScanKey  scankey,
bool  nextkey,
bool  forupdate,
BTStack  stack,
int  access,
Snapshot  snapshot 
)

Definition at line 210 of file nbtsearch.c.

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo_next, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, elog, ERROR, LockBuffer(), P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_doinsert(), and _bt_search().

219 {
220  Page page;
221  BTPageOpaque opaque;
222  int32 cmpval;
223 
224  /*
225  * When nextkey = false (normal case): if the scan key that brought us to
226  * this page is > the high key stored on the page, then the page has split
227  * and we need to move right. (If the scan key is equal to the high key,
228  * we might or might not need to move right; have to scan the page first
229  * anyway.)
230  *
231  * When nextkey = true: move right if the scan key is >= page's high key.
232  *
233  * The page could even have split more than once, so scan as far as
234  * needed.
235  *
236  * We also have to move right if we followed a link that brought us to a
237  * dead page.
238  */
239  cmpval = nextkey ? 0 : 1;
240 
241  for (;;)
242  {
243  page = BufferGetPage(buf);
244  TestForOldSnapshot(snapshot, rel, page);
245  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
246 
247  if (P_RIGHTMOST(opaque))
248  break;
249 
250  /*
251  * Finish any incomplete splits we encounter along the way.
252  */
253  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
254  {
256 
257  /* upgrade our lock if necessary */
258  if (access == BT_READ)
259  {
262  }
263 
264  if (P_INCOMPLETE_SPLIT(opaque))
265  _bt_finish_split(rel, buf, stack);
266  else
267  _bt_relbuf(rel, buf);
268 
269  /* re-acquire the lock in the right mode, and re-check */
270  buf = _bt_getbuf(rel, blkno, access);
271  continue;
272  }
273 
274  if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
275  {
276  /* step right one page */
277  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
278  continue;
279  }
280  else
281  break;
282  }
283 
284  if (P_IGNORE(opaque))
285  elog(ERROR, "fell off the end of index \"%s\"",
287 
288  return buf;
289 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
BlockNumber btpo_next
Definition: nbtree.h:57
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
#define P_IGNORE(opaque)
Definition: nbtree.h:181
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:700
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:569
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:183
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
signed int int32
Definition: c.h:253
#define BT_READ
Definition: nbtree.h:464
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1744
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:65
#define RelationGetRelationName(relation)
Definition: rel.h:428
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:719
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define P_HIKEY
Definition: nbtree.h:203
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
#define elog
Definition: elog.h:219
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:424
#define BT_WRITE
Definition: nbtree.h:465
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:176
Pointer Page
Definition: bufpage.h:74
bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1087 of file nbtsearch.c.

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

1088 {
1089  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1090  BTScanPosItem *currItem;
1091 
1092  /*
1093  * Advance to next tuple on current page; or if there's no more, try to
1094  * step to the next page with data.
1095  */
1096  if (ScanDirectionIsForward(dir))
1097  {
1098  if (++so->currPos.itemIndex > so->currPos.lastItem)
1099  {
1100  if (!_bt_steppage(scan, dir))
1101  return false;
1102  }
1103  }
1104  else
1105  {
1106  if (--so->currPos.itemIndex < so->currPos.firstItem)
1107  {
1108  if (!_bt_steppage(scan, dir))
1109  return false;
1110  }
1111  }
1112 
1113  /* OK, itemIndex says what to return */
1114  currItem = &so->currPos.items[so->currPos.itemIndex];
1115  scan->xs_ctup.t_self = currItem->heapTid;
1116  if (scan->xs_want_itup)
1117  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1118 
1119  return true;
1120 }
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:107
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:554
int itemIndex
Definition: nbtree.h:552
char * currTuples
Definition: nbtree.h:624
int lastItem
Definition: nbtree.h:551
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:641
ItemPointerData t_self
Definition: htup.h:65
IndexTupleData * IndexTuple
Definition: itup.h:53
int firstItem
Definition: nbtree.h:550
bool xs_want_itup
Definition: relscan.h:95
HeapTupleData xs_ctup
Definition: relscan.h:111
BTScanPosData currPos
Definition: nbtree.h:637
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1290
static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum 
)
static

Definition at line 1138 of file nbtsearch.c.

References _bt_checkkeys(), _bt_saveitem(), Assert, BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxIndexTuplesPerPage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, NULL, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_steppage().

1139 {
1140  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1141  Page page;
1142  BTPageOpaque opaque;
1143  OffsetNumber minoff;
1144  OffsetNumber maxoff;
1145  int itemIndex;
1146  IndexTuple itup;
1147  bool continuescan;
1148 
1149  /*
1150  * We must have the buffer pinned and locked, but the usual macro can't be
1151  * used here; this function is what makes it good for currPos.
1152  */
1154 
1155  page = BufferGetPage(so->currPos.buf);
1156  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1157  minoff = P_FIRSTDATAKEY(opaque);
1158  maxoff = PageGetMaxOffsetNumber(page);
1159 
1160  /*
1161  * We note the buffer's block number so that we can release the pin later.
1162  * This allows us to re-read the buffer if it is needed again for hinting.
1163  */
1165 
1166  /*
1167  * We save the LSN of the page as we read it, so that we know whether it
1168  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1169  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1170  */
1171  so->currPos.lsn = PageGetLSN(page);
1172 
1173  /*
1174  * we must save the page's right-link while scanning it; this tells us
1175  * where to step right to after we're done with these items. There is no
1176  * corresponding need for the left-link, since splits always go right.
1177  */
1178  so->currPos.nextPage = opaque->btpo_next;
1179 
1180  /* initialize tuple workspace to empty */
1181  so->currPos.nextTupleOffset = 0;
1182 
1183  /*
1184  * Now that the current page has been made consistent, the macro should be
1185  * good.
1186  */
1188 
1189  if (ScanDirectionIsForward(dir))
1190  {
1191  /* load items[] in ascending order */
1192  itemIndex = 0;
1193 
1194  offnum = Max(offnum, minoff);
1195 
1196  while (offnum <= maxoff)
1197  {
1198  itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1199  if (itup != NULL)
1200  {
1201  /* tuple passes all scan key conditions, so remember it */
1202  _bt_saveitem(so, itemIndex, offnum, itup);
1203  itemIndex++;
1204  }
1205  if (!continuescan)
1206  {
1207  /* there can't be any more matches, so stop */
1208  so->currPos.moreRight = false;
1209  break;
1210  }
1211 
1212  offnum = OffsetNumberNext(offnum);
1213  }
1214 
1215  Assert(itemIndex <= MaxIndexTuplesPerPage);
1216  so->currPos.firstItem = 0;
1217  so->currPos.lastItem = itemIndex - 1;
1218  so->currPos.itemIndex = 0;
1219  }
1220  else
1221  {
1222  /* load items[] in descending order */
1223  itemIndex = MaxIndexTuplesPerPage;
1224 
1225  offnum = Min(offnum, maxoff);
1226 
1227  while (offnum >= minoff)
1228  {
1229  itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1230  if (itup != NULL)
1231  {
1232  /* tuple passes all scan key conditions, so remember it */
1233  itemIndex--;
1234  _bt_saveitem(so, itemIndex, offnum, itup);
1235  }
1236  if (!continuescan)
1237  {
1238  /* there can't be any more matches, so stop */
1239  so->currPos.moreLeft = false;
1240  break;
1241  }
1242 
1243  offnum = OffsetNumberPrev(offnum);
1244  }
1245 
1246  Assert(itemIndex >= 0);
1247  so->currPos.firstItem = itemIndex;
1250  }
1251 
1252  return (so->currPos.firstItem <= so->currPos.lastItem);
1253 }
bool moreRight
Definition: nbtree.h:535
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1355
int itemIndex
Definition: nbtree.h:552
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:205
#define Min(x, y)
Definition: c.h:798
BlockNumber currPage
Definition: nbtree.h:525
bool moreLeft
Definition: nbtree.h:534
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
uint16 OffsetNumber
Definition: off.h:24
int nextTupleOffset
Definition: nbtree.h:541
int lastItem
Definition: nbtree.h:551
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:641
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:559
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1257
int firstItem
Definition: nbtree.h:550
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define Max(x, y)
Definition: c.h:792
#define NULL
Definition: c.h:226
#define Assert(condition)
Definition: c.h:667
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define PageGetLSN(page)
Definition: bufpage.h:363
BlockNumber nextPage
Definition: nbtree.h:526
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
BTScanPosData currPos
Definition: nbtree.h:637
#define MaxIndexTuplesPerPage
Definition: itup.h:137
XLogRecPtr lsn
Definition: nbtree.h:524
Buffer buf
Definition: nbtree.h:522
Pointer Page
Definition: bufpage.h:74
static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1257 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

1259 {
1260  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1261 
1262  currItem->heapTid = itup->t_tid;
1263  currItem->indexOffset = offnum;
1264  if (so->currTuples)
1265  {
1266  Size itupsz = IndexTupleSize(itup);
1267 
1268  currItem->tupleOffset = so->currPos.nextTupleOffset;
1269  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1270  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1271  }
1272 }
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:554
char * currTuples
Definition: nbtree.h:624
ItemPointerData t_tid
Definition: itup.h:37
OffsetNumber indexOffset
Definition: nbtree.h:516
int nextTupleOffset
Definition: nbtree.h:541
size_t Size
Definition: c.h:352
#define MAXALIGN(LEN)
Definition: c.h:580
ItemPointerData heapTid
Definition: nbtree.h:515
BTScanPosData currPos
Definition: nbtree.h:637
LocationIndex tupleOffset
Definition: nbtree.h:517
#define IndexTupleSize(itup)
Definition: itup.h:70
BTStack _bt_search ( Relation  rel,
int  keysz,
ScanKey  scankey,
bool  nextkey,
Buffer bufP,
int  access,
Snapshot  snapshot 
)

Definition at line 93 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_moveright(), _bt_relandgetbuf(), BT_READ, BT_WRITE, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, ItemPointerGetBlockNumber, NULL, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, palloc(), and IndexTupleData::t_tid.

Referenced by _bt_doinsert(), _bt_first(), and _bt_pagedel().

95 {
96  BTStack stack_in = NULL;
97 
98  /* Get the root page to start with */
99  *bufP = _bt_getroot(rel, access);
100 
101  /* If index is empty and access = BT_READ, no root page is created. */
102  if (!BufferIsValid(*bufP))
103  return (BTStack) NULL;
104 
105  /* Loop iterates once per level descended in the tree */
106  for (;;)
107  {
108  Page page;
109  BTPageOpaque opaque;
110  OffsetNumber offnum;
111  ItemId itemid;
112  IndexTuple itup;
113  BlockNumber blkno;
114  BlockNumber par_blkno;
115  BTStack new_stack;
116 
117  /*
118  * Race -- the page we just grabbed may have split since we read its
119  * pointer in the parent (or metapage). If it has, we may need to
120  * move right to its new sibling. Do that.
121  *
122  * In write-mode, allow _bt_moveright to finish any incomplete splits
123  * along the way. Strictly speaking, we'd only need to finish an
124  * incomplete split on the leaf page we're about to insert to, not on
125  * any of the upper levels (they are taken care of in _bt_getstackbuf,
126  * if the leaf page is split and we insert to the parent page). But
127  * this is a good opportunity to finish splits of internal pages too.
128  */
129  *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
130  (access == BT_WRITE), stack_in,
131  BT_READ, snapshot);
132 
133  /* if this is a leaf page, we're done */
134  page = BufferGetPage(*bufP);
135  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
136  if (P_ISLEAF(opaque))
137  break;
138 
139  /*
140  * Find the appropriate item on the internal page, and get the child
141  * page that it points to.
142  */
143  offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
144  itemid = PageGetItemId(page, offnum);
145  itup = (IndexTuple) PageGetItem(page, itemid);
146  blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
147  par_blkno = BufferGetBlockNumber(*bufP);
148 
149  /*
150  * We need to save the location of the index entry we chose in the
151  * parent page on a stack. In case we split the tree, we'll use the
152  * stack to work back up to the parent page. We also save the actual
153  * downlink (TID) to uniquely identify the index entry, in case it
154  * moves right while we're working lower in the tree. See the paper
155  * by Lehman and Yao for how this is detected and handled. (We use the
156  * child link to disambiguate duplicate keys in the index -- Lehman
157  * and Yao disallow duplicate keys.)
158  */
159  new_stack = (BTStack) palloc(sizeof(BTStackData));
160  new_stack->bts_blkno = par_blkno;
161  new_stack->bts_offset = offnum;
162  memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
163  new_stack->bts_parent = stack_in;
164 
165  /* drop the read lock on the parent page, acquire one on the child */
166  *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
167 
168  /* okay, all set to move down a level */
169  stack_in = new_stack;
170  }
171 
172  return stack_in;
173 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:700
ItemPointerData t_tid
Definition: itup.h:37
BTStackData * BTStack
Definition: nbtree.h:485
uint32 BlockNumber
Definition: block.h:31
IndexTupleData bts_btentry
Definition: nbtree.h:481
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:464
Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:210
OffsetNumber bts_offset
Definition: nbtree.h:480
IndexTupleData * IndexTuple
Definition: itup.h:53
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:319
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:103
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
BlockNumber bts_blkno
Definition: nbtree.h:479
#define NULL
Definition: c.h:226
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
struct BTStackData * bts_parent
Definition: nbtree.h:482
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
void * palloc(Size size)
Definition: mcxt.c:891
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:66
#define BT_WRITE
Definition: nbtree.h:465
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:177
static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1290 of file nbtsearch.c.

References _bt_drop_lock_and_maybe_pin(), _bt_getbuf(), _bt_killitems(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), Assert, BT_READ, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::itemIndex, BTScanPosData::lastItem, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), ScanDirectionIsForward, TestForOldSnapshot(), and IndexScanDescData::xs_snapshot.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

1291 {
1292  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1293  Relation rel;
1294  Page page;
1295  BTPageOpaque opaque;
1296 
1298 
1299  /* Before leaving current page, deal with any killed items */
1300  if (so->numKilled > 0)
1301  _bt_killitems(scan);
1302 
1303  /*
1304  * Before we modify currPos, make a copy of the page data if there was a
1305  * mark position that needs it.
1306  */
1307  if (so->markItemIndex >= 0)
1308  {
1309  /* bump pin on current buffer for assignment to mark buffer */
1310  if (BTScanPosIsPinned(so->currPos))
1312  memcpy(&so->markPos, &so->currPos,
1313  offsetof(BTScanPosData, items[1]) +
1314  so->currPos.lastItem * sizeof(BTScanPosItem));
1315  if (so->markTuples)
1316  memcpy(so->markTuples, so->currTuples,
1317  so->currPos.nextTupleOffset);
1318  so->markPos.itemIndex = so->markItemIndex;
1319  so->markItemIndex = -1;
1320  }
1321 
1322  rel = scan->indexRelation;
1323 
1324  if (ScanDirectionIsForward(dir))
1325  {
1326  /* Walk right to the next page with data */
1327  /* We must rely on the previously saved nextPage link! */
1328  BlockNumber blkno = so->currPos.nextPage;
1329 
1330  /* Remember we left a page with data */
1331  so->currPos.moreLeft = true;
1332 
1333  /* release the previous buffer, if pinned */
1335 
1336  for (;;)
1337  {
1338  /* if we're at end of scan, give up */
1339  if (blkno == P_NONE || !so->currPos.moreRight)
1340  {
1342  return false;
1343  }
1344  /* check for interrupts while we're not holding any buffer lock */
1346  /* step right one page */
1347  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1348  /* check for deleted page */
1349  page = BufferGetPage(so->currPos.buf);
1350  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1351  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1352  if (!P_IGNORE(opaque))
1353  {
1354  PredicateLockPage(rel, blkno, scan->xs_snapshot);
1355  /* see if there are any matches on this page */
1356  /* note that this will clear moreRight if we can stop */
1357  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1358  break;
1359  }
1360 
1361  /* nope, keep going */
1362  blkno = opaque->btpo_next;
1363  _bt_relbuf(rel, so->currPos.buf);
1364  }
1365  }
1366  else
1367  {
1368  /* Remember we left a page with data */
1369  so->currPos.moreRight = true;
1370 
1371  /*
1372  * Walk left to the next page with data. This is much more complex
1373  * than the walk-right case because of the possibility that the page
1374  * to our left splits while we are in flight to it, plus the
1375  * possibility that the page we were on gets deleted after we leave
1376  * it. See nbtree/README for details.
1377  *
1378  * It might be possible to rearrange this code to have less overhead
1379  * in pinning and locking, but that would require capturing the left
1380  * pointer when the page is initially read, and using it here, along
1381  * with big changes to _bt_walk_left() and the code below. It is not
1382  * clear whether this would be a win, since if the page immediately to
1383  * the left splits after we read this page and before we step left, we
1384  * would need to visit more pages than with the current code.
1385  *
1386  * Note that if we change the code so that we drop the pin for a scan
1387  * which uses a non-MVCC snapshot, we will need to modify the code for
1388  * walking left, to allow for the possibility that a referenced page
1389  * has been deleted. As long as the buffer is pinned or the snapshot
1390  * is MVCC the page cannot move past the half-dead state to fully
1391  * deleted.
1392  */
1393  if (BTScanPosIsPinned(so->currPos))
1394  LockBuffer(so->currPos.buf, BT_READ);
1395  else
1396  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1397 
1398  for (;;)
1399  {
1400  /* Done if we know there are no matching keys to the left */
1401  if (!so->currPos.moreLeft)
1402  {
1403  _bt_relbuf(rel, so->currPos.buf);
1405  return false;
1406  }
1407 
1408  /* Step to next physical page */
1409  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1410  scan->xs_snapshot);
1411 
1412  /* if we're physically at end of index, return failure */
1413  if (so->currPos.buf == InvalidBuffer)
1414  {
1416  return false;
1417  }
1418 
1419  /*
1420  * Okay, we managed to move left to a non-deleted page. Done if
1421  * it's not half-dead and contains matching tuples. Else loop back
1422  * and do it all again.
1423  */
1424  page = BufferGetPage(so->currPos.buf);
1425  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1426  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1427  if (!P_IGNORE(opaque))
1428  {
1430  /* see if there are any matches on this page */
1431  /* note that this will clear moreLeft if we can stop */
1432  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1433  break;
1434  }
1435  }
1436  }
1437 
1438  /* Drop the lock, and maybe the pin, on the current page */
1440 
1441  return true;
1442 }
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2438
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
#define P_IGNORE(opaque)
Definition: nbtree.h:181
bool moreRight
Definition: nbtree.h:535
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:569
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:53
int itemIndex
Definition: nbtree.h:552
char * currTuples
Definition: nbtree.h:624
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:205
#define P_NONE
Definition: nbtree.h:169
Snapshot xs_snapshot
Definition: relscan.h:90
#define InvalidBuffer
Definition: buf.h:25
BlockNumber currPage
Definition: nbtree.h:525
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:576
uint32 BlockNumber
Definition: block.h:31
bool moreLeft
Definition: nbtree.h:534
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
Relation indexRelation
Definition: relscan.h:89
#define BT_READ
Definition: nbtree.h:464
int nextTupleOffset
Definition: nbtree.h:541
int lastItem
Definition: nbtree.h:551
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1728
BTScanPosData markPos
Definition: nbtree.h:638
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:641
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:559
char * markTuples
Definition: nbtree.h:625
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:719
int markItemIndex
Definition: nbtree.h:634
#define Assert(condition)
Definition: c.h:667
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:582
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
BlockNumber nextPage
Definition: nbtree.h:526
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
BTScanPosData currPos
Definition: nbtree.h:637
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:97
Buffer buf
Definition: nbtree.h:522
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1138
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3330
#define offsetof(type, field)
Definition: c.h:547
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:570
Pointer Page
Definition: bufpage.h:74
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
Definition: nbtsearch.c:1459
static Buffer _bt_walk_left ( Relation  rel,
Buffer  buf,
Snapshot  snapshot 
)
static

Definition at line 1459 of file nbtsearch.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_steppage().

1460 {
1461  Page page;
1462  BTPageOpaque opaque;
1463 
1464  page = BufferGetPage(buf);
1465  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1466 
1467  for (;;)
1468  {
1469  BlockNumber obknum;
1470  BlockNumber lblkno;
1471  BlockNumber blkno;
1472  int tries;
1473 
1474  /* if we're at end of tree, release buf and return failure */
1475  if (P_LEFTMOST(opaque))
1476  {
1477  _bt_relbuf(rel, buf);
1478  break;
1479  }
1480  /* remember original page we are stepping left from */
1481  obknum = BufferGetBlockNumber(buf);
1482  /* step left */
1483  blkno = lblkno = opaque->btpo_prev;
1484  _bt_relbuf(rel, buf);
1485  /* check for interrupts while we're not holding any buffer lock */
1487  buf = _bt_getbuf(rel, blkno, BT_READ);
1488  page = BufferGetPage(buf);
1489  TestForOldSnapshot(snapshot, rel, page);
1490  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1491 
1492  /*
1493  * If this isn't the page we want, walk right till we find what we
1494  * want --- but go no more than four hops (an arbitrary limit). If we
1495  * don't find the correct page by then, the most likely bet is that
1496  * the original page got deleted and isn't in the sibling chain at all
1497  * anymore, not that its left sibling got split more than four times.
1498  *
1499  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1500  * because half-dead pages are still in the sibling chain. Caller
1501  * must reject half-dead pages if wanted.
1502  */
1503  tries = 0;
1504  for (;;)
1505  {
1506  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1507  {
1508  /* Found desired page, return it */
1509  return buf;
1510  }
1511  if (P_RIGHTMOST(opaque) || ++tries > 4)
1512  break;
1513  blkno = opaque->btpo_next;
1514  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1515  page = BufferGetPage(buf);
1516  TestForOldSnapshot(snapshot, rel, page);
1517  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1518  }
1519 
1520  /* Return to the original page to see what's up */
1521  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1522  page = BufferGetPage(buf);
1523  TestForOldSnapshot(snapshot, rel, page);
1524  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1525  if (P_ISDELETED(opaque))
1526  {
1527  /*
1528  * It was deleted. Move right to first nondeleted page (there
1529  * must be one); that is the page that has acquired the deleted
1530  * one's keyspace, so stepping left from it will take us where we
1531  * want to be.
1532  */
1533  for (;;)
1534  {
1535  if (P_RIGHTMOST(opaque))
1536  elog(ERROR, "fell off the end of index \"%s\"",
1538  blkno = opaque->btpo_next;
1539  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1540  page = BufferGetPage(buf);
1541  TestForOldSnapshot(snapshot, rel, page);
1542  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1543  if (!P_ISDELETED(opaque))
1544  break;
1545  }
1546 
1547  /*
1548  * Now return to top of loop, resetting obknum to point to this
1549  * nondeleted page, and try again.
1550  */
1551  }
1552  else
1553  {
1554  /*
1555  * It wasn't deleted; the explanation had better be that the page
1556  * to the left got split or deleted. Without this check, we'd go
1557  * into an infinite loop if there's anything wrong.
1558  */
1559  if (opaque->btpo_prev == lblkno)
1560  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1561  obknum, RelationGetRelationName(rel));
1562  /* Okay to try again with new lblkno value */
1563  }
1564  }
1565 
1566  return InvalidBuffer;
1567 }
BlockNumber btpo_next
Definition: nbtree.h:57
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:700
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:569
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
#define BT_READ
Definition: nbtree.h:464
#define ERROR
Definition: elog.h:43
BlockNumber btpo_prev
Definition: nbtree.h:56
static char * buf
Definition: pg_test_fsync.c:65
#define RelationGetRelationName(relation)
Definition: rel.h:428
#define P_LEFTMOST(opaque)
Definition: nbtree.h:175
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define P_ISDELETED(opaque)
Definition: nbtree.h:179
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:719
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:97
#define elog
Definition: elog.h:219
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:176
Pointer Page
Definition: bufpage.h:74