PostgreSQL Source Code  git master
nbtsearch.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtsearch.c
4  * Search code for postgres btrees.
5  *
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 
26 
/*
 * Forward declarations for routines local to this file.
 *
 * NOTE(review): original source lines 27-28 are missing from this extraction;
 * judging by the definitions that follow, they were presumably the prototypes
 * for _bt_drop_lock_and_maybe_pin() and _bt_binsrch() -- confirm against the
 * upstream file before compiling.
 */
29 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
30  OffsetNumber offnum);
31 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
32  OffsetNumber offnum, IndexTuple itup);
33 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
34 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
35 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
36  ScanDirection dir);
37 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
38 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40 
41 
42 /*
43  * _bt_drop_lock_and_maybe_pin()
44  *
45  * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46  * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47  * This will prevent vacuum from stalling in a blocked state trying to read a
48  * page when a cursor is sitting on it -- at least in many important cases.
49  *
50  * Set the buffer to invalid if the pin is released, since the buffer may be
51  * re-used. If we need to go back to this block (for example, to apply
52  * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53  * will remain in shared memory for as long as it takes to scan the index
54  * buffer page.
55  */
56 static void
58 {
60 
61  if (IsMVCCSnapshot(scan->xs_snapshot) &&
63  !scan->xs_want_itup)
64  {
65  ReleaseBuffer(sp->buf);
66  sp->buf = InvalidBuffer;
67  }
68 }
69 
70 /*
71  * _bt_search() -- Search the tree for a particular scankey,
72  * or more precisely for the first leaf page it could be on.
73  *
74  * The passed scankey is an insertion-type scankey (see nbtree/README),
75  * but it can omit the rightmost column(s) of the index.
76  *
77  * Return value is a stack of parent-page pointers. *bufP is set to the
78  * address of the leaf-page buffer, which is read-locked and pinned.
79  * No locks are held on the parent pages, however!
80  *
81  * If the snapshot parameter is not NULL, "old snapshot" checking will take
82  * place during the descent through the tree. This is not needed when
83  * positioning for an insert or delete, so NULL is used for those cases.
84  *
85  * The returned buffer is locked according to access parameter. Additionally,
86  * access = BT_WRITE will allow an empty root page to be created and returned.
87  * When access = BT_READ, an empty index will result in *bufP being set to
88  * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
89  * during the search will be finished.
90  */
91 BTStack
92 _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
93  Snapshot snapshot)
94 {
95  BTStack stack_in = NULL;
96  int page_access = BT_READ;
97 
98  /* Get the root page to start with */
99  *bufP = _bt_getroot(rel, access);
100 
101  /* If index is empty and access = BT_READ, no root page is created. */
102  if (!BufferIsValid(*bufP))
103  return (BTStack) NULL;
104 
105  /* Loop iterates once per level descended in the tree */
106  for (;;)
107  {
108  Page page;
109  BTPageOpaque opaque;
110  OffsetNumber offnum;
111  ItemId itemid;
112  IndexTuple itup;
113  BlockNumber blkno;
114  BlockNumber par_blkno;
115  BTStack new_stack;
116 
117  /*
118  * Race -- the page we just grabbed may have split since we read its
119  * pointer in the parent (or metapage). If it has, we may need to
120  * move right to its new sibling. Do that.
121  *
122  * In write-mode, allow _bt_moveright to finish any incomplete splits
123  * along the way. Strictly speaking, we'd only need to finish an
124  * incomplete split on the leaf page we're about to insert to, not on
125  * any of the upper levels (they are taken care of in _bt_getstackbuf,
126  * if the leaf page is split and we insert to the parent page). But
127  * this is a good opportunity to finish splits of internal pages too.
128  */
129  *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130  page_access, snapshot);
131 
132  /* if this is a leaf page, we're done */
133  page = BufferGetPage(*bufP);
134  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135  if (P_ISLEAF(opaque))
136  break;
137 
138  /*
139  * Find the appropriate item on the internal page, and get the child
140  * page that it points to.
141  */
142  offnum = _bt_binsrch(rel, key, *bufP);
143  itemid = PageGetItemId(page, offnum);
144  itup = (IndexTuple) PageGetItem(page, itemid);
145  blkno = BTreeInnerTupleGetDownLink(itup);
146  par_blkno = BufferGetBlockNumber(*bufP);
147 
148  /*
149  * We need to save the location of the index entry we chose in the
150  * parent page on a stack. In case we split the tree, we'll use the
151  * stack to work back up to the parent page. We also save the actual
152  * downlink (block) to uniquely identify the index entry, in case it
153  * moves right while we're working lower in the tree. See the paper
154  * by Lehman and Yao for how this is detected and handled. (We use the
155  * child link during the second half of a page split -- if caller ends
156  * up splitting the child it usually ends up inserting a new pivot
157  * tuple for child's new right sibling immediately after the original
158  * bts_offset offset recorded here. The downlink block will be needed
159  * to check if bts_offset remains the position of this same pivot
160  * tuple.)
161  */
162  new_stack = (BTStack) palloc(sizeof(BTStackData));
163  new_stack->bts_blkno = par_blkno;
164  new_stack->bts_offset = offnum;
165  new_stack->bts_btentry = blkno;
166  new_stack->bts_parent = stack_in;
167 
168  /*
169  * Page level 1 is lowest non-leaf page level prior to leaves. So, if
170  * we're on the level 1 and asked to lock leaf page in write mode,
171  * then lock next page in write mode, because it must be a leaf.
172  */
173  if (opaque->btpo.level == 1 && access == BT_WRITE)
174  page_access = BT_WRITE;
175 
176  /* drop the read lock on the parent page, acquire one on the child */
177  *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
178 
179  /* okay, all set to move down a level */
180  stack_in = new_stack;
181  }
182 
183  /*
184  * If we're asked to lock leaf in write mode, but didn't manage to, then
185  * relock. This should only happen when the root page is a leaf page (and
186  * the only page in the index other than the metapage).
187  */
188  if (access == BT_WRITE && page_access == BT_READ)
189  {
190  /* trade in our read lock for a write lock */
/*
 * NOTE(review): original line 191 is missing from this extraction; it
 * presumably released the read lock first, i.e.
 * "LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);" -- without it the LockBuffer
 * call below would deadlock against our own read lock. Restore before
 * compiling.
 */
192  LockBuffer(*bufP, BT_WRITE);
193 
194  /*
195  * If the page was split between the time that we surrendered our read
196  * lock and acquired our write lock, then this page may no longer be
197  * the right place for the key we want to insert. In this case, we
198  * need to move right in the tree. See Lehman and Yao for an
199  * excruciatingly precise description.
200  */
201  *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
202  snapshot);
203  }
204 
205  return stack_in;
206 }
207 
208 /*
209  * _bt_moveright() -- move right in the btree if necessary.
210  *
211  * When we follow a pointer to reach a page, it is possible that
212  * the page has changed in the meanwhile. If this happens, we're
213  * guaranteed that the page has "split right" -- that is, that any
214  * data that appeared on the page originally is either on the page
215  * or strictly to the right of it.
216  *
217  * This routine decides whether or not we need to move right in the
218  * tree by examining the high key entry on the page. If that entry is
219  * strictly less than the scankey, or <= the scankey in the
220  * key.nextkey=true case, then we followed the wrong link and we need
221  * to move right.
222  *
223  * The passed insertion-type scankey can omit the rightmost column(s) of the
224  * index. (see nbtree/README)
225  *
226  * When key.nextkey is false (the usual case), we are looking for the first
227  * item >= key. When key.nextkey is true, we are looking for the first item
228  * strictly greater than key.
229  *
230  * If forupdate is true, we will attempt to finish any incomplete splits
231  * that we encounter. This is required when locking a target page for an
232  * insertion, because we don't allow inserting on a page before the split
233  * is completed. 'stack' is only used if forupdate is true.
234  *
235  * On entry, we have the buffer pinned and a lock of the type specified by
236  * 'access'. If we move right, we release the buffer and lock and acquire
237  * the same on the right sibling. Return value is the buffer we stop at.
238  *
239  * If the snapshot parameter is not NULL, "old snapshot" checking will take
240  * place during the descent through the tree. This is not needed when
241  * positioning for an insert or delete, so NULL is used for those cases.
242  */
243 Buffer
/*
 * NOTE(review): original lines 244-245 -- the function name and first
 * parameters, presumably "_bt_moveright(Relation rel," and
 * "BTScanInsert key," -- are missing from this extraction; restore before
 * compiling.
 */
246  Buffer buf,
247  bool forupdate,
248  BTStack stack,
249  int access,
250  Snapshot snapshot)
251 {
252  Page page;
253  BTPageOpaque opaque;
254  int32 cmpval;
255 
256  /*
257  * When nextkey = false (normal case): if the scan key that brought us to
258  * this page is > the high key stored on the page, then the page has split
259  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
260  * have some duplicates to the right as well as the left, but that's
261  * something that's only ever dealt with on the leaf level, after
262  * _bt_search has found an initial leaf page.)
263  *
264  * When nextkey = true: move right if the scan key is >= page's high key.
265  * (Note that key.scantid cannot be set in this case.)
266  *
267  * The page could even have split more than once, so scan as far as
268  * needed.
269  *
270  * We also have to move right if we followed a link that brought us to a
271  * dead page.
272  */
273  cmpval = key->nextkey ? 0 : 1;
274 
275  for (;;)
276  {
277  page = BufferGetPage(buf);
278  TestForOldSnapshot(snapshot, rel, page);
279  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
280 
281  if (P_RIGHTMOST(opaque))
282  break;
283 
284  /*
285  * Finish any incomplete splits we encounter along the way.
286  */
287  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
288  {
289  BlockNumber blkno = BufferGetBlockNumber(buf);
290 
291  /* upgrade our lock if necessary */
292  if (access == BT_READ)
293  {
/*
 * NOTE(review): original line 294 is missing from this extraction; it
 * presumably dropped the read lock before upgrading, i.e.
 * "LockBuffer(buf, BUFFER_LOCK_UNLOCK);".
 */
295  LockBuffer(buf, BT_WRITE);
296  }
297 
/* Re-check under the write lock: another backend may have finished the split */
298  if (P_INCOMPLETE_SPLIT(opaque))
299  _bt_finish_split(rel, buf, stack);
300  else
301  _bt_relbuf(rel, buf);
302 
303  /* re-acquire the lock in the right mode, and re-check */
304  buf = _bt_getbuf(rel, blkno, access);
305  continue;
306  }
307 
308  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
309  {
310  /* step right one page */
311  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
312  continue;
313  }
314  else
315  break;
316  }
317 
318  if (P_IGNORE(opaque))
319  elog(ERROR, "fell off the end of index \"%s\"",
/*
 * NOTE(review): original line 320 -- the elog() argument, presumably
 * "RelationGetRelationName(rel));" -- is missing from this extraction.
 */
321 
322  return buf;
323 }
324 
325 /*
326  * _bt_binsrch() -- Do a binary search for a key on a particular page.
327  *
328  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
329  * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
330  * particular, this means it is possible to return a value 1 greater than the
331  * number of keys on the page, if the scankey is > all keys on the page.)
332  *
333  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
334  * of the last key < given scankey, or last key <= given scankey if nextkey
335  * is true. (Since _bt_compare treats the first data key of such a page as
336  * minus infinity, there will be at least one key < scankey, so the result
337  * always points at one of the keys on the page.) This key indicates the
338  * right place to descend to be sure we find all leaf keys >= given scankey
339  * (or leaf keys > given scankey when nextkey is true).
340  *
341  * This procedure is not responsible for walking right, it just examines
342  * the given page. _bt_binsrch() has no lock or refcount side effects
343  * on the buffer.
344  */
345 static OffsetNumber
/*
 * NOTE(review): original lines 346-347 -- the function name and first
 * parameters, presumably "_bt_binsrch(Relation rel," and
 * "BTScanInsert key," -- are missing from this extraction; restore before
 * compiling.
 */
348  Buffer buf)
349 {
350  Page page;
351  BTPageOpaque opaque;
352  OffsetNumber low,
353  high;
354  int32 result,
355  cmpval;
356 
357  /* Requesting nextkey semantics while using scantid seems nonsensical */
358  Assert(!key->nextkey || key->scantid == NULL);
359 
360  page = BufferGetPage(buf);
361  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362 
363  low = P_FIRSTDATAKEY(opaque);
364  high = PageGetMaxOffsetNumber(page);
365 
366  /*
367  * If there are no keys on the page, return the first available slot. Note
368  * this covers two cases: the page is really empty (no keys), or it
369  * contains only a high key. The latter case is possible after vacuuming.
370  * This can never happen on an internal page, however, since they are
371  * never empty (an internal page must have children).
372  */
373  if (unlikely(high < low))
374  return low;
375 
376  /*
377  * Binary search to find the first key on the page >= scan key, or first
378  * key > scankey when nextkey is true.
379  *
380  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
382  *
383  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
385  *
386  * We can fall out when high == low.
387  */
388  high++; /* establish the loop invariant for high */
389 
390  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
391 
392  while (high > low)
393  {
394  OffsetNumber mid = low + ((high - low) / 2);
395 
396  /* We have low <= mid < high, so mid points at a real slot */
397 
398  result = _bt_compare(rel, key, page, mid);
399 
400  if (result >= cmpval)
401  low = mid + 1;
402  else
403  high = mid;
404  }
405 
406  /*
407  * At this point we have high == low, but be careful: they could point
408  * past the last slot on the page.
409  *
410  * On a leaf page, we always return the first key >= scan key (resp. >
411  * scan key), which could be the last slot + 1.
412  */
413  if (P_ISLEAF(opaque))
414  return low;
415 
416  /*
417  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
418  * There must be one if _bt_compare() is playing by the rules.
419  */
420  Assert(low > P_FIRSTDATAKEY(opaque));
421 
422  return OffsetNumberPrev(low);
423 }
424 
425 /*
426  *
427  * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
428  *
429  * Like _bt_binsrch(), but with support for caching the binary search
430  * bounds. Only used during insertion, and only on the leaf page that it
431  * looks like caller will insert tuple on. Exclusive-locked and pinned
432  * leaf page is contained within insertstate.
433  *
434  * Caches the bounds fields in insertstate so that a subsequent call can
435  * reuse the low and strict high bounds of original binary search. Callers
436  * that use these fields directly must be prepared for the case where low
437  * and/or stricthigh are not on the same page (one or both exceed maxoff
438  * for the page). The case where there are no items on the page (high <
439  * low) makes bounds invalid.
440  *
441  * Caller is responsible for invalidating bounds when it modifies the page
442  * before calling here a second time.
443  */
/*
 * NOTE(review): original lines 444-445 -- the return type and signature,
 * presumably "OffsetNumber" and
 * "_bt_binsrch_insert(Relation rel, BTInsertState insertstate)" -- are
 * missing from this extraction; restore before compiling.
 */
446 {
447  BTScanInsert key = insertstate->itup_key;
448  Page page;
449  BTPageOpaque opaque;
450  OffsetNumber low,
451  high,
452  stricthigh;
453  int32 result,
454  cmpval;
455 
456  page = BufferGetPage(insertstate->buf);
457  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
458 
459  Assert(P_ISLEAF(opaque));
460  Assert(!key->nextkey);
461 
462  if (!insertstate->bounds_valid)
463  {
464  /* Start new binary search */
465  low = P_FIRSTDATAKEY(opaque);
466  high = PageGetMaxOffsetNumber(page);
467  }
468  else
469  {
470  /* Restore result of previous binary search against same page */
471  low = insertstate->low;
472  high = insertstate->stricthigh;
473  }
474 
475  /* If there are no keys on the page, return the first available slot */
476  if (unlikely(high < low))
477  {
478  /* Caller can't reuse bounds */
479  insertstate->low = InvalidOffsetNumber;
480  insertstate->stricthigh = InvalidOffsetNumber;
481  insertstate->bounds_valid = false;
482  return low;
483  }
484 
485  /*
486  * Binary search to find the first key on the page >= scan key. (nextkey
487  * is always false when inserting).
488  *
489  * The loop invariant is: all slots before 'low' are < scan key, all slots
490  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491  * maintained to save additional search effort for caller.
492  *
493  * We can fall out when high == low.
494  */
495  if (!insertstate->bounds_valid)
496  high++; /* establish the loop invariant for high */
497  stricthigh = high; /* high initially strictly higher */
498 
499  cmpval = 1; /* !nextkey comparison value */
500 
501  while (high > low)
502  {
503  OffsetNumber mid = low + ((high - low) / 2);
504 
505  /* We have low <= mid < high, so mid points at a real slot */
506 
507  result = _bt_compare(rel, key, page, mid);
508 
509  if (result >= cmpval)
510  low = mid + 1;
511  else
512  {
513  high = mid;
/* Only a strictly-greater slot tightens stricthigh; ties leave it alone */
514  if (result != 0)
515  stricthigh = high;
516  }
517  }
518 
519  /*
520  * On a leaf page, a binary search always returns the first key >= scan
521  * key (at least in !nextkey case), which could be the last slot + 1. This
522  * is also the lower bound of cached search.
523  *
524  * stricthigh may also be the last slot + 1, which prevents caller from
525  * using bounds directly, but is still useful to us if we're called a
526  * second time with cached bounds (cached low will be < stricthigh when
527  * that happens).
528  */
529  insertstate->low = low;
530  insertstate->stricthigh = stricthigh;
531  insertstate->bounds_valid = true;
532 
533  return low;
534 }
535 
536 /*----------
537  * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
538  *
539  * page/offnum: location of btree item to be compared to.
540  *
541  * This routine returns:
542  * <0 if scankey < tuple at offnum;
543  * 0 if scankey == tuple at offnum;
544  * >0 if scankey > tuple at offnum.
545  * NULLs in the keys are treated as sortable values. Therefore
546  * "equality" does not necessarily mean that the item should be
547  * returned to the caller as a matching key!
548  *
549  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
550  * "minus infinity": this routine will always claim it is less than the
551  * scankey. The actual key value stored is explicitly truncated to 0
552  * attributes (explicitly minus infinity) with version 3+ indexes, but
553  * that isn't relied upon. This allows us to implement the Lehman and
554  * Yao convention that the first down-link pointer is before the first
555  * key. See backend/access/nbtree/README for details.
556  *----------
557  */
558 int32
/*
 * NOTE(review): original lines 559-560 -- the function name and first
 * parameters, presumably "_bt_compare(Relation rel," and
 * "BTScanInsert key," -- are missing from this extraction; restore before
 * compiling.
 */
561  Page page,
562  OffsetNumber offnum)
563 {
564  TupleDesc itupdesc = RelationGetDescr(rel);
/*
 * NOTE(review): original line 565 is missing; "opaque" is used below, so it
 * was presumably
 * "BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);".
 */
566  IndexTuple itup;
567  ItemPointer heapTid;
568  ScanKey scankey;
569  int ncmpkey;
570  int ntupatts;
571 
572  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
/*
 * NOTE(review): original line 573 (another Assert, judging by the numbering
 * gap) is missing from this extraction.
 */
574  Assert(key->heapkeyspace || key->scantid == NULL);
575 
576  /*
577  * Force result ">" if target item is first data item on an internal page
578  * --- see NOTE above.
579  */
580  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
581  return 1;
582 
583  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
584  ntupatts = BTreeTupleGetNAtts(itup, rel);
585 
586  /*
587  * The scan key is set up with the attribute number associated with each
588  * term in the key. It is important that, if the index is multi-key, the
589  * scan contain the first k key attributes, and that they be in order. If
590  * you think about how multi-key ordering works, you'll understand why
591  * this is.
592  *
593  * We don't test for violation of this condition here, however. The
594  * initial setup for the index scan had better have gotten it right (see
595  * _bt_first).
596  */
597 
598  ncmpkey = Min(ntupatts, key->keysz);
599  Assert(key->heapkeyspace || ncmpkey == key->keysz);
600  scankey = key->scankeys;
601  for (int i = 1; i <= ncmpkey; i++)
602  {
603  Datum datum;
604  bool isNull;
605  int32 result;
606 
607  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
608 
609  /* see comments about NULLs handling in btbuild */
610  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
611  {
612  if (isNull)
613  result = 0; /* NULL "=" NULL */
614  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
615  result = -1; /* NULL "<" NOT_NULL */
616  else
617  result = 1; /* NULL ">" NOT_NULL */
618  }
619  else if (isNull) /* key is NOT_NULL and item is NULL */
620  {
621  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
622  result = 1; /* NOT_NULL ">" NULL */
623  else
624  result = -1; /* NOT_NULL "<" NULL */
625  }
626  else
627  {
628  /*
629  * The sk_func needs to be passed the index value as left arg and
630  * the sk_argument as right arg (they might be of different
631  * types). Since it is convenient for callers to think of
632  * _bt_compare as comparing the scankey to the index item, we have
633  * to flip the sign of the comparison result. (Unless it's a DESC
634  * column, in which case we *don't* flip the sign.)
635  */
636  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
637  scankey->sk_collation,
638  datum,
639  scankey->sk_argument));
640 
641  if (!(scankey->sk_flags & SK_BT_DESC))
642  INVERT_COMPARE_RESULT(result);
643  }
644 
645  /* if the keys are unequal, return the difference */
646  if (result != 0)
647  return result;
648 
649  scankey++;
650  }
651 
652  /*
653  * All non-truncated attributes (other than heap TID) were found to be
654  * equal. Treat truncated attributes as minus infinity when scankey has a
655  * key attribute value that would otherwise be compared directly.
656  *
657  * Note: it doesn't matter if ntupatts includes non-key attributes;
658  * scankey won't, so explicitly excluding non-key attributes isn't
659  * necessary.
660  */
661  if (key->keysz > ntupatts)
662  return 1;
663 
664  /*
665  * Use the heap TID attribute and scantid to try to break the tie. The
666  * rules are the same as any other key attribute -- only the
667  * representation differs.
668  */
669  heapTid = BTreeTupleGetHeapTID(itup);
670  if (key->scantid == NULL)
671  {
672  /*
673  * Most searches have a scankey that is considered greater than a
674  * truncated pivot tuple if and when the scankey has equal values for
675  * attributes up to and including the least significant untruncated
676  * attribute in tuple.
677  *
678  * For example, if an index has the minimum two attributes (single
679  * user key attribute, plus heap TID attribute), and a page's high key
680  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
681  * will not descend to the page to the left. The search will descend
682  * right instead. The truncated attribute in pivot tuple means that
683  * all non-pivot tuples on the page to the left are strictly < 'foo',
684  * so it isn't necessary to descend left. In other words, search
685  * doesn't have to descend left because it isn't interested in a match
686  * that has a heap TID value of -inf.
687  *
688  * However, some searches (pivotsearch searches) actually require that
689  * we descend left when this happens. -inf is treated as a possible
690  * match for omitted scankey attribute(s). This is needed by page
691  * deletion, which must re-find leaf pages that are targets for
692  * deletion using their high keys.
693  *
694  * Note: the heap TID part of the test ensures that scankey is being
695  * compared to a pivot tuple with one or more truncated key
696  * attributes.
697  *
698  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
699  * left here, since they have no heap TID attribute (and cannot have
700  * any -inf key values in any case, since truncation can only remove
701  * non-key attributes). !heapkeyspace searches must always be
702  * prepared to deal with matches on both sides of the pivot once the
703  * leaf level is reached.
704  */
705  if (key->heapkeyspace && !key->pivotsearch &&
706  key->keysz == ntupatts && heapTid == NULL)
707  return 1;
708 
709  /* All provided scankey arguments found to be equal */
710  return 0;
711  }
712 
713  /*
714  * Treat truncated heap TID as minus infinity, since scankey has a key
715  * attribute value (scantid) that would otherwise be compared directly
716  */
/*
 * NOTE(review): original lines 717 and 721 are missing from this
 * extraction (likely blank or trivial lines, given the surrounding code).
 */
718  if (heapTid == NULL)
719  return 1;
720 
722  return ItemPointerCompare(key->scantid, heapTid);
723 }
724 
725 /*
726  * _bt_first() -- Find the first item in a scan.
727  *
728  * We need to be clever about the direction of scan, the search
729  * conditions, and the tree ordering. We find the first item (or,
730  * if backwards scan, the last item) in the tree that satisfies the
731  * qualifications in the scan key. On success exit, the page containing
732  * the current index tuple is pinned but not locked, and data about
733  * the matching tuple(s) on the page has been loaded into so->currPos.
734  * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
735  * and if requested, scan->xs_itup points to a copy of the index tuple.
736  *
737  * If there are no matching items in the index, we return false, with no
738  * pins or locks held.
739  *
740  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
741  * are both search-type scankeys (see nbtree/README for more about this).
742  * Within this routine, we build a temporary insertion-type scankey to use
743  * in locating the scan start position.
744  */
745 bool
747 {
748  Relation rel = scan->indexRelation;
749  BTScanOpaque so = (BTScanOpaque) scan->opaque;
750  Buffer buf;
751  BTStack stack;
752  OffsetNumber offnum;
753  StrategyNumber strat;
754  bool nextkey;
755  bool goback;
756  BTScanInsertData inskey;
757  ScanKey startKeys[INDEX_MAX_KEYS];
758  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
759  int keysCount = 0;
760  int i;
761  bool status = true;
762  StrategyNumber strat_total;
763  BTScanPosItem *currItem;
764  BlockNumber blkno;
765 
767 
769 
770  /*
771  * Examine the scan keys and eliminate any redundant keys; also mark the
772  * keys that must be matched to continue the scan.
773  */
774  _bt_preprocess_keys(scan);
775 
776  /*
777  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778  * never be satisfied (eg, x == 1 AND x > 2).
779  */
780  if (!so->qual_ok)
781  return false;
782 
783  /*
784  * For parallel scans, get the starting page from shared state. If the
785  * scan has not started, proceed to find out first leaf page in the usual
786  * way while keeping other participating processes waiting. If the scan
787  * has already begun, use the page number from the shared structure.
788  */
789  if (scan->parallel_scan != NULL)
790  {
791  status = _bt_parallel_seize(scan, &blkno);
792  if (!status)
793  return false;
794  else if (blkno == P_NONE)
795  {
796  _bt_parallel_done(scan);
797  return false;
798  }
799  else if (blkno != InvalidBlockNumber)
800  {
801  if (!_bt_parallel_readpage(scan, blkno, dir))
802  return false;
803  goto readcomplete;
804  }
805  }
806 
807  /*----------
808  * Examine the scan keys to discover where we need to start the scan.
809  *
810  * We want to identify the keys that can be used as starting boundaries;
811  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
812  * a backwards scan. We can use keys for multiple attributes so long as
813  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
814  * a > or < boundary or find an attribute with no boundary (which can be
815  * thought of as the same as "> -infinity"), we can't use keys for any
816  * attributes to its right, because it would break our simplistic notion
817  * of what initial positioning strategy to use.
818  *
819  * When the scan keys include cross-type operators, _bt_preprocess_keys
820  * may not be able to eliminate redundant keys; in such cases we will
821  * arbitrarily pick a usable one for each attribute. This is correct
822  * but possibly not optimal behavior. (For example, with keys like
823  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
824  * x=5 would be more efficient.) Since the situation only arises given
825  * a poorly-worded query plus an incomplete opfamily, live with it.
826  *
827  * When both equality and inequality keys appear for a single attribute
828  * (again, only possible when cross-type operators appear), we *must*
829  * select one of the equality keys for the starting point, because
830  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
831  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
832  * start at x=4, we will fail and stop before reaching x=10. If multiple
833  * equality quals survive preprocessing, however, it doesn't matter which
834  * one we use --- by definition, they are either redundant or
835  * contradictory.
836  *
837  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
838  * If the index stores nulls at the end of the index we'll be starting
839  * from, and we have no boundary key for the column (which means the key
840  * we deduced NOT NULL from is an inequality key that constrains the other
841  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
842  * use as a boundary key. If we didn't do this, we might find ourselves
843  * traversing a lot of null entries at the start of the scan.
844  *
845  * In this loop, row-comparison keys are treated the same as keys on their
846  * first (leftmost) columns. We'll add on lower-order columns of the row
847  * comparison below, if possible.
848  *
849  * The selected scan keys (at most one per index column) are remembered by
850  * storing their addresses into the local startKeys[] array.
851  *----------
852  */
853  strat_total = BTEqualStrategyNumber;
854  if (so->numberOfKeys > 0)
855  {
856  AttrNumber curattr;
857  ScanKey chosen;
858  ScanKey impliesNN;
859  ScanKey cur;
860 
861  /*
862  * chosen is the so-far-chosen key for the current attribute, if any.
863  * We don't cast the decision in stone until we reach keys for the
864  * next attribute.
865  */
866  curattr = 1;
867  chosen = NULL;
868  /* Also remember any scankey that implies a NOT NULL constraint */
869  impliesNN = NULL;
870 
871  /*
872  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
873  * pass to handle after-last-key processing. Actual exit from the
874  * loop is at one of the "break" statements below.
875  */
876  for (cur = so->keyData, i = 0;; cur++, i++)
877  {
878  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
879  {
880  /*
881  * Done looking at keys for curattr. If we didn't find a
882  * usable boundary key, see if we can deduce a NOT NULL key.
883  */
884  if (chosen == NULL && impliesNN != NULL &&
885  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
888  {
889  /* Yes, so build the key in notnullkeys[keysCount] */
890  chosen = &notnullkeys[keysCount];
891  ScanKeyEntryInitialize(chosen,
893  (impliesNN->sk_flags &
895  curattr,
896  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
899  InvalidOid,
900  InvalidOid,
901  InvalidOid,
902  (Datum) 0);
903  }
904 
905  /*
906  * If we still didn't find a usable boundary key, quit; else
907  * save the boundary key pointer in startKeys.
908  */
909  if (chosen == NULL)
910  break;
911  startKeys[keysCount++] = chosen;
912 
913  /*
914  * Adjust strat_total, and quit if we have stored a > or <
915  * key.
916  */
917  strat = chosen->sk_strategy;
918  if (strat != BTEqualStrategyNumber)
919  {
920  strat_total = strat;
921  if (strat == BTGreaterStrategyNumber ||
922  strat == BTLessStrategyNumber)
923  break;
924  }
925 
926  /*
927  * Done if that was the last attribute, or if next key is not
928  * in sequence (implying no boundary key is available for the
929  * next attribute).
930  */
931  if (i >= so->numberOfKeys ||
932  cur->sk_attno != curattr + 1)
933  break;
934 
935  /*
936  * Reset for next attr.
937  */
938  curattr = cur->sk_attno;
939  chosen = NULL;
940  impliesNN = NULL;
941  }
942 
943  /*
944  * Can we use this key as a starting boundary for this attr?
945  *
946  * If not, does it imply a NOT NULL constraint? (Because
947  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
948  * *any* inequality key works for that; we need not test.)
949  */
950  switch (cur->sk_strategy)
951  {
954  if (chosen == NULL)
955  {
956  if (ScanDirectionIsBackward(dir))
957  chosen = cur;
958  else
959  impliesNN = cur;
960  }
961  break;
963  /* override any non-equality choice */
964  chosen = cur;
965  break;
968  if (chosen == NULL)
969  {
970  if (ScanDirectionIsForward(dir))
971  chosen = cur;
972  else
973  impliesNN = cur;
974  }
975  break;
976  }
977  }
978  }
979 
980  /*
981  * If we found no usable boundary keys, we have to start from one end of
982  * the tree. Walk down that edge to the first or last key, and scan from
983  * there.
984  */
985  if (keysCount == 0)
986  {
987  bool match;
988 
989  match = _bt_endpoint(scan, dir);
990 
991  if (!match)
992  {
993  /* No match, so mark (parallel) scan finished */
994  _bt_parallel_done(scan);
995  }
996 
997  return match;
998  }
999 
1000  /*
1001  * We want to start the scan somewhere within the index. Set up an
1002  * insertion scankey we can use to search for the boundary point we
1003  * identified above. The insertion scankey is built using the keys
1004  * identified by startKeys[]. (Remaining insertion scankey fields are
1005  * initialized after initial-positioning strategy is finalized.)
1006  */
1007  Assert(keysCount <= INDEX_MAX_KEYS);
1008  for (i = 0; i < keysCount; i++)
1009  {
1010  ScanKey cur = startKeys[i];
1011 
1012  Assert(cur->sk_attno == i + 1);
1013 
1014  if (cur->sk_flags & SK_ROW_HEADER)
1015  {
1016  /*
1017  * Row comparison header: look to the first row member instead.
1018  *
1019  * The member scankeys are already in insertion format (ie, they
1020  * have sk_func = 3-way-comparison function), but we have to watch
1021  * out for nulls, which _bt_preprocess_keys didn't check. A null
1022  * in the first row member makes the condition unmatchable, just
1023  * like qual_ok = false.
1024  */
1025  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1026 
1027  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1028  if (subkey->sk_flags & SK_ISNULL)
1029  {
1030  _bt_parallel_done(scan);
1031  return false;
1032  }
1033  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1034 
1035  /*
1036  * If the row comparison is the last positioning key we accepted,
1037  * try to add additional keys from the lower-order row members.
1038  * (If we accepted independent conditions on additional index
1039  * columns, we use those instead --- doesn't seem worth trying to
1040  * determine which is more restrictive.) Note that this is OK
1041  * even if the row comparison is of ">" or "<" type, because the
1042  * condition applied to all but the last row member is effectively
1043  * ">=" or "<=", and so the extra keys don't break the positioning
1044  * scheme. But, by the same token, if we aren't able to use all
1045  * the row members, then the part of the row comparison that we
1046  * did use has to be treated as just a ">=" or "<=" condition, and
1047  * so we'd better adjust strat_total accordingly.
1048  */
1049  if (i == keysCount - 1)
1050  {
1051  bool used_all_subkeys = false;
1052 
1053  Assert(!(subkey->sk_flags & SK_ROW_END));
1054  for (;;)
1055  {
1056  subkey++;
1057  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1058  if (subkey->sk_attno != keysCount + 1)
1059  break; /* out-of-sequence, can't use it */
1060  if (subkey->sk_strategy != cur->sk_strategy)
1061  break; /* wrong direction, can't use it */
1062  if (subkey->sk_flags & SK_ISNULL)
1063  break; /* can't use null keys */
1064  Assert(keysCount < INDEX_MAX_KEYS);
1065  memcpy(inskey.scankeys + keysCount, subkey,
1066  sizeof(ScanKeyData));
1067  keysCount++;
1068  if (subkey->sk_flags & SK_ROW_END)
1069  {
1070  used_all_subkeys = true;
1071  break;
1072  }
1073  }
1074  if (!used_all_subkeys)
1075  {
1076  switch (strat_total)
1077  {
1078  case BTLessStrategyNumber:
1079  strat_total = BTLessEqualStrategyNumber;
1080  break;
1082  strat_total = BTGreaterEqualStrategyNumber;
1083  break;
1084  }
1085  }
1086  break; /* done with outer loop */
1087  }
1088  }
1089  else
1090  {
1091  /*
1092  * Ordinary comparison key. Transform the search-style scan key
1093  * to an insertion scan key by replacing the sk_func with the
1094  * appropriate btree comparison function.
1095  *
1096  * If scankey operator is not a cross-type comparison, we can use
1097  * the cached comparison function; otherwise gotta look it up in
1098  * the catalogs. (That can't lead to infinite recursion, since no
1099  * indexscan initiated by syscache lookup will use cross-data-type
1100  * operators.)
1101  *
1102  * We support the convention that sk_subtype == InvalidOid means
1103  * the opclass input type; this is a hack to simplify life for
1104  * ScanKeyInit().
1105  */
1106  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1107  cur->sk_subtype == InvalidOid)
1108  {
1109  FmgrInfo *procinfo;
1110 
1111  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1112  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1113  cur->sk_flags,
1114  cur->sk_attno,
1116  cur->sk_subtype,
1117  cur->sk_collation,
1118  procinfo,
1119  cur->sk_argument);
1120  }
1121  else
1122  {
1123  RegProcedure cmp_proc;
1124 
1125  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1126  rel->rd_opcintype[i],
1127  cur->sk_subtype,
1128  BTORDER_PROC);
1129  if (!RegProcedureIsValid(cmp_proc))
1130  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1131  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1132  cur->sk_attno, RelationGetRelationName(rel));
1133  ScanKeyEntryInitialize(inskey.scankeys + i,
1134  cur->sk_flags,
1135  cur->sk_attno,
1137  cur->sk_subtype,
1138  cur->sk_collation,
1139  cmp_proc,
1140  cur->sk_argument);
1141  }
1142  }
1143  }
1144 
1145  /*----------
1146  * Examine the selected initial-positioning strategy to determine exactly
1147  * where we need to start the scan, and set flag variables to control the
1148  * code below.
1149  *
1150  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1151  * item >= scan key. If nextkey = true, they will locate the first
1152  * item > scan key.
1153  *
1154  * If goback = true, we will then step back one item, while if
1155  * goback = false, we will start the scan on the located item.
1156  *----------
1157  */
1158  switch (strat_total)
1159  {
1160  case BTLessStrategyNumber:
1161 
1162  /*
1163  * Find first item >= scankey, then back up one to arrive at last
1164  * item < scankey. (Note: this positioning strategy is only used
1165  * for a backward scan, so that is always the correct starting
1166  * position.)
1167  */
1168  nextkey = false;
1169  goback = true;
1170  break;
1171 
1173 
1174  /*
1175  * Find first item > scankey, then back up one to arrive at last
1176  * item <= scankey. (Note: this positioning strategy is only used
1177  * for a backward scan, so that is always the correct starting
1178  * position.)
1179  */
1180  nextkey = true;
1181  goback = true;
1182  break;
1183 
1184  case BTEqualStrategyNumber:
1185 
1186  /*
1187  * If a backward scan was specified, need to start with last equal
1188  * item not first one.
1189  */
1190  if (ScanDirectionIsBackward(dir))
1191  {
1192  /*
1193  * This is the same as the <= strategy. We will check at the
1194  * end whether the found item is actually =.
1195  */
1196  nextkey = true;
1197  goback = true;
1198  }
1199  else
1200  {
1201  /*
1202  * This is the same as the >= strategy. We will check at the
1203  * end whether the found item is actually =.
1204  */
1205  nextkey = false;
1206  goback = false;
1207  }
1208  break;
1209 
1211 
1212  /*
1213  * Find first item >= scankey. (This is only used for forward
1214  * scans.)
1215  */
1216  nextkey = false;
1217  goback = false;
1218  break;
1219 
1221 
1222  /*
1223  * Find first item > scankey. (This is only used for forward
1224  * scans.)
1225  */
1226  nextkey = true;
1227  goback = false;
1228  break;
1229 
1230  default:
1231  /* can't get here, but keep compiler quiet */
1232  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1233  return false;
1234  }
1235 
1236  /* Initialize remaining insertion scan key fields */
1237  inskey.heapkeyspace = _bt_heapkeyspace(rel);
1238  inskey.anynullkeys = false; /* unused */
1239  inskey.nextkey = nextkey;
1240  inskey.pivotsearch = false;
1241  inskey.scantid = NULL;
1242  inskey.keysz = keysCount;
1243 
1244  /*
1245  * Use the manufactured insertion scan key to descend the tree and
1246  * position ourselves on the target leaf page.
1247  */
1248  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1249 
1250  /* don't need to keep the stack around... */
1251  _bt_freestack(stack);
1252 
1253  if (!BufferIsValid(buf))
1254  {
1255  /*
1256  * We only get here if the index is completely empty. Lock relation
1257  * because nothing finer to lock exists.
1258  */
1259  PredicateLockRelation(rel, scan->xs_snapshot);
1260 
1261  /*
1262  * mark parallel scan as done, so that all the workers can finish
1263  * their scan
1264  */
1265  _bt_parallel_done(scan);
1267 
1268  return false;
1269  }
1270  else
1272  scan->xs_snapshot);
1273 
1274  _bt_initialize_more_data(so, dir);
1275 
1276  /* position to the precise item on the page */
1277  offnum = _bt_binsrch(rel, &inskey, buf);
1278 
1279  /*
1280  * If nextkey = false, we are positioned at the first item >= scan key, or
1281  * possibly at the end of a page on which all the existing items are less
1282  * than the scan key and we know that everything on later pages is greater
1283  * than or equal to scan key.
1284  *
1285  * If nextkey = true, we are positioned at the first item > scan key, or
1286  * possibly at the end of a page on which all the existing items are less
1287  * than or equal to the scan key and we know that everything on later
1288  * pages is greater than scan key.
1289  *
1290  * The actually desired starting point is either this item or the prior
1291  * one, or in the end-of-page case it's the first item on the next page or
1292  * the last item on this page. Adjust the starting offset if needed. (If
1293  * this results in an offset before the first item or after the last one,
1294  * _bt_readpage will report no items found, and then we'll step to the
1295  * next page as needed.)
1296  */
1297  if (goback)
1298  offnum = OffsetNumberPrev(offnum);
1299 
1300  /* remember which buffer we have pinned, if any */
1302  so->currPos.buf = buf;
1303 
1304  /*
1305  * Now load data from the first page of the scan.
1306  */
1307  if (!_bt_readpage(scan, dir, offnum))
1308  {
1309  /*
1310  * There's no actually-matching data on this page. Try to advance to
1311  * the next page. Return false if there's no matching data at all.
1312  */
1314  if (!_bt_steppage(scan, dir))
1315  return false;
1316  }
1317  else
1318  {
1319  /* Drop the lock, and maybe the pin, on the current page */
1321  }
1322 
1323 readcomplete:
1324  /* OK, itemIndex says what to return */
1325  currItem = &so->currPos.items[so->currPos.itemIndex];
1326  scan->xs_heaptid = currItem->heapTid;
1327  if (scan->xs_want_itup)
1328  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1329 
1330  return true;
1331 }
1332 
1333 /*
1334  * _bt_next() -- Get the next item in a scan.
1335  *
1336  * On entry, so->currPos describes the current page, which may be pinned
1337  * but is not locked, and so->currPos.itemIndex identifies which item was
1338  * previously returned.
1339  *
1340  * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1341  * next heap tuple, and if requested, scan->xs_itup points to a copy of
1342  * the index tuple. so->currPos is updated as needed.
1343  *
1344  * On failure exit (no more tuples), we release pin and set
1345  * so->currPos.buf to InvalidBuffer.
1346  */
1347 bool
1349 {
1350  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1351  BTScanPosItem *currItem;
1352 
1353  /*
1354  * Advance to next tuple on current page; or if there's no more, try to
1355  * step to the next page with data.
1356  */
1357  if (ScanDirectionIsForward(dir))
1358  {
1359  if (++so->currPos.itemIndex > so->currPos.lastItem)
1360  {
1361  if (!_bt_steppage(scan, dir))
1362  return false;
1363  }
1364  }
1365  else
1366  {
1367  if (--so->currPos.itemIndex < so->currPos.firstItem)
1368  {
1369  if (!_bt_steppage(scan, dir))
1370  return false;
1371  }
1372  }
1373 
1374  /* OK, itemIndex says what to return */
1375  currItem = &so->currPos.items[so->currPos.itemIndex];
1376  scan->xs_heaptid = currItem->heapTid;
1377  if (scan->xs_want_itup)
1378  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1379 
1380  return true;
1381 }
1382 
1383 /*
1384  * _bt_readpage() -- Load data from current index page into so->currPos
1385  *
1386  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1387  * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1388  * they are updated as appropriate. All other fields of so->currPos are
1389  * initialized from scratch here.
1390  *
1391  * We scan the current page starting at offnum and moving in the indicated
1392  * direction. All items matching the scan keys are loaded into currPos.items.
1393  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1394  * that there can be no more matching tuples in the current scan direction.
1395  *
1396  * In the case of a parallel scan, caller must have called _bt_parallel_seize
1397  * prior to calling this function; this function will invoke
1398  * _bt_parallel_release before returning.
1399  *
1400  * Returns true if any matching items found on the page, false if none.
1401  */
1402 static bool
1404 {
1405  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1406  Page page;
1407  BTPageOpaque opaque;
1408  OffsetNumber minoff;
1409  OffsetNumber maxoff;
1410  int itemIndex;
1411  bool continuescan;
1412  int indnatts;
1413 
1414  /*
1415  * We must have the buffer pinned and locked, but the usual macro can't be
1416  * used here; this function is what makes it good for currPos.
1417  */
1419 
1420  page = BufferGetPage(so->currPos.buf);
1421  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1422 
1423  /* allow next page be processed by parallel worker */
1424  if (scan->parallel_scan)
1425  {
1426  if (ScanDirectionIsForward(dir))
1427  _bt_parallel_release(scan, opaque->btpo_next);
1428  else
1430  }
1431 
1432  continuescan = true; /* default assumption */
1434  minoff = P_FIRSTDATAKEY(opaque);
1435  maxoff = PageGetMaxOffsetNumber(page);
1436 
1437  /*
1438  * We note the buffer's block number so that we can release the pin later.
1439  * This allows us to re-read the buffer if it is needed again for hinting.
1440  */
1442 
1443  /*
1444  * We save the LSN of the page as we read it, so that we know whether it
1445  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1446  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1447  */
1449 
1450  /*
1451  * we must save the page's right-link while scanning it; this tells us
1452  * where to step right to after we're done with these items. There is no
1453  * corresponding need for the left-link, since splits always go right.
1454  */
1455  so->currPos.nextPage = opaque->btpo_next;
1456 
1457  /* initialize tuple workspace to empty */
1458  so->currPos.nextTupleOffset = 0;
1459 
1460  /*
1461  * Now that the current page has been made consistent, the macro should be
1462  * good.
1463  */
1465 
1466  if (ScanDirectionIsForward(dir))
1467  {
1468  /* load items[] in ascending order */
1469  itemIndex = 0;
1470 
1471  offnum = Max(offnum, minoff);
1472 
1473  while (offnum <= maxoff)
1474  {
1475  ItemId iid = PageGetItemId(page, offnum);
1476  IndexTuple itup;
1477 
1478  /*
1479  * If the scan specifies not to return killed tuples, then we
1480  * treat a killed tuple as not passing the qual
1481  */
1482  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1483  {
1484  offnum = OffsetNumberNext(offnum);
1485  continue;
1486  }
1487 
1488  itup = (IndexTuple) PageGetItem(page, iid);
1489 
1490  if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1491  {
1492  /* tuple passes all scan key conditions, so remember it */
1493  _bt_saveitem(so, itemIndex, offnum, itup);
1494  itemIndex++;
1495  }
1496  /* When !continuescan, there can't be any more matches, so stop */
1497  if (!continuescan)
1498  break;
1499 
1500  offnum = OffsetNumberNext(offnum);
1501  }
1502 
1503  /*
1504  * We don't need to visit page to the right when the high key
1505  * indicates that no more matches will be found there.
1506  *
1507  * Checking the high key like this works out more often than you might
1508  * think. Leaf page splits pick a split point between the two most
1509  * dissimilar tuples (this is weighed against the need to evenly share
1510  * free space). Leaf pages with high key attribute values that can
1511  * only appear on non-pivot tuples on the right sibling page are
1512  * common.
1513  */
1514  if (continuescan && !P_RIGHTMOST(opaque))
1515  {
1516  ItemId iid = PageGetItemId(page, P_HIKEY);
1517  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1518  int truncatt;
1519 
1520  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1521  _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1522  }
1523 
1524  if (!continuescan)
1525  so->currPos.moreRight = false;
1526 
1527  Assert(itemIndex <= MaxIndexTuplesPerPage);
1528  so->currPos.firstItem = 0;
1529  so->currPos.lastItem = itemIndex - 1;
1530  so->currPos.itemIndex = 0;
1531  }
1532  else
1533  {
1534  /* load items[] in descending order */
1535  itemIndex = MaxIndexTuplesPerPage;
1536 
1537  offnum = Min(offnum, maxoff);
1538 
1539  while (offnum >= minoff)
1540  {
1541  ItemId iid = PageGetItemId(page, offnum);
1542  IndexTuple itup;
1543  bool tuple_alive;
1544  bool passes_quals;
1545 
1546  /*
1547  * If the scan specifies not to return killed tuples, then we
1548  * treat a killed tuple as not passing the qual. Most of the
1549  * time, it's a win to not bother examining the tuple's index
1550  * keys, but just skip to the next tuple (previous, actually,
1551  * since we're scanning backwards). However, if this is the first
1552  * tuple on the page, we do check the index keys, to prevent
1553  * uselessly advancing to the page to the left. This is similar
1554  * to the high key optimization used by forward scans.
1555  */
1556  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1557  {
1558  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1559  if (offnum > P_FIRSTDATAKEY(opaque))
1560  {
1561  offnum = OffsetNumberPrev(offnum);
1562  continue;
1563  }
1564 
1565  tuple_alive = false;
1566  }
1567  else
1568  tuple_alive = true;
1569 
1570  itup = (IndexTuple) PageGetItem(page, iid);
1571 
1572  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1573  &continuescan);
1574  if (passes_quals && tuple_alive)
1575  {
1576  /* tuple passes all scan key conditions, so remember it */
1577  itemIndex--;
1578  _bt_saveitem(so, itemIndex, offnum, itup);
1579  }
1580  if (!continuescan)
1581  {
1582  /* there can't be any more matches, so stop */
1583  so->currPos.moreLeft = false;
1584  break;
1585  }
1586 
1587  offnum = OffsetNumberPrev(offnum);
1588  }
1589 
1590  Assert(itemIndex >= 0);
1591  so->currPos.firstItem = itemIndex;
1594  }
1595 
1596  return (so->currPos.firstItem <= so->currPos.lastItem);
1597 }
1598 
1599 /* Save an index item into so->currPos.items[itemIndex] */
1600 static void
1601 _bt_saveitem(BTScanOpaque so, int itemIndex,
1602  OffsetNumber offnum, IndexTuple itup)
1603 {
1604  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1605 
1606  currItem->heapTid = itup->t_tid;
1607  currItem->indexOffset = offnum;
1608  if (so->currTuples)
1609  {
1610  Size itupsz = IndexTupleSize(itup);
1611 
1612  currItem->tupleOffset = so->currPos.nextTupleOffset;
1613  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1614  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1615  }
1616 }
1617 
1618 /*
1619  * _bt_steppage() -- Step to next page containing valid data for scan
1620  *
1621  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1622  * if pinned, we'll drop the pin before moving to next page. The buffer is
1623  * not locked on entry.
1624  *
1625  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1626  * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1627  * to InvalidBuffer. We return true to indicate success.
1628  */
1629 static bool
1631 {
1632  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1634  bool status = true;
1635 
1637 
1638  /* Before leaving current page, deal with any killed items */
1639  if (so->numKilled > 0)
1640  _bt_killitems(scan);
1641 
1642  /*
1643  * Before we modify currPos, make a copy of the page data if there was a
1644  * mark position that needs it.
1645  */
1646  if (so->markItemIndex >= 0)
1647  {
1648  /* bump pin on current buffer for assignment to mark buffer */
1649  if (BTScanPosIsPinned(so->currPos))
1651  memcpy(&so->markPos, &so->currPos,
1652  offsetof(BTScanPosData, items[1]) +
1653  so->currPos.lastItem * sizeof(BTScanPosItem));
1654  if (so->markTuples)
1655  memcpy(so->markTuples, so->currTuples,
1656  so->currPos.nextTupleOffset);
1657  so->markPos.itemIndex = so->markItemIndex;
1658  so->markItemIndex = -1;
1659  }
1660 
1661  if (ScanDirectionIsForward(dir))
1662  {
1663  /* Walk right to the next page with data */
1664  if (scan->parallel_scan != NULL)
1665  {
1666  /*
1667  * Seize the scan to get the next block number; if the scan has
1668  * ended already, bail out.
1669  */
1670  status = _bt_parallel_seize(scan, &blkno);
1671  if (!status)
1672  {
1673  /* release the previous buffer, if pinned */
1676  return false;
1677  }
1678  }
1679  else
1680  {
1681  /* Not parallel, so use the previously-saved nextPage link. */
1682  blkno = so->currPos.nextPage;
1683  }
1684 
1685  /* Remember we left a page with data */
1686  so->currPos.moreLeft = true;
1687 
1688  /* release the previous buffer, if pinned */
1690  }
1691  else
1692  {
1693  /* Remember we left a page with data */
1694  so->currPos.moreRight = true;
1695 
1696  if (scan->parallel_scan != NULL)
1697  {
1698  /*
1699  * Seize the scan to get the current block number; if the scan has
1700  * ended already, bail out.
1701  */
1702  status = _bt_parallel_seize(scan, &blkno);
1704  if (!status)
1705  {
1707  return false;
1708  }
1709  }
1710  else
1711  {
1712  /* Not parallel, so just use our own notion of the current page */
1713  blkno = so->currPos.currPage;
1714  }
1715  }
1716 
1717  if (!_bt_readnextpage(scan, blkno, dir))
1718  return false;
1719 
1720  /* Drop the lock, and maybe the pin, on the current page */
1722 
1723  return true;
1724 }
1725 
1726 /*
1727  * _bt_readnextpage() -- Read next page containing valid data for scan
1728  *
1729  * On success exit, so->currPos is updated to contain data from the next
1730  * interesting page. Caller is responsible to release lock and pin on
1731  * buffer on success. We return true to indicate success.
1732  *
1733  * If there are no more matching records in the given direction, we drop all
1734  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1735  */
1736 static bool
1738 {
1739  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1740  Relation rel;
1741  Page page;
1742  BTPageOpaque opaque;
1743  bool status = true;
1744 
1745  rel = scan->indexRelation;
1746 
1747  if (ScanDirectionIsForward(dir))
1748  {
1749  for (;;)
1750  {
1751  /*
1752  * if we're at end of scan, give up and mark parallel scan as
1753  * done, so that all the workers can finish their scan
1754  */
1755  if (blkno == P_NONE || !so->currPos.moreRight)
1756  {
1757  _bt_parallel_done(scan);
1759  return false;
1760  }
1761  /* check for interrupts while we're not holding any buffer lock */
1763  /* step right one page */
1764  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1765  page = BufferGetPage(so->currPos.buf);
1766  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1767  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1768  /* check for deleted page */
1769  if (!P_IGNORE(opaque))
1770  {
1771  PredicateLockPage(rel, blkno, scan->xs_snapshot);
1772  /* see if there are any matches on this page */
1773  /* note that this will clear moreRight if we can stop */
1774  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1775  break;
1776  }
1777  else if (scan->parallel_scan != NULL)
1778  {
1779  /* allow next page be processed by parallel worker */
1780  _bt_parallel_release(scan, opaque->btpo_next);
1781  }
1782 
1783  /* nope, keep going */
1784  if (scan->parallel_scan != NULL)
1785  {
1786  _bt_relbuf(rel, so->currPos.buf);
1787  status = _bt_parallel_seize(scan, &blkno);
1788  if (!status)
1789  {
1791  return false;
1792  }
1793  }
1794  else
1795  {
1796  blkno = opaque->btpo_next;
1797  _bt_relbuf(rel, so->currPos.buf);
1798  }
1799  }
1800  }
1801  else
1802  {
1803  /*
1804  * Should only happen in parallel cases, when some other backend
1805  * advanced the scan.
1806  */
1807  if (so->currPos.currPage != blkno)
1808  {
1810  so->currPos.currPage = blkno;
1811  }
1812 
1813  /*
1814  * Walk left to the next page with data. This is much more complex
1815  * than the walk-right case because of the possibility that the page
1816  * to our left splits while we are in flight to it, plus the
1817  * possibility that the page we were on gets deleted after we leave
1818  * it. See nbtree/README for details.
1819  *
1820  * It might be possible to rearrange this code to have less overhead
1821  * in pinning and locking, but that would require capturing the left
1822  * pointer when the page is initially read, and using it here, along
1823  * with big changes to _bt_walk_left() and the code below. It is not
1824  * clear whether this would be a win, since if the page immediately to
1825  * the left splits after we read this page and before we step left, we
1826  * would need to visit more pages than with the current code.
1827  *
1828  * Note that if we change the code so that we drop the pin for a scan
1829  * which uses a non-MVCC snapshot, we will need to modify the code for
1830  * walking left, to allow for the possibility that a referenced page
1831  * has been deleted. As long as the buffer is pinned or the snapshot
1832  * is MVCC the page cannot move past the half-dead state to fully
1833  * deleted.
1834  */
1835  if (BTScanPosIsPinned(so->currPos))
1836  LockBuffer(so->currPos.buf, BT_READ);
1837  else
1838  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1839 
1840  for (;;)
1841  {
1842  /* Done if we know there are no matching keys to the left */
1843  if (!so->currPos.moreLeft)
1844  {
1845  _bt_relbuf(rel, so->currPos.buf);
1846  _bt_parallel_done(scan);
1848  return false;
1849  }
1850 
1851  /* Step to next physical page */
1852  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1853  scan->xs_snapshot);
1854 
1855  /* if we're physically at end of index, return failure */
1856  if (so->currPos.buf == InvalidBuffer)
1857  {
1858  _bt_parallel_done(scan);
1860  return false;
1861  }
1862 
1863  /*
1864  * Okay, we managed to move left to a non-deleted page. Done if
1865  * it's not half-dead and contains matching tuples. Else loop back
1866  * and do it all again.
1867  */
1868  page = BufferGetPage(so->currPos.buf);
1869  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1870  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1871  if (!P_IGNORE(opaque))
1872  {
1874  /* see if there are any matches on this page */
1875  /* note that this will clear moreLeft if we can stop */
1876  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1877  break;
1878  }
1879  else if (scan->parallel_scan != NULL)
1880  {
1881  /* allow next page be processed by parallel worker */
1883  }
1884 
1885  /*
1886  * For parallel scans, get the last page scanned as it is quite
1887  * possible that by the time we try to seize the scan, some other
1888  * worker has already advanced the scan to a different page. We
1889  * must continue based on the latest page scanned by any worker.
1890  */
1891  if (scan->parallel_scan != NULL)
1892  {
1893  _bt_relbuf(rel, so->currPos.buf);
1894  status = _bt_parallel_seize(scan, &blkno);
1895  if (!status)
1896  {
1898  return false;
1899  }
1900  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1901  }
1902  }
1903  }
1904 
1905  return true;
1906 }
1907 
1908 /*
1909  * _bt_parallel_readpage() -- Read current page containing valid data for scan
1910  *
1911  * On success, release lock and maybe pin on buffer. We return true to
1912  * indicate success.
1913  */
1914 static bool
1916 {
1917  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1918 
1919  _bt_initialize_more_data(so, dir);
1920 
1921  if (!_bt_readnextpage(scan, blkno, dir))
1922  return false;
1923 
1924  /* Drop the lock, and maybe the pin, on the current page */
1926 
1927  return true;
1928 }
1929 
1930 /*
1931  * _bt_walk_left() -- step left one page, if possible
1932  *
1933  * The given buffer must be pinned and read-locked. This will be dropped
1934  * before stepping left. On return, we have pin and read lock on the
1935  * returned page, instead.
1936  *
1937  * Returns InvalidBuffer if there is no page to the left (no lock is held
1938  * in that case).
1939  *
1940  * When working on a non-leaf level, it is possible for the returned page
1941  * to be half-dead; the caller should check that condition and step left
1942  * again if it's important.
1943  */
1944 static Buffer
1946 {
1947  Page page;
1948  BTPageOpaque opaque;
1949 
1950  page = BufferGetPage(buf);
1951  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1952 
1953  for (;;)
1954  {
1955  BlockNumber obknum;
1956  BlockNumber lblkno;
1957  BlockNumber blkno;
1958  int tries;
1959 
1960  /* if we're at end of tree, release buf and return failure */
1961  if (P_LEFTMOST(opaque))
1962  {
1963  _bt_relbuf(rel, buf);
1964  break;
1965  }
1966  /* remember original page we are stepping left from */
1967  obknum = BufferGetBlockNumber(buf);
1968  /* step left */
1969  blkno = lblkno = opaque->btpo_prev;
1970  _bt_relbuf(rel, buf);
1971  /* check for interrupts while we're not holding any buffer lock */
1973  buf = _bt_getbuf(rel, blkno, BT_READ);
1974  page = BufferGetPage(buf);
1975  TestForOldSnapshot(snapshot, rel, page);
1976  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1977 
1978  /*
1979  * If this isn't the page we want, walk right till we find what we
1980  * want --- but go no more than four hops (an arbitrary limit). If we
1981  * don't find the correct page by then, the most likely bet is that
1982  * the original page got deleted and isn't in the sibling chain at all
1983  * anymore, not that its left sibling got split more than four times.
1984  *
1985  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1986  * because half-dead pages are still in the sibling chain. Caller
1987  * must reject half-dead pages if wanted.
1988  */
1989  tries = 0;
1990  for (;;)
1991  {
1992  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1993  {
1994  /* Found desired page, return it */
1995  return buf;
1996  }
1997  if (P_RIGHTMOST(opaque) || ++tries > 4)
1998  break;
1999  blkno = opaque->btpo_next;
2000  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2001  page = BufferGetPage(buf);
2002  TestForOldSnapshot(snapshot, rel, page);
2003  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2004  }
2005 
2006  /* Return to the original page to see what's up */
2007  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2008  page = BufferGetPage(buf);
2009  TestForOldSnapshot(snapshot, rel, page);
2010  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2011  if (P_ISDELETED(opaque))
2012  {
2013  /*
2014  * It was deleted. Move right to first nondeleted page (there
2015  * must be one); that is the page that has acquired the deleted
2016  * one's keyspace, so stepping left from it will take us where we
2017  * want to be.
2018  */
2019  for (;;)
2020  {
2021  if (P_RIGHTMOST(opaque))
2022  elog(ERROR, "fell off the end of index \"%s\"",
2024  blkno = opaque->btpo_next;
2025  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2026  page = BufferGetPage(buf);
2027  TestForOldSnapshot(snapshot, rel, page);
2028  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2029  if (!P_ISDELETED(opaque))
2030  break;
2031  }
2032 
2033  /*
2034  * Now return to top of loop, resetting obknum to point to this
2035  * nondeleted page, and try again.
2036  */
2037  }
2038  else
2039  {
2040  /*
2041  * It wasn't deleted; the explanation had better be that the page
2042  * to the left got split or deleted. Without this check, we'd go
2043  * into an infinite loop if there's anything wrong.
2044  */
2045  if (opaque->btpo_prev == lblkno)
2046  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2047  obknum, RelationGetRelationName(rel));
2048  /* Okay to try again with new lblkno value */
2049  }
2050  }
2051 
2052  return InvalidBuffer;
2053 }
2054 
2055 /*
2056  * _bt_get_endpoint() -- Find the first or last page on a given tree level
2057  *
2058  * If the index is empty, we will return InvalidBuffer; any other failure
2059  * condition causes ereport(). We will not return a dead page.
2060  *
2061  * The returned buffer is pinned and read-locked.
2062  */
2063 Buffer
2064 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
2065  Snapshot snapshot)
2066 {
2067  Buffer buf;
2068  Page page;
2069  BTPageOpaque opaque;
2070  OffsetNumber offnum;
2071  BlockNumber blkno;
2072  IndexTuple itup;
2073 
2074  /*
2075  * If we are looking for a leaf page, okay to descend from fast root;
2076  * otherwise better descend from true root. (There is no point in being
2077  * smarter about intermediate levels.)
2078  */
2079  if (level == 0)
2080  buf = _bt_getroot(rel, BT_READ);
2081  else
2082  buf = _bt_gettrueroot(rel);
2083 
2084  if (!BufferIsValid(buf))
2085  return InvalidBuffer;
2086 
2087  page = BufferGetPage(buf);
2088  TestForOldSnapshot(snapshot, rel, page);
2089  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2090 
2091  for (;;)
2092  {
2093  /*
2094  * If we landed on a deleted page, step right to find a live page
2095  * (there must be one). Also, if we want the rightmost page, step
2096  * right if needed to get to it (this could happen if the page split
2097  * since we obtained a pointer to it).
2098  */
2099  while (P_IGNORE(opaque) ||
2100  (rightmost && !P_RIGHTMOST(opaque)))
2101  {
2102  blkno = opaque->btpo_next;
2103  if (blkno == P_NONE)
2104  elog(ERROR, "fell off the end of index \"%s\"",
2106  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2107  page = BufferGetPage(buf);
2108  TestForOldSnapshot(snapshot, rel, page);
2109  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2110  }
2111 
2112  /* Done? */
2113  if (opaque->btpo.level == level)
2114  break;
2115  if (opaque->btpo.level < level)
2116  elog(ERROR, "btree level %u not found in index \"%s\"",
2117  level, RelationGetRelationName(rel));
2118 
2119  /* Descend to leftmost or rightmost child page */
2120  if (rightmost)
2121  offnum = PageGetMaxOffsetNumber(page);
2122  else
2123  offnum = P_FIRSTDATAKEY(opaque);
2124 
2125  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2126  blkno = BTreeInnerTupleGetDownLink(itup);
2127 
2128  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2129  page = BufferGetPage(buf);
2130  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2131  }
2132 
2133  return buf;
2134 }
2135 
2136 /*
2137  * _bt_endpoint() -- Find the first or last page in the index, and scan
2138  * from there to the first key satisfying all the quals.
2139  *
2140  * This is used by _bt_first() to set up a scan when we've determined
2141  * that the scan must start at the beginning or end of the index (for
2142  * a forward or backward scan respectively). Exit conditions are the
2143  * same as for _bt_first().
2144  */
2145 static bool
2147 {
2148  Relation rel = scan->indexRelation;
2149  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2150  Buffer buf;
2151  Page page;
2152  BTPageOpaque opaque;
2153  OffsetNumber start;
2154  BTScanPosItem *currItem;
2155 
2156  /*
2157  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2158  * version of _bt_search(). We don't maintain a stack since we know we
2159  * won't need it.
2160  */
2162 
2163  if (!BufferIsValid(buf))
2164  {
2165  /*
2166  * Empty index. Lock the whole relation, as nothing finer to lock
2167  * exists.
2168  */
2169  PredicateLockRelation(rel, scan->xs_snapshot);
2171  return false;
2172  }
2173 
2175  page = BufferGetPage(buf);
2176  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2177  Assert(P_ISLEAF(opaque));
2178 
2179  if (ScanDirectionIsForward(dir))
2180  {
2181  /* There could be dead pages to the left, so not this: */
2182  /* Assert(P_LEFTMOST(opaque)); */
2183 
2184  start = P_FIRSTDATAKEY(opaque);
2185  }
2186  else if (ScanDirectionIsBackward(dir))
2187  {
2188  Assert(P_RIGHTMOST(opaque));
2189 
2190  start = PageGetMaxOffsetNumber(page);
2191  }
2192  else
2193  {
2194  elog(ERROR, "invalid scan direction: %d", (int) dir);
2195  start = 0; /* keep compiler quiet */
2196  }
2197 
2198  /* remember which buffer we have pinned */
2199  so->currPos.buf = buf;
2200 
2201  _bt_initialize_more_data(so, dir);
2202 
2203  /*
2204  * Now load data from the first page of the scan.
2205  */
2206  if (!_bt_readpage(scan, dir, start))
2207  {
2208  /*
2209  * There's no actually-matching data on this page. Try to advance to
2210  * the next page. Return false if there's no matching data at all.
2211  */
2213  if (!_bt_steppage(scan, dir))
2214  return false;
2215  }
2216  else
2217  {
2218  /* Drop the lock, and maybe the pin, on the current page */
2220  }
2221 
2222  /* OK, itemIndex says what to return */
2223  currItem = &so->currPos.items[so->currPos.itemIndex];
2224  scan->xs_heaptid = currItem->heapTid;
2225  if (scan->xs_want_itup)
2226  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2227 
2228  return true;
2229 }
2230 
2231 /*
2232  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2233  * for scan direction
2234  */
2235 static inline void
2237 {
2238  /* initialize moreLeft/moreRight appropriately for scan direction */
2239  if (ScanDirectionIsForward(dir))
2240  {
2241  so->currPos.moreLeft = false;
2242  so->currPos.moreRight = true;
2243  }
2244  else
2245  {
2246  so->currPos.moreLeft = true;
2247  so->currPos.moreRight = false;
2248  }
2249  so->numKilled = 0; /* just paranoia */
2250  so->markItemIndex = -1; /* ditto */
2251 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
BlockNumber btpo_next
Definition: nbtree.h:58
#define BTORDER_PROC
Definition: nbtree.h:392
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:794
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
bool moreRight
Definition: nbtree.h:563
bool bounds_valid
Definition: nbtree.h:508
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:127
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:582
#define DatumGetInt32(X)
Definition: postgres.h:472
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:932
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:722
#define RelationGetDescr(relation)
Definition: rel.h:440
ItemPointer scantid
Definition: nbtree.h:476
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1915
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:580
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:654
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
regproc RegProcedure
Definition: c.h:505
#define Min(x, y)
Definition: c.h:890
union BTPageOpaqueData::@46 btpo
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:346
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
BlockNumber currPage
Definition: nbtree.h:553
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
OffsetNumber stricthigh
Definition: nbtree.h:510
BTStackData * BTStack
Definition: nbtree.h:423
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:604
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
OffsetNumber indexOffset
Definition: nbtree.h:544
uint32 BlockNumber
Definition: block.h:31
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3353
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
OffsetNumber low
Definition: nbtree.h:509
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
bool moreLeft
Definition: nbtree.h:562
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:2064
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:746
bool ignore_killed_tuples
Definition: relscan.h:114
signed int int32
Definition: c.h:346
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:346
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
int nextTupleOffset
Definition: nbtree.h:569
#define BT_READ
Definition: nbtree.h:402
int lastItem
Definition: nbtree.h:579
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1718
BTScanPosData markPos
Definition: nbtree.h:668
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1854
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:671
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:132
BlockNumber btpo_prev
Definition: nbtree.h:57
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:2876
OffsetNumber bts_offset
Definition: nbtree.h:418
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2146
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:587
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:445
#define RegProcedureIsValid(p)
Definition: c.h:640
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1601
ScanDirection
Definition: sdir.h:22
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:426
char * markTuples
Definition: nbtree.h:655
#define RelationGetRelationName(relation)
Definition: rel.h:448
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1374
unsigned int uint32
Definition: c.h:358
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
Oid * rd_opfamily
Definition: rel.h:166
int firstItem
Definition: nbtree.h:578
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:433
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:291
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:559
#define SK_ISNULL
Definition: skey.h:115
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool pivotsearch
Definition: nbtree.h:475
#define P_ISDELETED(opaque)
Definition: nbtree.h:191
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:744
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:682
uint32 level
Definition: nbtree.h:61
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2236
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
uintptr_t Datum
Definition: postgres.h:367
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:539
bool _bt_heapkeyspace(Relation rel)
Definition: nbtpage.c:684
bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1357
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:98
#define InvalidOffsetNumber
Definition: off.h:26
#define InvalidOid
Definition: postgres_ext.h:36
BlockNumber bts_blkno
Definition: nbtree.h:417
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
int markItemIndex
Definition: nbtree.h:664
BlockNumber bts_btentry
Definition: nbtree.h:419
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1737
#define Max(x, y)
Definition: c.h:884
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2382
int sk_flags
Definition: skey.h:66
Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:244
BTScanInsert itup_key
Definition: nbtree.h:498
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:681
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
bool heapkeyspace
Definition: nbtree.h:472
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:610
#define INDEX_MAX_KEYS
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int numberOfKeys
Definition: nbtree.h:633
#define MAXALIGN(LEN)
Definition: c.h:685
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
struct BTStackData * bts_parent
Definition: nbtree.h:420
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:478
#define RelationNeedsWAL(relation)
Definition: rel.h:516
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:699
ItemPointerData heapTid
Definition: nbtree.h:543
#define DatumGetPointer(X)
Definition: postgres.h:549
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber nextPage
Definition: nbtree.h:554
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:667
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:924
ScanKey keyData
Definition: nbtree.h:634
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:226
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1630
#define unlikely(x)
Definition: c.h:208
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1348
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
XLogRecPtr lsn
Definition: nbtree.h:552
Buffer buf
Definition: nbtree.h:550
Oid * rd_opcintype
Definition: rel.h:167
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
#define BT_WRITE
Definition: nbtree.h:403
#define BTLessStrategyNumber
Definition: stratnum.h:29
LocationIndex tupleOffset
Definition: nbtree.h:545
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1403
int16 AttrNumber
Definition: attnum.h:21
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3391
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1030
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define offsetof(type, field)
Definition: c.h:655
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:598
AttrNumber sk_attno
Definition: skey.h:67
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
Definition: nbtsearch.c:1945