/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *    Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int  _bt_binsrch_posting(BTScanInsert key, Page page,
                                OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
                         OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
                         OffsetNumber offnum, IndexTuple itup);
static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
                                  OffsetNumber offnum, ItemPointer heapTid,
                                  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
                                       OffsetNumber offnum,
                                       ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
                                  ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);

/*
 *  _bt_drop_lock_and_maybe_pin()
 *
 * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
 * This will prevent vacuum from stalling in a blocked state trying to read a
 * page when a cursor is sitting on it -- at least in many important cases.
 *
 * Set the buffer to invalid if the pin is released, since the buffer may be
 * re-used.  If we need to go back to this block (for example, to apply
 * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
 * will remain in shared memory for as long as it takes to scan the index
 * buffer page.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
    LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);

    if (IsMVCCSnapshot(scan->xs_snapshot) &&
        RelationNeedsWAL(scan->indexRelation) &&
        !scan->xs_want_itup)
    {
        ReleaseBuffer(sp->buf);
        sp->buf = InvalidBuffer;
    }
}

/*
 *  _bt_search() -- Search the tree for a particular scankey,
 *      or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 */
BTStack
_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
           Snapshot snapshot)
{
    BTStack     stack_in = NULL;
    int         page_access = BT_READ;

    /* Get the root page to start with */
    *bufP = _bt_getroot(rel, access);

    /* If index is empty and access = BT_READ, no root page is created. */
    if (!BufferIsValid(*bufP))
        return (BTStack) NULL;

    /* Loop iterates once per level descended in the tree */
    for (;;)
    {
        Page        page;
        BTPageOpaque opaque;
        OffsetNumber offnum;
        ItemId      itemid;
        IndexTuple  itup;
        BlockNumber child;
        BTStack     new_stack;

        /*
         * Race -- the page we just grabbed may have split since we read its
         * downlink in its parent page (or the metapage).  If it has, we may
         * need to move right to its new sibling.  Do that.
         *
         * In write-mode, allow _bt_moveright to finish any incomplete splits
         * along the way.  Strictly speaking, we'd only need to finish an
         * incomplete split on the leaf page we're about to insert to, not on
         * any of the upper levels (internal pages with incomplete splits are
         * also taken care of in _bt_getstackbuf).  But this is a good
         * opportunity to finish splits of internal pages too.
         */
        *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
                              page_access, snapshot);

        /* if this is a leaf page, we're done */
        page = BufferGetPage(*bufP);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISLEAF(opaque))
            break;

        /*
         * Find the appropriate pivot tuple on this page.  Its downlink points
         * to the child page that we're about to descend to.
         */
        offnum = _bt_binsrch(rel, key, *bufP);
        itemid = PageGetItemId(page, offnum);
        itup = (IndexTuple) PageGetItem(page, itemid);
        Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
        child = BTreeTupleGetDownLink(itup);

        /*
         * We need to save the location of the pivot tuple we chose in a new
         * stack entry for this page/level.  If caller ends up splitting a
         * page one level down, it usually ends up inserting a new pivot
         * tuple/downlink immediately after the location recorded here.
         */
        new_stack = (BTStack) palloc(sizeof(BTStackData));
        new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
        new_stack->bts_offset = offnum;
        new_stack->bts_parent = stack_in;

        /*
         * Page level 1 is the lowest non-leaf level, immediately above the
         * leaves.  So if we're on level 1 and were asked to lock the leaf
         * page in write mode, lock the next page down in write mode, since
         * it must be a leaf.
         */
        if (opaque->btpo.level == 1 && access == BT_WRITE)
            page_access = BT_WRITE;

        /* drop the read lock on the page, then acquire one on its child */
        *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

        /* okay, all set to move down a level */
        stack_in = new_stack;
    }

    /*
     * If we're asked to lock the leaf in write mode, but didn't manage to,
     * then relock.  This should only happen when the root page is a leaf
     * page (and the only page in the index other than the metapage).
     */
    if (access == BT_WRITE && page_access == BT_READ)
    {
        /* trade in our read lock for a write lock */
        LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
        LockBuffer(*bufP, BT_WRITE);

        /*
         * If the page was split between the time that we surrendered our read
         * lock and acquired our write lock, then this page may no longer be
         * the right place for the key we want to insert.  In this case, we
         * need to move right in the tree.
         */
        *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
                              snapshot);
    }

    return stack_in;
}
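
/*
 * Illustrative sketch (not in the original file): a minimal read-only
 * caller of _bt_search(), mirroring the way _bt_first() below uses it.
 * The helper name is hypothetical; the block is guarded with NOT_USED so
 * it cannot affect compilation.
 */
#ifdef NOT_USED
static Buffer
example_descend_to_leaf(Relation rel, BTScanInsert key, Snapshot snapshot)
{
    Buffer      buf;
    BTStack     stack;

    /* descend with read locks; read-only callers never revisit parents */
    stack = _bt_search(rel, key, &buf, BT_READ, snapshot);
    _bt_freestack(stack);

    /* buf is InvalidBuffer if the index is completely empty */
    return buf;
}
#endif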

/*
 *  _bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split
 * is completed.  'stack' is only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 */
Buffer
_bt_moveright(Relation rel,
              BTScanInsert key,
              Buffer buf,
              bool forupdate,
              BTStack stack,
              int access,
              Snapshot snapshot)
{
    Page        page;
    BTPageOpaque opaque;
    int32       cmpval;

    /*
     * When nextkey = false (normal case): if the scan key that brought us to
     * this page is > the high key stored on the page, then the page has split
     * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     * have some duplicates to the right as well as the left, but that's
     * something that's only ever dealt with on the leaf level, after
     * _bt_search has found an initial leaf page.)
     *
     * When nextkey = true: move right if the scan key is >= page's high key.
     * (Note that key.scantid cannot be set in this case.)
     *
     * The page could even have split more than once, so scan as far as
     * needed.
     *
     * We also have to move right if we followed a link that brought us to a
     * dead page.
     */
    cmpval = key->nextkey ? 0 : 1;

    for (;;)
    {
        page = BufferGetPage(buf);
        TestForOldSnapshot(snapshot, rel, page);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        if (P_RIGHTMOST(opaque))
            break;

        /*
         * Finish any incomplete splits we encounter along the way.
         */
        if (forupdate && P_INCOMPLETE_SPLIT(opaque))
        {
            BlockNumber blkno = BufferGetBlockNumber(buf);

            /* upgrade our lock if necessary */
            if (access == BT_READ)
            {
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                LockBuffer(buf, BT_WRITE);
            }

            if (P_INCOMPLETE_SPLIT(opaque))
                _bt_finish_split(rel, buf, stack);
            else
                _bt_relbuf(rel, buf);

            /* re-acquire the lock in the right mode, and re-check */
            buf = _bt_getbuf(rel, blkno, access);
            continue;
        }

        if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
        {
            /* step right one page */
            buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
            continue;
        }
        else
            break;
    }

    if (P_IGNORE(opaque))
        elog(ERROR, "fell off the end of index \"%s\"",
             RelationGetRelationName(rel));

    return buf;
}
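
/*
 * Illustrative restatement (not in the original file) of the cmpval trick
 * used by _bt_moveright(): a single ">= cmpval" test covers both nextkey
 * cases.  Hypothetical helper, guarded with NOT_USED.
 */
#ifdef NOT_USED
static bool
example_must_move_right(int32 cmp_scankey_vs_highkey, bool nextkey)
{
    /* nextkey=false: move right when scankey >  high key (cmp >= 1) */
    /* nextkey=true:  move right when scankey >= high key (cmp >= 0) */
    int32       cmpval = nextkey ? 0 : 1;

    return cmp_scankey_vs_highkey >= cmpval;
}
#endif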

/*
 *  _bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
 * particular, this means it is possible to return a value 1 greater than the
 * number of keys on the page, if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)  This key indicates the
 * right place to descend to be sure we find all leaf keys >= given scankey
 * (or leaf keys > given scankey when nextkey is true).
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
            BTScanInsert key,
            Buffer buf)
{
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high;
    int32       result,
                cmpval;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* Requesting nextkey semantics while using scantid seems nonsensical */
    Assert(!key->nextkey || key->scantid == NULL);
    /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
    Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

    low = P_FIRSTDATAKEY(opaque);
    high = PageGetMaxOffsetNumber(page);

    /*
     * If there are no keys on the page, return the first available slot.  Note
     * this covers two cases: the page is really empty (no keys), or it
     * contains only a high key.  The latter case is possible after vacuuming.
     * This can never happen on an internal page, however, since they are
     * never empty (an internal page must have children).
     */
    if (unlikely(high < low))
        return low;

    /*
     * Binary search to find the first key on the page >= scan key, or first
     * key > scankey when nextkey is true.
     *
     * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     *
     * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     *
     * We can fall out when high == low.
     */
    high++;                     /* establish the loop invariant for high */

    cmpval = key->nextkey ? 0 : 1;  /* select comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, key, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }

    /*
     * At this point we have high == low, but be careful: they could point
     * past the last slot on the page.
     *
     * On a leaf page, we always return the first key >= scan key (resp. >
     * scan key), which could be the last slot + 1.
     */
    if (P_ISLEAF(opaque))
        return low;

    /*
     * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     * There must be one if _bt_compare() is playing by the rules.
     */
    Assert(low > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(low);
}
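
/*
 * Illustrative standalone version (not in the original file) of the loop
 * invariant used by _bt_binsrch(): find the first slot whose value is
 * >= the search value (or > it when nextkey is true), on a plain int
 * array rather than an index page.  Hypothetical helper under NOT_USED.
 */
#ifdef NOT_USED
static int
example_lower_bound(const int *arr, int nelem, int value, bool nextkey)
{
    int         low = 0;
    int         high = nelem;   /* one past the last slot */
    int         cmpval = nextkey ? 0 : 1;

    while (high > low)
    {
        int         mid = low + ((high - low) / 2);
        /* sign of (value - arr[mid]), mimicking _bt_compare's result */
        int         result = (value > arr[mid]) - (value < arr[mid]);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }

    return low;                 /* nelem if value is beyond every entry */
}
#endif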

/*
 *
 *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
    BTScanInsert key = insertstate->itup_key;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high,
                stricthigh;
    int32       result,
                cmpval;

    page = BufferGetPage(insertstate->buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    Assert(P_ISLEAF(opaque));
    Assert(!key->nextkey);
    Assert(insertstate->postingoff == 0);

    if (!insertstate->bounds_valid)
    {
        /* Start new binary search */
        low = P_FIRSTDATAKEY(opaque);
        high = PageGetMaxOffsetNumber(page);
    }
    else
    {
        /* Restore result of previous binary search against same page */
        low = insertstate->low;
        high = insertstate->stricthigh;
    }

    /* If there are no keys on the page, return the first available slot */
    if (unlikely(high < low))
    {
        /* Caller can't reuse bounds */
        insertstate->low = InvalidOffsetNumber;
        insertstate->stricthigh = InvalidOffsetNumber;
        insertstate->bounds_valid = false;
        return low;
    }

    /*
     * Binary search to find the first key on the page >= scan key.  (nextkey
     * is always false when inserting).
     *
     * The loop invariant is: all slots before 'low' are < scan key, all slots
     * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     * maintained to save additional search effort for caller.
     *
     * We can fall out when high == low.
     */
    if (!insertstate->bounds_valid)
        high++;                 /* establish the loop invariant for high */
    stricthigh = high;          /* high initially strictly higher */

    cmpval = 1;                 /* !nextkey comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, key, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
        {
            high = mid;
            if (result != 0)
                stricthigh = high;
        }

        /*
         * If tuple at offset located by binary search is a posting list whose
         * TID range overlaps with caller's scantid, perform posting list
         * binary search to set postingoff for caller.  Caller must split the
         * posting list when postingoff is set.  This should happen
         * infrequently.
         */
        if (unlikely(result == 0 && key->scantid != NULL))
            insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
    }

    /*
     * On a leaf page, a binary search always returns the first key >= scan
     * key (at least in !nextkey case), which could be the last slot + 1.  This
     * is also the lower bound of cached search.
     *
     * stricthigh may also be the last slot + 1, which prevents caller from
     * using bounds directly, but is still useful to us if we're called a
     * second time with cached bounds (cached low will be < stricthigh when
     * that happens).
     */
    insertstate->low = low;
    insertstate->stricthigh = stricthigh;
    insertstate->bounds_valid = true;

    return low;
}
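
/*
 * Illustrative sketch (not in the original file) of the intended calling
 * pattern for the cached bounds: a first call establishes low/stricthigh,
 * and a second call against the unmodified page restarts the binary search
 * from those bounds rather than from the whole page.  Hypothetical helper
 * under NOT_USED.
 */
#ifdef NOT_USED
static void
example_search_twice(Relation rel, BTInsertState insertstate)
{
    OffsetNumber off1,
                off2;

    insertstate->bounds_valid = false;  /* force a full-page search */
    off1 = _bt_binsrch_insert(rel, insertstate);

    /* page unchanged, so the cached bounds may be reused */
    off2 = _bt_binsrch_insert(rel, insertstate);
    Assert(off1 == off2);
}
#endif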
549 
550 /*----------
551  * _bt_binsrch_posting() -- posting list binary search.
552  *
553  * Helper routine for _bt_binsrch_insert().
554  *
555  * Returns offset into posting list where caller's scantid belongs.
556  *----------
557  */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
    IndexTuple  itup;
    ItemId      itemid;
    int         low,
                high,
                mid,
                res;

    /*
     * If this isn't a posting tuple, then the index must be corrupt (if it is
     * an ordinary non-pivot tuple then there must be an existing tuple with a
     * heap TID that equals inserter's new heap TID/scantid).  Defensively
     * check that tuple is a posting list tuple whose posting list range
     * includes caller's scantid.
     *
     * (This is also needed because contrib/amcheck's rootdescend option needs
     * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     */
    itemid = PageGetItemId(page, offnum);
    itup = (IndexTuple) PageGetItem(page, itemid);
    if (!BTreeTupleIsPosting(itup))
        return 0;

    Assert(key->heapkeyspace && key->allequalimage);

    /*
     * In the event that posting list tuple has LP_DEAD bit set, indicate this
     * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     * second call to _bt_binsrch_insert() can take place when its caller has
     * removed the dead item.
     */
    if (ItemIdIsDead(itemid))
        return -1;

    /* "high" is past end of posting list for loop invariant */
    low = 0;
    high = BTreeTupleGetNPosting(itup);
    Assert(high >= 2);

    while (high > low)
    {
        mid = low + ((high - low) / 2);
        res = ItemPointerCompare(key->scantid,
                                 BTreeTupleGetPostingN(itup, mid));

        if (res > 0)
            low = mid + 1;
        else if (res < 0)
            high = mid;
        else
            return mid;
    }

    /* Exact match not found */
    return low;
}

/*----------
 *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *  page/offnum: location of btree item to be compared to.
 *
 *      This routine returns:
 *          <0 if scankey < tuple at offnum;
 *           0 if scankey == tuple at offnum;
 *          >0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with its scantid.  There generally won't be a
 * matching TID in the posting tuple, which the caller must handle
 * itself (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
            BTScanInsert key,
            Page page,
            OffsetNumber offnum)
{
    TupleDesc   itupdesc = RelationGetDescr(rel);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTuple  itup;
    ItemPointer heapTid;
    ScanKey     scankey;
    int         ncmpkey;
    int         ntupatts;
    int32       result;

    Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
    Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
    Assert(key->heapkeyspace || key->scantid == NULL);

    /*
     * Force result ">" if target item is first data item on an internal page
     * --- see NOTE above.
     */
    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
        return 1;

    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    ntupatts = BTreeTupleGetNAtts(itup, rel);

    /*
     * The scan key is set up with the attribute number associated with each
     * term in the key.  It is important that, if the index is multi-key, the
     * scan contain the first k key attributes, and that they be in order.  If
     * you think about how multi-key ordering works, you'll understand why
     * this is.
     *
     * We don't test for violation of this condition here, however.  The
     * initial setup for the index scan had better have gotten it right (see
     * _bt_first).
     */

    ncmpkey = Min(ntupatts, key->keysz);
    Assert(key->heapkeyspace || ncmpkey == key->keysz);
    Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
    scankey = key->scankeys;
    for (int i = 1; i <= ncmpkey; i++)
    {
        Datum       datum;
        bool        isNull;

        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

        if (scankey->sk_flags & SK_ISNULL)  /* key is NULL */
        {
            if (isNull)
                result = 0;     /* NULL "=" NULL */
            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = -1;    /* NULL "<" NOT_NULL */
            else
                result = 1;     /* NULL ">" NOT_NULL */
        }
        else if (isNull)        /* key is NOT_NULL and item is NULL */
        {
            if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = 1;     /* NOT_NULL ">" NULL */
            else
                result = -1;    /* NOT_NULL "<" NULL */
        }
        else
        {
            /*
             * The sk_func needs to be passed the index value as left arg and
             * the sk_argument as right arg (they might be of different
             * types).  Since it is convenient for callers to think of
             * _bt_compare as comparing the scankey to the index item, we have
             * to flip the sign of the comparison result.  (Unless it's a DESC
             * column, in which case we *don't* flip the sign.)
             */
            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
                                                     scankey->sk_collation,
                                                     datum,
                                                     scankey->sk_argument));

            if (!(scankey->sk_flags & SK_BT_DESC))
                INVERT_COMPARE_RESULT(result);
        }

        /* if the keys are unequal, return the difference */
        if (result != 0)
            return result;

        scankey++;
    }

    /*
     * All non-truncated attributes (other than heap TID) were found to be
     * equal.  Treat truncated attributes as minus infinity when scankey has a
     * key attribute value that would otherwise be compared directly.
     *
     * Note: it doesn't matter if ntupatts includes non-key attributes;
     * scankey won't, so explicitly excluding non-key attributes isn't
     * necessary.
     */
    if (key->keysz > ntupatts)
        return 1;

    /*
     * Use the heap TID attribute and scantid to try to break the tie.  The
     * rules are the same as any other key attribute -- only the
     * representation differs.
     */
    heapTid = BTreeTupleGetHeapTID(itup);
    if (key->scantid == NULL)
    {
        /*
         * Most searches have a scankey that is considered greater than a
         * truncated pivot tuple if and when the scankey has equal values for
         * attributes up to and including the least significant untruncated
         * attribute in tuple.
         *
         * For example, if an index has the minimum two attributes (single
         * user key attribute, plus heap TID attribute), and a page's high key
         * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
         * will not descend to the page to the left.  The search will descend
         * right instead.  The truncated attribute in pivot tuple means that
         * all non-pivot tuples on the page to the left are strictly < 'foo',
         * so it isn't necessary to descend left.  In other words, search
         * doesn't have to descend left because it isn't interested in a match
         * that has a heap TID value of -inf.
         *
         * However, some searches (pivotsearch searches) actually require that
         * we descend left when this happens.  -inf is treated as a possible
         * match for omitted scankey attribute(s).  This is needed by page
         * deletion, which must re-find leaf pages that are targets for
         * deletion using their high keys.
         *
         * Note: the heap TID part of the test ensures that scankey is being
         * compared to a pivot tuple with one or more truncated key
         * attributes.
         *
         * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
         * left here, since they have no heap TID attribute (and cannot have
         * any -inf key values in any case, since truncation can only remove
         * non-key attributes).  !heapkeyspace searches must always be
         * prepared to deal with matches on both sides of the pivot once the
         * leaf level is reached.
         */
        if (key->heapkeyspace && !key->pivotsearch &&
            key->keysz == ntupatts && heapTid == NULL)
            return 1;

        /* All provided scankey arguments found to be equal */
        return 0;
    }

    /*
     * Treat truncated heap TID as minus infinity, since scankey has a key
     * attribute value (scantid) that would otherwise be compared directly
     */
    Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
    if (heapTid == NULL)
        return 1;

    /*
     * Scankey must be treated as equal to a posting list tuple if its scantid
     * value falls within the range of the posting list.  In all other cases
     * there can only be a single heap TID value, which is compared directly
     * with scantid.
     */
    Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
    result = ItemPointerCompare(key->scantid, heapTid);
    if (result <= 0 || !BTreeTupleIsPosting(itup))
        return result;
    else
    {
        result = ItemPointerCompare(key->scantid,
                                    BTreeTupleGetMaxHeapTID(itup));
        if (result > 0)
            return 1;
    }

    return 0;
}
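
/*
 * Illustrative restatement (not in the original file) of the NULL-handling
 * rules above: NULLs are sortable, and NULLS FIRST/LAST decides which end
 * of the key space they occupy.  Returns the same sign convention as
 * _bt_compare.  Hypothetical helper under NOT_USED.
 */
#ifdef NOT_USED
static int
example_compare_with_nulls(bool key_is_null, bool item_is_null,
                           bool nulls_first, int datum_cmp)
{
    if (key_is_null && item_is_null)
        return 0;               /* NULL "=" NULL */
    if (key_is_null)
        return nulls_first ? -1 : 1;    /* NULL sorts before/after all */
    if (item_is_null)
        return nulls_first ? 1 : -1;
    return datum_cmp;           /* both non-NULL: ordinary comparison */
}
#endif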

/*
 *  _bt_first() -- Find the first item in a scan.
 *
 *      We need to be clever about the direction of scan, the search
 *      conditions, and the tree ordering.  We find the first item (or,
 *      if backwards scan, the last item) in the tree that satisfies the
 *      qualifications in the scan key.  On success exit, the page containing
 *      the current index tuple is pinned but not locked, and data about
 *      the matching tuple(s) on the page has been loaded into so->currPos.
 *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
 *      and if requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    bool        nextkey;
    bool        goback;
    BTScanInsertData inskey;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysCount = 0;
    int         i;
    bool        status = true;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;
    BlockNumber blkno;

    Assert(!BTScanPosIsValid(so->currPos));

    pgstat_count_index_scan(rel);

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
        return false;

    /*
     * For parallel scans, get the starting page from shared state.  If the
     * scan has not started, proceed to find the first leaf page in the usual
     * way while keeping other participating processes waiting.  If the scan
     * has already begun, use the page number from the shared structure.
     */
    if (scan->parallel_scan != NULL)
    {
        status = _bt_parallel_seize(scan, &blkno);
        if (!status)
            return false;
        else if (blkno == P_NONE)
        {
            _bt_parallel_done(scan);
            return false;
        }
        else if (blkno != InvalidBlockNumber)
        {
            if (!_bt_parallel_readpage(scan, blkno, dir))
                return false;
            goto readcomplete;
        }
    }

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysCount] */
                    chosen = &notnullkeys[keysCount];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                             (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                           ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                            BTGreaterStrategyNumber :
                                            BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysCount++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysCount == 0)
    {
        bool        match;

        match = _bt_endpoint(scan, dir);

        if (!match)
        {
            /* No match, so mark (parallel) scan finished */
            _bt_parallel_done(scan);
        }

        return match;
    }

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built using the keys
     * identified by startKeys[].  (Remaining insertion scankey fields are
     * initialized after initial-positioning strategy is finalized.)
     */
    Assert(keysCount <= INDEX_MAX_KEYS);
    for (i = 0; i < keysCount; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check.  A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
            {
                _bt_parallel_done(scan);
                return false;
            }
            memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysCount - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysCount + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysCount < INDEX_MAX_KEYS);
                    memcpy(inskey.scankeys + keysCount, subkey,
                           sizeof(ScanKeyData));
                    keysCount++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(inskey.scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * code below.
     *
     * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     * item >= scan key.  If nextkey = true, they will locate the first
     * item > scan key.
     *
     * If goback = true, we will then step back one item, while if
     * goback = false, we will start the scan on the located item.
     *----------
     */
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            /*
             * Find first item >= scankey, then back up one to arrive at last
             * item < scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = false;
            goback = true;
            break;

        case BTLessEqualStrategyNumber:

            /*
             * Find first item > scankey, then back up one to arrive at last
             * item <= scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = true;
            goback = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = true;
                goback = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = false;
                goback = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = false;
            goback = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = true;
            goback = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /* Initialize remaining insertion scan key fields */
    _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    inskey.anynullkeys = false; /* unused */
    inskey.nextkey = nextkey;
    inskey.pivotsearch = false;
    inskey.scantid = NULL;
    inskey.keysz = keysCount;

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty.  Lock relation
         * because nothing finer to lock exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);

        /*
         * mark parallel scan as done, so that all the workers can finish
         * their scan
         */
        _bt_parallel_done(scan);
        BTScanPosInvalidate(so->currPos);

        return false;
    }
    else
        PredicateLockPage(rel, BufferGetBlockNumber(buf),
                          scan->xs_snapshot);

    _bt_initialize_more_data(so, dir);

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, &inskey, buf);

    /*
     * If nextkey = false, we are positioned at the first item >= scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than the scan key and we know that everything on later pages is greater
     * than or equal to scan key.
     *
     * If nextkey = true, we are positioned at the first item > scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than or equal to the scan key and we know that everything on later
     * pages is greater than scan key.
     *
     * The actually desired starting point is either this item or the prior
     * one, or in the end-of-page case it's the first item on the next page or
     * the last item on this page.  Adjust the starting offset if needed.  (If
     * this results in an offset before the first item or after the last one,
     * _bt_readpage will report no items found, and then we'll step to the
     * next page as needed.)
     */
    if (goback)
        offnum = OffsetNumberPrev(offnum);

    /* remember which buffer we have pinned, if any */
    Assert(!BTScanPosIsValid(so->currPos));
    so->currPos.buf = buf;

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, offnum))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
        if (!_bt_steppage(scan, dir))
            return false;
    }
    else
    {
        /* Drop the lock, and maybe the pin, on the current page */
        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    }

readcomplete:
    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
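
/*
 * Illustrative summary (not in the original file) of the strategy table in
 * _bt_first(): each initial-positioning strategy maps onto a (nextkey,
 * goback) pair, with the scan direction mattering only for the equality
 * strategy.  Hypothetical helper under NOT_USED.
 */
#ifdef NOT_USED
static void
example_position_flags(StrategyNumber strat, ScanDirection dir,
                       bool *nextkey, bool *goback)
{
    switch (strat)
    {
        case BTLessStrategyNumber:  /* want last item < key (backward) */
            *nextkey = false;
            *goback = true;
            break;
        case BTLessEqualStrategyNumber: /* want last item <= key */
            *nextkey = true;
            *goback = true;
            break;
        case BTEqualStrategyNumber:
            /* backward scans behave like <=, forward scans like >= */
            *nextkey = ScanDirectionIsBackward(dir);
            *goback = ScanDirectionIsBackward(dir);
            break;
        case BTGreaterEqualStrategyNumber:  /* want first item >= key */
            *nextkey = false;
            *goback = false;
            break;
        case BTGreaterStrategyNumber:   /* want first item > key */
            *nextkey = true;
            *goback = false;
            break;
        default:
            *nextkey = false;
            *goback = false;
            break;
    }
}
#endif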

/*
 *  _bt_next() -- Get the next item in a scan.
 *
 *      On entry, so->currPos describes the current page, which may be pinned
 *      but is not locked, and so->currPos.itemIndex identifies which item was
 *      previously returned.
 *
 *      On successful exit, scan->xs_ctup.t_self is set to the TID of the
 *      next heap tuple, and if requested, scan->xs_itup points to a copy of
 *      the index tuple.  so->currPos is updated as needed.
 *
 *      On failure exit (no more tuples), we release pin and set
 *      so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BTScanPosItem *currItem;

    /*
     * Advance to next tuple on current page; or if there's no more, try to
     * step to the next page with data.
     */
    if (ScanDirectionIsForward(dir))
    {
        if (++so->currPos.itemIndex > so->currPos.lastItem)
        {
            if (!_bt_steppage(scan, dir))
                return false;
        }
    }
    else
    {
        if (--so->currPos.itemIndex < so->currPos.firstItem)
        {
            if (!_bt_steppage(scan, dir))
                return false;
        }
    }

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

/*
 *  _bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         itemIndex;
    bool        continuescan;
    int         indnatts;

    /*
     * We must have the buffer pinned and locked, but the usual macro can't be
     * used here; this function is what makes it good for currPos.
     */
    Assert(BufferIsValid(so->currPos.buf));

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* allow next page to be processed by a parallel worker */
    if (scan->parallel_scan)
    {
        if (ScanDirectionIsForward(dir))
            _bt_parallel_release(scan, opaque->btpo_next);
        else
            _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    }

    continuescan = true;        /* default assumption */
    indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * We note the buffer's block number so that we can release the pin later.
     * This allows us to re-read the buffer if it is needed again for hinting.
     */
    so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);

    /*
     * We save the LSN of the page as we read it, so that we know whether it
     * is safe to apply LP_DEAD hints to the page later.  This allows us to
     * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
     */
    so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);

    /*
     * we must save the page's right-link while scanning it; this tells us
     * where to step right to after we're done with these items.  There is no
     * corresponding need for the left-link, since splits always go right.
     */
    so->currPos.nextPage = opaque->btpo_next;

    /* initialize tuple workspace to empty */
    so->currPos.nextTupleOffset = 0;

    /*
     * Now that the current page has been made consistent, the macro should be
     * good.
     */
    Assert(BTScanPosIsPinned(so->currPos));

    if (ScanDirectionIsForward(dir))
    {
        /* load items[] in ascending order */
        itemIndex = 0;

        offnum = Max(offnum, minoff);

        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  itup;

            /*
             * If the scan specifies not to return killed tuples, then we
             * treat a killed tuple as not passing the qual
             */
            if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
            {
                offnum = OffsetNumberNext(offnum);
                continue;
            }

            itup = (IndexTuple) PageGetItem(page, iid);

            if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
            {
                /* tuple passes all scan key conditions */
                if (!BTreeTupleIsPosting(itup))
                {
                    /* Remember it */
                    _bt_saveitem(so, itemIndex, offnum, itup);
                    itemIndex++;
                }
                else
                {
                    int         tupleOffset;

                    /*
                     * Set up state to return posting list, and remember first
                     * TID
                     */
                    tupleOffset =
                        _bt_setuppostingitems(so, itemIndex, offnum,
                                              BTreeTupleGetPostingN(itup, 0),
                                              itup);
                    itemIndex++;
                    /* Remember additional TIDs */
                    for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
                    {
                        _bt_savepostingitem(so, itemIndex, offnum,
                                            BTreeTupleGetPostingN(itup, i),
                                            tupleOffset);
                        itemIndex++;
                    }
                }
            }
            /* When !continuescan, there can't be any more matches, so stop */
            if (!continuescan)
                break;

            offnum = OffsetNumberNext(offnum);
        }

        /*
         * We don't need to visit page to the right when the high key
         * indicates that no more matches will be found there.
         *
         * Checking the high key like this works out more often than you might
         * think.  Leaf page splits pick a split point between the two most
         * dissimilar tuples (this is weighed against the need to evenly share
         * free space).  Leaf pages with high key attribute values that can
         * only appear on non-pivot tuples on the right sibling page are
         * common.
         */
        if (continuescan && !P_RIGHTMOST(opaque))
        {
            ItemId      iid = PageGetItemId(page, P_HIKEY);
            IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
            int         truncatt;

            truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
            _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
        }

        if (!continuescan)
            so->currPos.moreRight = false;

        Assert(itemIndex <= MaxTIDsPerBTreePage);
        so->currPos.firstItem = 0;
        so->currPos.lastItem = itemIndex - 1;
        so->currPos.itemIndex = 0;
    }
1661  else
1662  {
1663  /* load items[] in descending order */
1664  itemIndex = MaxTIDsPerBTreePage;
1665 
1666  offnum = Min(offnum, maxoff);
1667 
1668  while (offnum >= minoff)
1669  {
1670  ItemId iid = PageGetItemId(page, offnum);
1671  IndexTuple itup;
1672  bool tuple_alive;
1673  bool passes_quals;
1674 
1675  /*
1676  * If the scan specifies not to return killed tuples, then we
1677  * treat a killed tuple as not passing the qual. Most of the
1678  * time, it's a win to not bother examining the tuple's index
1679  * keys, but just skip to the next tuple (previous, actually,
1680  * since we're scanning backwards). However, if this is the first
1681  * tuple on the page, we do check the index keys, to prevent
1682  * uselessly advancing to the page to the left. This is similar
1683  * to the high key optimization used by forward scans.
1684  */
1685  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1686  {
1687  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1688  if (offnum > P_FIRSTDATAKEY(opaque))
1689  {
1690  offnum = OffsetNumberPrev(offnum);
1691  continue;
1692  }
1693 
1694  tuple_alive = false;
1695  }
1696  else
1697  tuple_alive = true;
1698 
1699  itup = (IndexTuple) PageGetItem(page, iid);
1700 
1701  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1702  &continuescan);
1703  if (passes_quals && tuple_alive)
1704  {
1705  /* tuple passes all scan key conditions */
1706  if (!BTreeTupleIsPosting(itup))
1707  {
1708  /* Remember it */
1709  itemIndex--;
1710  _bt_saveitem(so, itemIndex, offnum, itup);
1711  }
1712  else
1713  {
1714  int tupleOffset;
1715 
1716  /*
1717  * Set up state to return posting list, and remember first
1718  * TID.
1719  *
1720  * Note that we deliberately save/return items from
1721  * posting lists in ascending heap TID order for backwards
1722  * scans. This allows _bt_killitems() to make a
1723  * consistent assumption about the order of items
1724  * associated with the same posting list tuple.
1725  */
1726  itemIndex--;
1727  tupleOffset =
1728  _bt_setuppostingitems(so, itemIndex, offnum,
1729  BTreeTupleGetPostingN(itup, 0),
1730  itup);
1731  /* Remember additional TIDs */
1732  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1733  {
1734  itemIndex--;
1735  _bt_savepostingitem(so, itemIndex, offnum,
1736  BTreeTupleGetPostingN(itup, i),
1737  tupleOffset);
1738  }
1739  }
1740  }
1741  if (!continuescan)
1742  {
1743  /* there can't be any more matches, so stop */
1744  so->currPos.moreLeft = false;
1745  break;
1746  }
1747 
1748  offnum = OffsetNumberPrev(offnum);
1749  }
1750 
1751  Assert(itemIndex >= 0);
1752  so->currPos.firstItem = itemIndex;
1753  so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1754  so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1755  }
1756 
1757  return (so->currPos.firstItem <= so->currPos.lastItem);
1758 }
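
An aside on the two-ended fill above: forward scans append matches at the low end of currPos.items[], while backward scans fill from the high end downward, so that items[firstItem..lastItem] always holds matches in ascending page-offset order regardless of scan direction. A minimal standalone sketch of the pattern (placeholder array size and element type, not the real BTScanPosItem layout):

#include <stdbool.h>

#define SKETCH_MAX_ITEMS 8

typedef struct sketch_pos
{
	int		items[SKETCH_MAX_ITEMS];
	int		firstItem;	/* first valid index, inclusive */
	int		lastItem;	/* last valid index, inclusive */
} sketch_pos;

static bool
sketch_fill(sketch_pos *pos, const int *matches, int nmatches, bool forward)
{
	if (forward)
	{
		int		itemIndex = 0;

		/* append matches in ascending order, from the bottom up */
		for (int i = 0; i < nmatches; i++)
			pos->items[itemIndex++] = matches[i];
		pos->firstItem = 0;
		pos->lastItem = itemIndex - 1;
	}
	else
	{
		int		itemIndex = SKETCH_MAX_ITEMS;

		/* visit matches in descending order, filling from the top down */
		for (int i = nmatches - 1; i >= 0; i--)
			pos->items[--itemIndex] = matches[i];
		pos->firstItem = itemIndex;
		pos->lastItem = SKETCH_MAX_ITEMS - 1;
	}
	return pos->firstItem <= pos->lastItem;
}
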
1759 
1760 /* Save an index item into so->currPos.items[itemIndex] */
1761 static void
1762 _bt_saveitem(BTScanOpaque so, int itemIndex,
1763  OffsetNumber offnum, IndexTuple itup)
1764 {
1765  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1766 
1767  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1768 
1769  currItem->heapTid = itup->t_tid;
1770  currItem->indexOffset = offnum;
1771  if (so->currTuples)
1772  {
1773  Size itupsz = IndexTupleSize(itup);
1774 
1775  currItem->tupleOffset = so->currPos.nextTupleOffset;
1776  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1777  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1778  }
1779 }
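
The workspace bookkeeping in _bt_saveitem() is the usual append-into-a-flat-buffer idiom: copy the record at the current write offset, then advance the offset by the MAXALIGN'd size so the next record starts on a maximally aligned boundary. A minimal sketch; the 8-byte alignment here is an assumption, whereas PostgreSQL derives MAXALIGN from the platform:

#include <stddef.h>
#include <string.h>

/* stand-in for MAXALIGN, assuming 8-byte maximum alignment */
#define SKETCH_MAXALIGN(len) (((len) + 7) & ~((size_t) 7))

/* Append len bytes at *next_offset in buf; return the record's offset. */
static size_t
sketch_append(char *buf, size_t *next_offset, const void *data, size_t len)
{
	size_t	off = *next_offset;

	memcpy(buf + off, data, len);
	*next_offset += SKETCH_MAXALIGN(len);	/* keep the next record aligned */
	return off;
}
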
1780 
1781 /*
1782  * Set up state to save TIDs/items from a single posting list tuple.
1783  *
1784  * Saves an index item into so->currPos.items[itemIndex] for the TID that
1785  * is returned to the scan first. Second and subsequent TIDs for the
1786  * posting list should be saved by calling _bt_savepostingitem().
1787  *
1788  * Returns the offset into tuple storage space at which the main tuple is
1789  * stored, if one is needed.
1790  */
1791 static int
1792 _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1793  ItemPointer heapTid, IndexTuple itup)
1794 {
1795  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1796 
1797  Assert(BTreeTupleIsPosting(itup));
1798 
1799  currItem->heapTid = *heapTid;
1800  currItem->indexOffset = offnum;
1801  if (so->currTuples)
1802  {
1803  /* Save base IndexTuple (truncate posting list) */
1804  IndexTuple base;
1805  Size itupsz = BTreeTupleGetPostingOffset(itup);
1806 
1807  itupsz = MAXALIGN(itupsz);
1808  currItem->tupleOffset = so->currPos.nextTupleOffset;
1809  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1810  memcpy(base, itup, itupsz);
1811  /* Defensively reduce work area index tuple header size */
1812  base->t_info &= ~INDEX_SIZE_MASK;
1813  base->t_info |= itupsz;
1814  so->currPos.nextTupleOffset += itupsz;
1815 
1816  return currItem->tupleOffset;
1817  }
1818 
1819  return 0;
1820 }
1821 
1822 /*
1823  * Save an index item into so->currPos.items[itemIndex] for the current
1824  * posting tuple.
1825  *
1826  * Assumes that _bt_setuppostingitems() has already been called for the
1827  * current posting list tuple. Caller passes its return value as tupleOffset.
1828  */
1829 static inline void
1830 _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1831  ItemPointer heapTid, int tupleOffset)
1832 {
1833  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1834 
1835  currItem->heapTid = *heapTid;
1836  currItem->indexOffset = offnum;
1837 
1838  /*
1839  * Have index-only scans return the same base IndexTuple for every TID
1840  * that originates from the same posting list
1841  */
1842  if (so->currTuples)
1843  currItem->tupleOffset = tupleOffset;
1844 }
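
Taken together, _bt_setuppostingitems() and _bt_savepostingitem() expand one posting list tuple into several returned items that all share a single saved base tuple. A simplified sketch of that expansion, with stand-in types (a plain unsigned in place of ItemPointerData):

typedef struct sketch_item
{
	unsigned	heapTid;		/* stand-in for ItemPointerData */
	int			tupleOffset;	/* offset of the shared base tuple */
} sketch_item;

/* Expand a posting list's TIDs into items[]; returns the next free index. */
static int
sketch_expand_posting(sketch_item *items, int itemIndex,
					  const unsigned *tids, int ntids, int baseTupleOffset)
{
	/* first TID: the "setup" step establishes the shared tuple offset */
	items[itemIndex].heapTid = tids[0];
	items[itemIndex].tupleOffset = baseTupleOffset;
	itemIndex++;

	/* remaining TIDs: the "save" steps reuse that same offset */
	for (int i = 1; i < ntids; i++)
	{
		items[itemIndex].heapTid = tids[i];
		items[itemIndex].tupleOffset = baseTupleOffset;
		itemIndex++;
	}
	return itemIndex;
}
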
1845 
1846 /*
1847  * _bt_steppage() -- Step to next page containing valid data for scan
1848  *
1849  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1850  * if pinned, we'll drop the pin before moving to next page. The buffer is
1851  * not locked on exit.
1852  *
1853  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1854  * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1855  * to InvalidBuffer. We return true to indicate success.
1856  */
1857 static bool
1858 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1859 {
1860  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1861  BlockNumber blkno = InvalidBlockNumber;
1862  bool status = true;
1863 
1864  Assert(BTScanPosIsValid(so->currPos));
1865 
1866  /* Before leaving current page, deal with any killed items */
1867  if (so->numKilled > 0)
1868  _bt_killitems(scan);
1869 
1870  /*
1871  * Before we modify currPos, make a copy of the page data if there was a
1872  * mark position that needs it.
1873  */
1874  if (so->markItemIndex >= 0)
1875  {
1876  /* bump pin on current buffer for assignment to mark buffer */
1877  if (BTScanPosIsPinned(so->currPos))
1878  IncrBufferRefCount(so->currPos.buf);
1879  memcpy(&so->markPos, &so->currPos,
1880  offsetof(BTScanPosData, items[1]) +
1881  so->currPos.lastItem * sizeof(BTScanPosItem));
1882  if (so->markTuples)
1883  memcpy(so->markTuples, so->currTuples,
1884  so->currPos.nextTupleOffset);
1885  so->markPos.itemIndex = so->markItemIndex;
1886  so->markItemIndex = -1;
1887  }
1888 
1889  if (ScanDirectionIsForward(dir))
1890  {
1891  /* Walk right to the next page with data */
1892  if (scan->parallel_scan != NULL)
1893  {
1894  /*
1895  * Seize the scan to get the next block number; if the scan has
1896  * ended already, bail out.
1897  */
1898  status = _bt_parallel_seize(scan, &blkno);
1899  if (!status)
1900  {
1901  /* release the previous buffer, if pinned */
1902  BTScanPosUnpinIfPinned(so->currPos);
1903  BTScanPosInvalidate(so->currPos);
1904  return false;
1905  }
1906  }
1907  else
1908  {
1909  /* Not parallel, so use the previously-saved nextPage link. */
1910  blkno = so->currPos.nextPage;
1911  }
1912 
1913  /* Remember we left a page with data */
1914  so->currPos.moreLeft = true;
1915 
1916  /* release the previous buffer, if pinned */
1917  BTScanPosUnpinIfPinned(so->currPos);
1918  }
1919  else
1920  {
1921  /* Remember we left a page with data */
1922  so->currPos.moreRight = true;
1923 
1924  if (scan->parallel_scan != NULL)
1925  {
1926  /*
1927  * Seize the scan to get the current block number; if the scan has
1928  * ended already, bail out.
1929  */
1930  status = _bt_parallel_seize(scan, &blkno);
1931  BTScanPosUnpinIfPinned(so->currPos);
1932  if (!status)
1933  {
1934  BTScanPosInvalidate(so->currPos);
1935  return false;
1936  }
1937  }
1938  else
1939  {
1940  /* Not parallel, so just use our own notion of the current page */
1941  blkno = so->currPos.currPage;
1942  }
1943  }
1944 
1945  if (!_bt_readnextpage(scan, blkno, dir))
1946  return false;
1947 
1948  /* Drop the lock, and maybe the pin, on the current page */
1949  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1950 
1951  return true;
1952 }
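
The mark-position copy in _bt_steppage() relies on a partial-copy idiom: offsetof(BTScanPosData, items[1]) + lastItem * sizeof(BTScanPosItem) covers the fixed header plus exactly lastItem + 1 used array entries, so the (mostly unused) remainder of the items array is never touched. A toy illustration with a made-up struct, not the real BTScanPosData:

#include <stddef.h>
#include <string.h>

typedef struct sketch_posdata
{
	int		firstItem;
	int		lastItem;
	int		items[1024];	/* typically only partially used */
} sketch_posdata;

static void
sketch_save_mark(sketch_posdata *mark, const sketch_posdata *curr)
{
	/* header + items[0..lastItem], i.e. lastItem + 1 array entries */
	memcpy(mark, curr,
		   offsetof(sketch_posdata, items[1]) +
		   curr->lastItem * sizeof(int));
}
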
1953 
1954 /*
1955  * _bt_readnextpage() -- Read next page containing valid data for scan
1956  *
1957  * On successful exit, so->currPos is updated to contain data from the next
1958  * interesting page. The caller is responsible for releasing the lock and
1959  * pin on the buffer on success. We return true to indicate success.
1960  *
1961  * If there are no more matching records in the given direction, we drop all
1962  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1963  */
1964 static bool
1965 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1966 {
1967  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1968  Relation rel;
1969  Page page;
1970  BTPageOpaque opaque;
1971  bool status = true;
1972 
1973  rel = scan->indexRelation;
1974 
1975  if (ScanDirectionIsForward(dir))
1976  {
1977  for (;;)
1978  {
1979  /*
1980  * if we're at end of scan, give up and mark parallel scan as
1981  * done, so that all the workers can finish their scan
1982  */
1983  if (blkno == P_NONE || !so->currPos.moreRight)
1984  {
1985  _bt_parallel_done(scan);
1986  BTScanPosInvalidate(so->currPos);
1987  return false;
1988  }
1989  /* check for interrupts while we're not holding any buffer lock */
1990  CHECK_FOR_INTERRUPTS();
1991  /* step right one page */
1992  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1993  page = BufferGetPage(so->currPos.buf);
1994  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1995  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1996  /* check for deleted page */
1997  if (!P_IGNORE(opaque))
1998  {
1999  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2000  /* see if there are any matches on this page */
2001  /* note that this will clear moreRight if we can stop */
2002  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
2003  break;
2004  }
2005  else if (scan->parallel_scan != NULL)
2006  {
2007  /* allow the next page to be processed by a parallel worker */
2008  _bt_parallel_release(scan, opaque->btpo_next);
2009  }
2010 
2011  /* nope, keep going */
2012  if (scan->parallel_scan != NULL)
2013  {
2014  _bt_relbuf(rel, so->currPos.buf);
2015  status = _bt_parallel_seize(scan, &blkno);
2016  if (!status)
2017  {
2018  BTScanPosInvalidate(so->currPos);
2019  return false;
2020  }
2021  }
2022  else
2023  {
2024  blkno = opaque->btpo_next;
2025  _bt_relbuf(rel, so->currPos.buf);
2026  }
2027  }
2028  }
2029  else
2030  {
2031  /*
2032  * Should only happen in parallel cases, when some other backend
2033  * advanced the scan.
2034  */
2035  if (so->currPos.currPage != blkno)
2036  {
2037  BTScanPosUnpinIfPinned(so->currPos);
2038  so->currPos.currPage = blkno;
2039  }
2040 
2041  /*
2042  * Walk left to the next page with data. This is much more complex
2043  * than the walk-right case because of the possibility that the page
2044  * to our left splits while we are in flight to it, plus the
2045  * possibility that the page we were on gets deleted after we leave
2046  * it. See nbtree/README for details.
2047  *
2048  * It might be possible to rearrange this code to have less overhead
2049  * in pinning and locking, but that would require capturing the left
2050  * pointer when the page is initially read, and using it here, along
2051  * with big changes to _bt_walk_left() and the code below. It is not
2052  * clear whether this would be a win, since if the page immediately to
2053  * the left splits after we read this page and before we step left, we
2054  * would need to visit more pages than with the current code.
2055  *
2056  * Note that if we change the code so that we drop the pin for a scan
2057  * which uses a non-MVCC snapshot, we will need to modify the code for
2058  * walking left, to allow for the possibility that a referenced page
2059  * has been deleted. As long as the buffer is pinned or the snapshot
2060  * is MVCC the page cannot move past the half-dead state to fully
2061  * deleted.
2062  */
2063  if (BTScanPosIsPinned(so->currPos))
2064  LockBuffer(so->currPos.buf, BT_READ);
2065  else
2066  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2067 
2068  for (;;)
2069  {
2070  /* Done if we know there are no matching keys to the left */
2071  if (!so->currPos.moreLeft)
2072  {
2073  _bt_relbuf(rel, so->currPos.buf);
2074  _bt_parallel_done(scan);
2075  BTScanPosInvalidate(so->currPos);
2076  return false;
2077  }
2078 
2079  /* Step to next physical page */
2080  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
2081  scan->xs_snapshot);
2082 
2083  /* if we're physically at end of index, return failure */
2084  if (so->currPos.buf == InvalidBuffer)
2085  {
2086  _bt_parallel_done(scan);
2087  BTScanPosInvalidate(so->currPos);
2088  return false;
2089  }
2090 
2091  /*
2092  * Okay, we managed to move left to a non-deleted page. Done if
2093  * it's not half-dead and contains matching tuples. Else loop back
2094  * and do it all again.
2095  */
2096  page = BufferGetPage(so->currPos.buf);
2097  TestForOldSnapshot(scan->xs_snapshot, rel, page);
2098  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2099  if (!P_IGNORE(opaque))
2100  {
2101  PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2102  /* see if there are any matches on this page */
2103  /* note that this will clear moreLeft if we can stop */
2104  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
2105  break;
2106  }
2107  else if (scan->parallel_scan != NULL)
2108  {
2109  /* allow the next page to be processed by a parallel worker */
2110  _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2111  }
2112 
2113  /*
2114  * For parallel scans, get the last page scanned as it is quite
2115  * possible that by the time we try to seize the scan, some other
2116  * worker has already advanced the scan to a different page. We
2117  * must continue based on the latest page scanned by any worker.
2118  */
2119  if (scan->parallel_scan != NULL)
2120  {
2121  _bt_relbuf(rel, so->currPos.buf);
2122  status = _bt_parallel_seize(scan, &blkno);
2123  if (!status)
2124  {
2125  BTScanPosInvalidate(so->currPos);
2126  return false;
2127  }
2128  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2129  }
2130  }
2131  }
2132 
2133  return true;
2134 }
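
The seize/release calls in _bt_readnextpage() follow a simple protocol: one worker at a time obtains the shared "next page" value, reads that page, and then publishes its successor for the others. The sketch below captures only that shape, using a pthread mutex around a shared block number; the real coordination lives in shared memory behind _bt_parallel_seize() and _bt_parallel_release(), which additionally handle waiting and scan-done states:

#include <pthread.h>
#include <stdbool.h>

typedef struct sketch_pscan
{
	pthread_mutex_t lock;
	unsigned	next_page;	/* next page for some worker to read */
	bool		done;		/* no more pages in the scan direction */
} sketch_pscan;

/* Claim the next page to read, or return false if the scan is finished. */
static bool
sketch_seize(sketch_pscan *ps, unsigned *page)
{
	bool	ok;

	pthread_mutex_lock(&ps->lock);
	ok = !ps->done;
	if (ok)
		*page = ps->next_page;
	pthread_mutex_unlock(&ps->lock);
	return ok;
}

/* After reading a page, publish its successor for the other workers. */
static void
sketch_release(sketch_pscan *ps, unsigned successor_page)
{
	pthread_mutex_lock(&ps->lock);
	ps->next_page = successor_page;
	pthread_mutex_unlock(&ps->lock);
}
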
2135 
2136 /*
2137  * _bt_parallel_readpage() -- Read current page containing valid data for scan
2138  *
2139  * On success, we release the lock, and maybe the pin, on the buffer. We
2140  * return true to indicate success.
2141  */
2142 static bool
2143 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2144 {
2145  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2146 
2147  _bt_initialize_more_data(so, dir);
2148 
2149  if (!_bt_readnextpage(scan, blkno, dir))
2150  return false;
2151 
2152  /* Drop the lock, and maybe the pin, on the current page */
2153  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2154 
2155  return true;
2156 }
2157 
2158 /*
2159  * _bt_walk_left() -- step left one page, if possible
2160  *
2161  * The given buffer must be pinned and read-locked. This will be dropped
2162  * before stepping left. On return, we have pin and read lock on the
2163  * returned page, instead.
2164  *
2165  * Returns InvalidBuffer if there is no page to the left (no lock is held
2166  * in that case).
2167  *
2168  * When working on a non-leaf level, it is possible for the returned page
2169  * to be half-dead; the caller should check that condition and step left
2170  * again if it's important.
2171  */
2172 static Buffer
2173 _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
2174 {
2175  Page page;
2176  BTPageOpaque opaque;
2177 
2178  page = BufferGetPage(buf);
2179  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2180 
2181  for (;;)
2182  {
2183  BlockNumber obknum;
2184  BlockNumber lblkno;
2185  BlockNumber blkno;
2186  int tries;
2187 
2188  /* if we're at end of tree, release buf and return failure */
2189  if (P_LEFTMOST(opaque))
2190  {
2191  _bt_relbuf(rel, buf);
2192  break;
2193  }
2194  /* remember original page we are stepping left from */
2195  obknum = BufferGetBlockNumber(buf);
2196  /* step left */
2197  blkno = lblkno = opaque->btpo_prev;
2198  _bt_relbuf(rel, buf);
2199  /* check for interrupts while we're not holding any buffer lock */
2200  CHECK_FOR_INTERRUPTS();
2201  buf = _bt_getbuf(rel, blkno, BT_READ);
2202  page = BufferGetPage(buf);
2203  TestForOldSnapshot(snapshot, rel, page);
2204  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2205 
2206  /*
2207  * If this isn't the page we want, walk right till we find what we
2208  * want --- but go no more than four hops (an arbitrary limit). If we
2209  * don't find the correct page by then, the most likely bet is that
2210  * the original page got deleted and isn't in the sibling chain at all
2211  * anymore, not that its left sibling got split more than four times.
2212  *
2213  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2214  * because half-dead pages are still in the sibling chain. Caller
2215  * must reject half-dead pages if wanted.
2216  */
2217  tries = 0;
2218  for (;;)
2219  {
2220  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2221  {
2222  /* Found desired page, return it */
2223  return buf;
2224  }
2225  if (P_RIGHTMOST(opaque) || ++tries > 4)
2226  break;
2227  blkno = opaque->btpo_next;
2228  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2229  page = BufferGetPage(buf);
2230  TestForOldSnapshot(snapshot, rel, page);
2231  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2232  }
2233 
2234  /* Return to the original page to see what's up */
2235  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2236  page = BufferGetPage(buf);
2237  TestForOldSnapshot(snapshot, rel, page);
2238  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2239  if (P_ISDELETED(opaque))
2240  {
2241  /*
2242  * It was deleted. Move right to first nondeleted page (there
2243  * must be one); that is the page that has acquired the deleted
2244  * one's keyspace, so stepping left from it will take us where we
2245  * want to be.
2246  */
2247  for (;;)
2248  {
2249  if (P_RIGHTMOST(opaque))
2250  elog(ERROR, "fell off the end of index \"%s\"",
2251  RelationGetRelationName(rel));
2252  blkno = opaque->btpo_next;
2253  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2254  page = BufferGetPage(buf);
2255  TestForOldSnapshot(snapshot, rel, page);
2256  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2257  if (!P_ISDELETED(opaque))
2258  break;
2259  }
2260 
2261  /*
2262  * Now return to top of loop, resetting obknum to point to this
2263  * nondeleted page, and try again.
2264  */
2265  }
2266  else
2267  {
2268  /*
2269  * It wasn't deleted; the explanation had better be that the page
2270  * to the left got split or deleted. Without this check, we'd go
2271  * into an infinite loop if there's anything wrong.
2272  */
2273  if (opaque->btpo_prev == lblkno)
2274  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2275  obknum, RelationGetRelationName(rel));
2276  /* Okay to try again with new lblkno value */
2277  }
2278  }
2279 
2280  return InvalidBuffer;
2281 }
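
The recovery move at the heart of _bt_walk_left() is: follow the possibly-stale left link, then chase right links until some page's right link points back at the page we started from, giving up after a handful of hops. A stripped-down sketch over an ordinary in-memory doubly linked list, ignoring all locking and the deleted/half-dead page cases the real code must handle:

#include <stddef.h>

typedef struct sketch_node
{
	struct sketch_node *left;
	struct sketch_node *right;
} sketch_node;

static sketch_node *
sketch_step_left(sketch_node *orig)
{
	sketch_node *cand = orig->left;	/* may be stale if that node split */
	int		tries = 0;

	while (cand != NULL)
	{
		if (cand->right == orig)
			return cand;		/* found the true left sibling */
		if (++tries > 4)
			break;			/* arbitrary cap, as in _bt_walk_left() */
		cand = cand->right;
	}
	return NULL;				/* caller must restart from orig */
}
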
2282 
2283 /*
2284  * _bt_get_endpoint() -- Find the first or last page on a given tree level
2285  *
2286  * If the index is empty, we will return InvalidBuffer; any other failure
2287  * condition causes ereport(). We will not return a dead page.
2288  *
2289  * The returned buffer is pinned and read-locked.
2290  */
2291 Buffer
2292 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
2293  Snapshot snapshot)
2294 {
2295  Buffer buf;
2296  Page page;
2297  BTPageOpaque opaque;
2298  OffsetNumber offnum;
2299  BlockNumber blkno;
2300  IndexTuple itup;
2301 
2302  /*
2303  * If we are looking for a leaf page, okay to descend from fast root;
2304  * otherwise better descend from true root. (There is no point in being
2305  * smarter about intermediate levels.)
2306  */
2307  if (level == 0)
2308  buf = _bt_getroot(rel, BT_READ);
2309  else
2310  buf = _bt_gettrueroot(rel);
2311 
2312  if (!BufferIsValid(buf))
2313  return InvalidBuffer;
2314 
2315  page = BufferGetPage(buf);
2316  TestForOldSnapshot(snapshot, rel, page);
2317  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2318 
2319  for (;;)
2320  {
2321  /*
2322  * If we landed on a deleted page, step right to find a live page
2323  * (there must be one). Also, if we want the rightmost page, step
2324  * right if needed to get to it (this could happen if the page split
2325  * since we obtained a pointer to it).
2326  */
2327  while (P_IGNORE(opaque) ||
2328  (rightmost && !P_RIGHTMOST(opaque)))
2329  {
2330  blkno = opaque->btpo_next;
2331  if (blkno == P_NONE)
2332  elog(ERROR, "fell off the end of index \"%s\"",
2333  RelationGetRelationName(rel));
2334  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2335  page = BufferGetPage(buf);
2336  TestForOldSnapshot(snapshot, rel, page);
2337  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2338  }
2339 
2340  /* Done? */
2341  if (opaque->btpo.level == level)
2342  break;
2343  if (opaque->btpo.level < level)
2344  ereport(ERROR,
2345  (errcode(ERRCODE_INDEX_CORRUPTED),
2346  errmsg_internal("btree level %u not found in index \"%s\"",
2347  level, RelationGetRelationName(rel))));
2348 
2349  /* Descend to leftmost or rightmost child page */
2350  if (rightmost)
2351  offnum = PageGetMaxOffsetNumber(page);
2352  else
2353  offnum = P_FIRSTDATAKEY(opaque);
2354 
2355  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2356  blkno = BTreeTupleGetDownLink(itup);
2357 
2358  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2359  page = BufferGetPage(buf);
2360  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2361  }
2362 
2363  return buf;
2364 }
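
Stripped of buffer locking and the step-right handling for concurrent splits and deletions, _bt_get_endpoint() is a plain descent that follows the first or last downlink at each level until it reaches the requested one. A toy sketch with a stand-in tree type:

#include <stdbool.h>

typedef struct sketch_page
{
	unsigned	level;		/* 0 at the leaf level */
	int			nchildren;
	struct sketch_page **children;	/* downlinks; unused on leaf pages */
} sketch_page;

static sketch_page *
sketch_get_endpoint(sketch_page *root, unsigned level, bool rightmost)
{
	sketch_page *p = root;

	/* descend, taking the leftmost or rightmost downlink each time */
	while (p->level > level)
		p = p->children[rightmost ? p->nchildren - 1 : 0];
	return p;
}
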
2365 
2366 /*
2367  * _bt_endpoint() -- Find the first or last page in the index, and scan
2368  * from there to the first key satisfying all the quals.
2369  *
2370  * This is used by _bt_first() to set up a scan when we've determined
2371  * that the scan must start at the beginning or end of the index (for
2372  * a forward or backward scan respectively). Exit conditions are the
2373  * same as for _bt_first().
2374  */
2375 static bool
2376 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2377 {
2378  Relation rel = scan->indexRelation;
2379  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2380  Buffer buf;
2381  Page page;
2382  BTPageOpaque opaque;
2383  OffsetNumber start;
2384  BTScanPosItem *currItem;
2385 
2386  /*
2387  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2388  * version of _bt_search(). We don't maintain a stack since we know we
2389  * won't need it.
2390  */
2391  buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
2392 
2393  if (!BufferIsValid(buf))
2394  {
2395  /*
2396  * Empty index. Lock the whole relation, as nothing finer to lock
2397  * exists.
2398  */
2399  PredicateLockRelation(rel, scan->xs_snapshot);
2400  BTScanPosInvalidate(so->currPos);
2401  return false;
2402  }
2403 
2404  PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2405  page = BufferGetPage(buf);
2406  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2407  Assert(P_ISLEAF(opaque));
2408 
2409  if (ScanDirectionIsForward(dir))
2410  {
2411  /* There could be dead pages to the left, so not this: */
2412  /* Assert(P_LEFTMOST(opaque)); */
2413 
2414  start = P_FIRSTDATAKEY(opaque);
2415  }
2416  else if (ScanDirectionIsBackward(dir))
2417  {
2418  Assert(P_RIGHTMOST(opaque));
2419 
2420  start = PageGetMaxOffsetNumber(page);
2421  }
2422  else
2423  {
2424  elog(ERROR, "invalid scan direction: %d", (int) dir);
2425  start = 0; /* keep compiler quiet */
2426  }
2427 
2428  /* remember which buffer we have pinned */
2429  so->currPos.buf = buf;
2430 
2431  _bt_initialize_more_data(so, dir);
2432 
2433  /*
2434  * Now load data from the first page of the scan.
2435  */
2436  if (!_bt_readpage(scan, dir, start))
2437  {
2438  /*
2439  * There's no actually-matching data on this page. Try to advance to
2440  * the next page. Return false if there's no matching data at all.
2441  */
2442  LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
2443  if (!_bt_steppage(scan, dir))
2444  return false;
2445  }
2446  else
2447  {
2448  /* Drop the lock, and maybe the pin, on the current page */
2449  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2450  }
2451 
2452  /* OK, itemIndex says what to return */
2453  currItem = &so->currPos.items[so->currPos.itemIndex];
2454  scan->xs_heaptid = currItem->heapTid;
2455  if (scan->xs_want_itup)
2456  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2457 
2458  return true;
2459 }
2460 
2461 /*
2462  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2463  * for scan direction
2464  */
2465 static inline void
2466 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2467 {
2468  /* initialize moreLeft/moreRight appropriately for scan direction */
2469  if (ScanDirectionIsForward(dir))
2470  {
2471  so->currPos.moreLeft = false;
2472  so->currPos.moreRight = true;
2473  }
2474  else
2475  {
2476  so->currPos.moreLeft = true;
2477  so->currPos.moreRight = false;
2478  }
2479  so->numKilled = 0; /* just paranoia */
2480  so->markItemIndex = -1; /* ditto */
2481 }