1 /*-------------------------------------------------------------------------
2  *
3  * nbtsearch.c
4  * Search code for postgres btrees.
5  *
6  *
7  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 
26 
27 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 static int _bt_binsrch_posting(BTScanInsert key, Page page,
30  OffsetNumber offnum);
31 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
32  OffsetNumber offnum);
33 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
34  OffsetNumber offnum, IndexTuple itup);
35 static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
36  OffsetNumber offnum, ItemPointer heapTid,
37  IndexTuple itup);
38 static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
39  OffsetNumber offnum,
40  ItemPointer heapTid, int tupleOffset);
41 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
42 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
43 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
44  ScanDirection dir);
45 static Buffer _bt_walk_left(Relation rel, Relation heaprel, Buffer buf,
46  Snapshot snapshot);
47 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
48 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
49 
50 
51 /*
52  * _bt_drop_lock_and_maybe_pin()
53  *
54  * Unlock the buffer; and if it is safe to release the pin, do that, too.
55  * This will prevent vacuum from stalling in a blocked state trying to read a
56  * page when a cursor is sitting on it.
57  *
58  * See nbtree/README section on making concurrent TID recycling safe.
59  */
60 static void
61 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
62 {
63  _bt_unlockbuf(scan->indexRelation, sp->buf);
64 
65  if (IsMVCCSnapshot(scan->xs_snapshot) &&
66  RelationNeedsWAL(scan->indexRelation) &&
67  !scan->xs_want_itup)
68  {
69  ReleaseBuffer(sp->buf);
70  sp->buf = InvalidBuffer;
71  }
72 }
73 
74 /*
75  * _bt_search() -- Search the tree for a particular scankey,
76  * or more precisely for the first leaf page it could be on.
77  *
78  * The passed scankey is an insertion-type scankey (see nbtree/README),
79  * but it can omit the rightmost column(s) of the index.
80  *
81  * Return value is a stack of parent-page pointers (i.e. there is no entry for
82  * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
83  * which is locked and pinned. No locks are held on the parent pages,
84  * however!
85  *
86  * If the snapshot parameter is not NULL, "old snapshot" checking will take
87  * place during the descent through the tree. This is not needed when
88  * positioning for an insert or delete, so NULL is used for those cases.
89  *
90  * The returned buffer is locked according to access parameter. Additionally,
91  * access = BT_WRITE will allow an empty root page to be created and returned.
92  * When access = BT_READ, an empty index will result in *bufP being set to
93  * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
94  * during the search will be finished.
95  */
96 BTStack
97 _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
98  int access, Snapshot snapshot)
99 {
100  BTStack stack_in = NULL;
101  int page_access = BT_READ;
102 
103  /* Get the root page to start with */
104  *bufP = _bt_getroot(rel, heaprel, access);
105 
106  /* If index is empty and access = BT_READ, no root page is created. */
107  if (!BufferIsValid(*bufP))
108  return (BTStack) NULL;
109 
110  /* Loop iterates once per level descended in the tree */
111  for (;;)
112  {
113  Page page;
114  BTPageOpaque opaque;
115  OffsetNumber offnum;
116  ItemId itemid;
117  IndexTuple itup;
118  BlockNumber child;
119  BTStack new_stack;
120 
121  /*
122  * Race -- the page we just grabbed may have split since we read its
123  * downlink in its parent page (or the metapage). If it has, we may
124  * need to move right to its new sibling. Do that.
125  *
126  * In write-mode, allow _bt_moveright to finish any incomplete splits
127  * along the way. Strictly speaking, we'd only need to finish an
128  * incomplete split on the leaf page we're about to insert to, not on
129  * any of the upper levels (internal pages with incomplete splits are
130  * also taken care of in _bt_getstackbuf). But this is a good
131  * opportunity to finish splits of internal pages too.
132  */
133  *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
134  stack_in, page_access, snapshot);
135 
136  /* if this is a leaf page, we're done */
137  page = BufferGetPage(*bufP);
138  opaque = BTPageGetOpaque(page);
139  if (P_ISLEAF(opaque))
140  break;
141 
142  /*
143  * Find the appropriate pivot tuple on this page. Its downlink points
144  * to the child page that we're about to descend to.
145  */
146  offnum = _bt_binsrch(rel, key, *bufP);
147  itemid = PageGetItemId(page, offnum);
148  itup = (IndexTuple) PageGetItem(page, itemid);
149  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
150  child = BTreeTupleGetDownLink(itup);
151 
152  /*
153  * We need to save the location of the pivot tuple we chose in a new
154  * stack entry for this page/level. If caller ends up splitting a
155  * page one level down, it usually ends up inserting a new pivot
156  * tuple/downlink immediately after the location recorded here.
157  */
158  new_stack = (BTStack) palloc(sizeof(BTStackData));
159  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
160  new_stack->bts_offset = offnum;
161  new_stack->bts_parent = stack_in;
162 
163  /*
164  * Page level 1 is the lowest non-leaf level, just above the leaves. So,
165  * if we're on level 1 and were asked to lock the leaf page in write
166  * mode, lock the next page down in write mode, because it must be a leaf.
167  */
168  if (opaque->btpo_level == 1 && access == BT_WRITE)
169  page_access = BT_WRITE;
170 
171  /* drop the read lock on the page, then acquire one on its child */
172  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
173 
174  /* okay, all set to move down a level */
175  stack_in = new_stack;
176  }
177 
178  /*
179  * If we're asked to lock leaf in write mode, but didn't manage to, then
180  * relock. This should only happen when the root page is a leaf page (and
181  * the only page in the index other than the metapage).
182  */
183  if (access == BT_WRITE && page_access == BT_READ)
184  {
185  /* trade in our read lock for a write lock */
186  _bt_unlockbuf(rel, *bufP);
187  _bt_lockbuf(rel, *bufP, BT_WRITE);
188 
189  /*
190  * Race -- the leaf page may have split after we dropped the read lock
191  * but before we acquired a write lock. If it has, we may need to
192  * move right to its new sibling. Do that.
193  */
194  *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE,
195  snapshot);
196  }
197 
198  return stack_in;
199 }
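/*
 * Editor's sketch (not part of nbtsearch.c): the descent loop above,
 * stripped of buffer locking, move-right handling, and the parent stack.
 * At each internal node, binary-search for the first separator > the
 * search key and follow the downlink just before it; stop at a leaf.
 * Toy in-memory node type, not PostgreSQL's on-disk page format.
 */
#ifdef BTSEARCH_EDITOR_SKETCH
typedef struct ToyNode
{
    bool        is_leaf;
    int         nkeys;
    int         keys[16];           /* separator keys, ascending */
    struct ToyNode *child[17];      /* child[i]: subtree with keys < keys[i] */
} ToyNode;

static ToyNode *
toy_descend(ToyNode *root, int key)
{
    ToyNode    *node = root;

    while (!node->is_leaf)
    {
        int         low = 0,
                    high = node->nkeys;

        /* find first separator > key; the child before it covers key */
        while (high > low)
        {
            int         mid = low + (high - low) / 2;

            if (node->keys[mid] <= key)
                low = mid + 1;
            else
                high = mid;
        }
        node = node->child[low];
    }
    return node;                    /* leaf that could contain key */
}
#endif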
200 
201 /*
202  * _bt_moveright() -- move right in the btree if necessary.
203  *
204  * When we follow a pointer to reach a page, it is possible that
205  * the page has changed in the meanwhile. If this happens, we're
206  * guaranteed that the page has "split right" -- that is, that any
207  * data that appeared on the page originally is either on the page
208  * or strictly to the right of it.
209  *
210  * This routine decides whether or not we need to move right in the
211  * tree by examining the high key entry on the page. If that entry is
212  * strictly less than the scankey, or <= the scankey in the
213  * key.nextkey=true case, then we followed the wrong link and we need
214  * to move right.
215  *
216  * The passed insertion-type scankey can omit the rightmost column(s) of the
217  * index. (see nbtree/README)
218  *
219  * When key.nextkey is false (the usual case), we are looking for the first
220  * item >= key. When key.nextkey is true, we are looking for the first item
221  * strictly greater than key.
222  *
223  * If forupdate is true, we will attempt to finish any incomplete splits
224  * that we encounter. This is required when locking a target page for an
225  * insertion, because we don't allow inserting on a page before the split
226  * is completed. 'stack' is only used if forupdate is true.
227  *
228  * On entry, we have the buffer pinned and a lock of the type specified by
229  * 'access'. If we move right, we release the buffer and lock and acquire
230  * the same on the right sibling. Return value is the buffer we stop at.
231  *
232  * If the snapshot parameter is not NULL, "old snapshot" checking will take
233  * place during the descent through the tree. This is not needed when
234  * positioning for an insert or delete, so NULL is used for those cases.
235  */
236 Buffer
237 _bt_moveright(Relation rel,
238  Relation heaprel,
239  BTScanInsert key,
240  Buffer buf,
241  bool forupdate,
242  BTStack stack,
243  int access,
244  Snapshot snapshot)
245 {
246  Page page;
247  BTPageOpaque opaque;
248  int32 cmpval;
249 
250  /*
251  * When nextkey = false (normal case): if the scan key that brought us to
252  * this page is > the high key stored on the page, then the page has split
253  * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
254  * have some duplicates to the right as well as the left, but that's
255  * something that's only ever dealt with on the leaf level, after
256  * _bt_search has found an initial leaf page.)
257  *
258  * When nextkey = true: move right if the scan key is >= page's high key.
259  * (Note that key.scantid cannot be set in this case.)
260  *
261  * The page could even have split more than once, so scan as far as
262  * needed.
263  *
264  * We also have to move right if we followed a link that brought us to a
265  * dead page.
266  */
267  cmpval = key->nextkey ? 0 : 1;
268 
269  for (;;)
270  {
271  page = BufferGetPage(buf);
272  TestForOldSnapshot(snapshot, rel, page);
273  opaque = BTPageGetOpaque(page);
274 
275  if (P_RIGHTMOST(opaque))
276  break;
277 
278  /*
279  * Finish any incomplete splits we encounter along the way.
280  */
281  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
282  {
283  BlockNumber blkno = BufferGetBlockNumber(buf);
284 
285  /* upgrade our lock if necessary */
286  if (access == BT_READ)
287  {
288  _bt_unlockbuf(rel, buf);
289  _bt_lockbuf(rel, buf, BT_WRITE);
290  }
291 
292  if (P_INCOMPLETE_SPLIT(opaque))
293  _bt_finish_split(rel, heaprel, buf, stack);
294  else
295  _bt_relbuf(rel, buf);
296 
297  /* re-acquire the lock in the right mode, and re-check */
298  buf = _bt_getbuf(rel, heaprel, blkno, access);
299  continue;
300  }
301 
302  if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
303  {
304  /* step right one page */
305  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
306  continue;
307  }
308  else
309  break;
310  }
311 
312  if (P_IGNORE(opaque))
313  elog(ERROR, "fell off the end of index \"%s\"",
314  RelationGetRelationName(rel));
315 
316  return buf;
317 }
318 
319 /*
320  * _bt_binsrch() -- Do a binary search for a key on a particular page.
321  *
322  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
323  * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
324  * particular, this means it is possible to return a value 1 greater than the
325  * number of keys on the page, if the scankey is > all keys on the page.)
326  *
327  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
328  * of the last key < given scankey, or last key <= given scankey if nextkey
329  * is true. (Since _bt_compare treats the first data key of such a page as
330  * minus infinity, there will be at least one key < scankey, so the result
331  * always points at one of the keys on the page.) This key indicates the
332  * right place to descend to be sure we find all leaf keys >= given scankey
333  * (or leaf keys > given scankey when nextkey is true).
334  *
335  * This procedure is not responsible for walking right, it just examines
336  * the given page. _bt_binsrch() has no lock or refcount side effects
337  * on the buffer.
338  */
339 static OffsetNumber
340 _bt_binsrch(Relation rel,
341  BTScanInsert key,
342  Buffer buf)
343 {
344  Page page;
345  BTPageOpaque opaque;
346  OffsetNumber low,
347  high;
348  int32 result,
349  cmpval;
350 
351  page = BufferGetPage(buf);
352  opaque = BTPageGetOpaque(page);
353 
354  /* Requesting nextkey semantics while using scantid seems nonsensical */
355  Assert(!key->nextkey || key->scantid == NULL);
356  /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
357  Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
358 
359  low = P_FIRSTDATAKEY(opaque);
360  high = PageGetMaxOffsetNumber(page);
361 
362  /*
363  * If there are no keys on the page, return the first available slot. Note
364  * this covers two cases: the page is really empty (no keys), or it
365  * contains only a high key. The latter case is possible after vacuuming.
366  * This can never happen on an internal page, however, since they are
367  * never empty (an internal page must have children).
368  */
369  if (unlikely(high < low))
370  return low;
371 
372  /*
373  * Binary search to find the first key on the page >= scan key, or first
374  * key > scankey when nextkey is true.
375  *
376  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
377  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
378  *
379  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
380  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
381  *
382  * We can fall out when high == low.
383  */
384  high++; /* establish the loop invariant for high */
385 
386  cmpval = key->nextkey ? 0 : 1; /* select comparison value */
387 
388  while (high > low)
389  {
390  OffsetNumber mid = low + ((high - low) / 2);
391 
392  /* We have low <= mid < high, so mid points at a real slot */
393 
394  result = _bt_compare(rel, key, page, mid);
395 
396  if (result >= cmpval)
397  low = mid + 1;
398  else
399  high = mid;
400  }
401 
402  /*
403  * At this point we have high == low, but be careful: they could point
404  * past the last slot on the page.
405  *
406  * On a leaf page, we always return the first key >= scan key (resp. >
407  * scan key), which could be the last slot + 1.
408  */
409  if (P_ISLEAF(opaque))
410  return low;
411 
412  /*
413  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
414  * There must be one if _bt_compare() is playing by the rules.
415  */
416  Assert(low > P_FIRSTDATAKEY(opaque));
417 
418  return OffsetNumberPrev(low);
419 }
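/*
 * Editor's sketch (not part of nbtsearch.c): the cmpval trick above, on a
 * plain sorted int array.  One loop body serves both "first slot >= key"
 * (nextkey=false, cmpval=1) and "first slot > key" (nextkey=true,
 * cmpval=0), because the branch condition becomes result >= 1 (key > slot)
 * or result >= 0 (key >= slot) respectively.
 */
#ifdef BTSEARCH_EDITOR_SKETCH
static int
toy_binsrch(const int *slots, int nslots, int key, bool nextkey)
{
    int         low = 0;
    int         high = nslots;      /* one past the last slot */
    int         cmpval = nextkey ? 0 : 1;

    while (high > low)
    {
        int         mid = low + (high - low) / 2;
        int         result = (key > slots[mid]) - (key < slots[mid]);

        if (result >= cmpval)
            low = mid + 1;          /* everything before low is "too small" */
        else
            high = mid;
    }
    return low;                     /* may equal nslots: key past all slots */
}
#endif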
420 
421 /*
422  *
423  * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
424  *
425  * Like _bt_binsrch(), but with support for caching the binary search
426  * bounds. Only used during insertion, and only on the leaf page that it
427  * looks like caller will insert tuple on. Exclusive-locked and pinned
428  * leaf page is contained within insertstate.
429  *
430  * Caches the bounds fields in insertstate so that a subsequent call can
431  * reuse the low and strict high bounds of original binary search. Callers
432  * that use these fields directly must be prepared for the case where low
433  * and/or stricthigh are not on the same page (one or both exceed maxoff
434  * for the page). The case where there are no items on the page (high <
435  * low) makes bounds invalid.
436  *
437  * Caller is responsible for invalidating bounds when it modifies the page
438  * before calling here a second time, and for dealing with posting list
439  * tuple matches (callers can use insertstate's postingoff field to
440  * determine which existing heap TID will need to be replaced by a posting
441  * list split).
442  */
443 OffsetNumber
444 _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
445 {
446  BTScanInsert key = insertstate->itup_key;
447  Page page;
448  BTPageOpaque opaque;
449  OffsetNumber low,
450  high,
451  stricthigh;
452  int32 result,
453  cmpval;
454 
455  page = BufferGetPage(insertstate->buf);
456  opaque = BTPageGetOpaque(page);
457 
458  Assert(P_ISLEAF(opaque));
459  Assert(!key->nextkey);
460  Assert(insertstate->postingoff == 0);
461 
462  if (!insertstate->bounds_valid)
463  {
464  /* Start new binary search */
465  low = P_FIRSTDATAKEY(opaque);
466  high = PageGetMaxOffsetNumber(page);
467  }
468  else
469  {
470  /* Restore result of previous binary search against same page */
471  low = insertstate->low;
472  high = insertstate->stricthigh;
473  }
474 
475  /* If there are no keys on the page, return the first available slot */
476  if (unlikely(high < low))
477  {
478  /* Caller can't reuse bounds */
479  insertstate->low = InvalidOffsetNumber;
480  insertstate->stricthigh = InvalidOffsetNumber;
481  insertstate->bounds_valid = false;
482  return low;
483  }
484 
485  /*
486  * Binary search to find the first key on the page >= scan key. (nextkey
487  * is always false when inserting).
488  *
489  * The loop invariant is: all slots before 'low' are < scan key, all slots
490  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491  * maintained to save additional search effort for caller.
492  *
493  * We can fall out when high == low.
494  */
495  if (!insertstate->bounds_valid)
496  high++; /* establish the loop invariant for high */
497  stricthigh = high; /* high initially strictly higher */
498 
499  cmpval = 1; /* !nextkey comparison value */
500 
501  while (high > low)
502  {
503  OffsetNumber mid = low + ((high - low) / 2);
504 
505  /* We have low <= mid < high, so mid points at a real slot */
506 
507  result = _bt_compare(rel, key, page, mid);
508 
509  if (result >= cmpval)
510  low = mid + 1;
511  else
512  {
513  high = mid;
514  if (result != 0)
515  stricthigh = high;
516  }
517 
518  /*
519  * If tuple at offset located by binary search is a posting list whose
520  * TID range overlaps with caller's scantid, perform posting list
521  * binary search to set postingoff for caller. Caller must split the
522  * posting list when postingoff is set. This should happen
523  * infrequently.
524  */
525  if (unlikely(result == 0 && key->scantid != NULL))
526  {
527  /*
528  * postingoff should never be set more than once per leaf page
529  * binary search. That would mean that there are duplicate table
530  * TIDs in the index, which is never okay. Check for that here.
531  */
532  if (insertstate->postingoff != 0)
533  ereport(ERROR,
534  (errcode(ERRCODE_INDEX_CORRUPTED),
535  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
536  ItemPointerGetBlockNumber(key->scantid),
537  ItemPointerGetOffsetNumber(key->scantid),
538  low, stricthigh,
539  BufferGetBlockNumber(insertstate->buf),
540  RelationGetRelationName(rel))));
541 
542  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
543  }
544  }
545 
546  /*
547  * On a leaf page, a binary search always returns the first key >= scan
548  * key (at least in !nextkey case), which could be the last slot + 1. This
549  * is also the lower bound of cached search.
550  *
551  * stricthigh may also be the last slot + 1, which prevents caller from
552  * using bounds directly, but is still useful to us if we're called a
553  * second time with cached bounds (cached low will be < stricthigh when
554  * that happens).
555  */
556  insertstate->low = low;
557  insertstate->stricthigh = stricthigh;
558  insertstate->bounds_valid = true;
559 
560  return low;
561 }
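/*
 * Editor's sketch (not part of nbtsearch.c): the cached-bounds idea in
 * miniature.  The first call searches the whole array and caches
 * [low, stricthigh); a second call against the same, unmodified array
 * restarts from that window, typically doing few or no comparisons.
 * Toy state struct; the real consumers of insertstate's cached bounds
 * live in nbtinsert.c.
 */
#ifdef BTSEARCH_EDITOR_SKETCH
typedef struct ToyInsertState
{
    int         low;
    int         stricthigh;
    bool        bounds_valid;
} ToyInsertState;

static int
toy_binsrch_cached(const int *slots, int nslots, int key,
                   ToyInsertState *state)
{
    int         low,
                high,
                stricthigh;

    if (!state->bounds_valid)
    {
        low = 0;
        high = nslots;              /* strict upper bound for the search */
    }
    else
    {
        low = state->low;           /* resume inside the cached window */
        high = state->stricthigh;
    }
    stricthigh = high;

    while (high > low)
    {
        int         mid = low + (high - low) / 2;

        if (key > slots[mid])
            low = mid + 1;
        else
        {
            high = mid;
            if (key != slots[mid])
                stricthigh = high;  /* slot is strictly greater than key */
        }
    }

    state->low = low;               /* cache bounds for the next call */
    state->stricthigh = stricthigh;
    state->bounds_valid = true;
    return low;                     /* first slot >= key */
}
#endif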
562 
563 /*----------
564  * _bt_binsrch_posting() -- posting list binary search.
565  *
566  * Helper routine for _bt_binsrch_insert().
567  *
568  * Returns offset into posting list where caller's scantid belongs.
569  *----------
570  */
571 static int
572 _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
573 {
574  IndexTuple itup;
575  ItemId itemid;
576  int low,
577  high,
578  mid,
579  res;
580 
581  /*
582  * If this isn't a posting tuple, then the index must be corrupt (if it is
583  * an ordinary non-pivot tuple then there must be an existing tuple with a
584  * heap TID that equals inserter's new heap TID/scantid). Defensively
585  * check that tuple is a posting list tuple whose posting list range
586  * includes caller's scantid.
587  *
588  * (This is also needed because contrib/amcheck's rootdescend option needs
589  * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
590  */
591  itemid = PageGetItemId(page, offnum);
592  itup = (IndexTuple) PageGetItem(page, itemid);
593  if (!BTreeTupleIsPosting(itup))
594  return 0;
595 
596  Assert(key->heapkeyspace && key->allequalimage);
597 
598  /*
599  * In the event that posting list tuple has LP_DEAD bit set, indicate this
600  * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
601  * second call to _bt_binsrch_insert() can take place when its caller has
602  * removed the dead item.
603  */
604  if (ItemIdIsDead(itemid))
605  return -1;
606 
607  /* "high" is past end of posting list for loop invariant */
608  low = 0;
609  high = BTreeTupleGetNPosting(itup);
610  Assert(high >= 2);
611 
612  while (high > low)
613  {
614  mid = low + ((high - low) / 2);
615  res = ItemPointerCompare(key->scantid,
616  BTreeTupleGetPostingN(itup, mid));
617 
618  if (res > 0)
619  low = mid + 1;
620  else if (res < 0)
621  high = mid;
622  else
623  return mid;
624  }
625 
626  /* Exact match not found */
627  return low;
628 }
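/*
 * Editor's sketch (not part of nbtsearch.c): the posting-list search above
 * relies entirely on heap TID ordering.  ItemPointerCompare() orders TIDs
 * by block number first, then by offset within the block; a minimal
 * stand-in looks like this (toy struct, not the real ItemPointerData
 * layout).
 */
#ifdef BTSEARCH_EDITOR_SKETCH
typedef struct ToyTid
{
    unsigned    block;              /* heap block number */
    unsigned    offset;             /* line pointer number within block */
} ToyTid;

static int
toy_tid_compare(ToyTid a, ToyTid b)
{
    if (a.block != b.block)
        return (a.block < b.block) ? -1 : 1;
    if (a.offset != b.offset)
        return (a.offset < b.offset) ? -1 : 1;
    return 0;
}
#endif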
629 
630 /*----------
631  * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
632  *
633  * page/offnum: location of btree item to be compared to.
634  *
635  * This routine returns:
636  * <0 if scankey < tuple at offnum;
637  * 0 if scankey == tuple at offnum;
638  * >0 if scankey > tuple at offnum.
639  *
640  * NULLs in the keys are treated as sortable values. Therefore
641  * "equality" does not necessarily mean that the item should be returned
642  * to the caller as a matching key. Similarly, an insertion scankey
643  * with its scantid set is treated as equal to a posting tuple whose TID
644  * range overlaps with its scantid. There generally won't be a
645  * matching TID in the posting tuple, which the caller must handle
646  * itself (e.g., by splitting the posting list tuple).
647  *
648  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
649  * "minus infinity": this routine will always claim it is less than the
650  * scankey. The actual key value stored is explicitly truncated to 0
651  * attributes (explicitly minus infinity) with version 3+ indexes, but
652  * that isn't relied upon. This allows us to implement the Lehman and
653  * Yao convention that the first down-link pointer is before the first
654  * key. See backend/access/nbtree/README for details.
655  *----------
656  */
657 int32
658 _bt_compare(Relation rel,
659  BTScanInsert key,
660  Page page,
661  OffsetNumber offnum)
662 {
663  TupleDesc itupdesc = RelationGetDescr(rel);
664  BTPageOpaque opaque = BTPageGetOpaque(page);
665  IndexTuple itup;
666  ItemPointer heapTid;
667  ScanKey scankey;
668  int ncmpkey;
669  int ntupatts;
670  int32 result;
671 
672  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
673  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
674  Assert(key->heapkeyspace || key->scantid == NULL);
675 
676  /*
677  * Force result ">" if target item is first data item on an internal page
678  * --- see NOTE above.
679  */
680  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
681  return 1;
682 
683  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
684  ntupatts = BTreeTupleGetNAtts(itup, rel);
685 
686  /*
687  * The scan key is set up with the attribute number associated with each
688  * term in the key. It is important that, if the index is multi-key, the
689  * scan contain the first k key attributes, and that they be in order. If
690  * you think about how multi-key ordering works, you'll understand why
691  * this is.
692  *
693  * We don't test for violation of this condition here, however. The
694  * initial setup for the index scan had better have gotten it right (see
695  * _bt_first).
696  */
697 
698  ncmpkey = Min(ntupatts, key->keysz);
699  Assert(key->heapkeyspace || ncmpkey == key->keysz);
700  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
701  scankey = key->scankeys;
702  for (int i = 1; i <= ncmpkey; i++)
703  {
704  Datum datum;
705  bool isNull;
706 
707  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
708 
709  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
710  {
711  if (isNull)
712  result = 0; /* NULL "=" NULL */
713  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
714  result = -1; /* NULL "<" NOT_NULL */
715  else
716  result = 1; /* NULL ">" NOT_NULL */
717  }
718  else if (isNull) /* key is NOT_NULL and item is NULL */
719  {
720  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
721  result = 1; /* NOT_NULL ">" NULL */
722  else
723  result = -1; /* NOT_NULL "<" NULL */
724  }
725  else
726  {
727  /*
728  * The sk_func needs to be passed the index value as left arg and
729  * the sk_argument as right arg (they might be of different
730  * types). Since it is convenient for callers to think of
731  * _bt_compare as comparing the scankey to the index item, we have
732  * to flip the sign of the comparison result. (Unless it's a DESC
733  * column, in which case we *don't* flip the sign.)
734  */
735  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
736  scankey->sk_collation,
737  datum,
738  scankey->sk_argument));
739 
740  if (!(scankey->sk_flags & SK_BT_DESC))
741  INVERT_COMPARE_RESULT(result);
742  }
743 
744  /* if the keys are unequal, return the difference */
745  if (result != 0)
746  return result;
747 
748  scankey++;
749  }
750 
751  /*
752  * All non-truncated attributes (other than heap TID) were found to be
753  * equal. Treat truncated attributes as minus infinity when scankey has a
754  * key attribute value that would otherwise be compared directly.
755  *
756  * Note: it doesn't matter if ntupatts includes non-key attributes;
757  * scankey won't, so explicitly excluding non-key attributes isn't
758  * necessary.
759  */
760  if (key->keysz > ntupatts)
761  return 1;
762 
763  /*
764  * Use the heap TID attribute and scantid to try to break the tie. The
765  * rules are the same as any other key attribute -- only the
766  * representation differs.
767  */
768  heapTid = BTreeTupleGetHeapTID(itup);
769  if (key->scantid == NULL)
770  {
771  /*
772  * Most searches have a scankey that is considered greater than a
773  * truncated pivot tuple if and when the scankey has equal values for
774  * attributes up to and including the least significant untruncated
775  * attribute in tuple.
776  *
777  * For example, if an index has the minimum two attributes (single
778  * user key attribute, plus heap TID attribute), and a page's high key
779  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
780  * will not descend to the page to the left. The search will descend
781  * right instead. The truncated attribute in pivot tuple means that
782  * all non-pivot tuples on the page to the left are strictly < 'foo',
783  * so it isn't necessary to descend left. In other words, search
784  * doesn't have to descend left because it isn't interested in a match
785  * that has a heap TID value of -inf.
786  *
787  * However, some searches (pivotsearch searches) actually require that
788  * we descend left when this happens. -inf is treated as a possible
789  * match for omitted scankey attribute(s). This is needed by page
790  * deletion, which must re-find leaf pages that are targets for
791  * deletion using their high keys.
792  *
793  * Note: the heap TID part of the test ensures that scankey is being
794  * compared to a pivot tuple with one or more truncated key
795  * attributes.
796  *
797  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
798  * left here, since they have no heap TID attribute (and cannot have
799  * any -inf key values in any case, since truncation can only remove
800  * non-key attributes). !heapkeyspace searches must always be
801  * prepared to deal with matches on both sides of the pivot once the
802  * leaf level is reached.
803  */
804  if (key->heapkeyspace && !key->pivotsearch &&
805  key->keysz == ntupatts && heapTid == NULL)
806  return 1;
807 
808  /* All provided scankey arguments found to be equal */
809  return 0;
810  }
811 
812  /*
813  * Treat truncated heap TID as minus infinity, since scankey has a key
814  * attribute value (scantid) that would otherwise be compared directly
815  */
816  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
817  if (heapTid == NULL)
818  return 1;
819 
820  /*
821  * Scankey must be treated as equal to a posting list tuple if its scantid
822  * value falls within the range of the posting list. In all other cases
823  * there can only be a single heap TID value, which is compared directly
824  * with scantid.
825  */
826  Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
827  result = ItemPointerCompare(key->scantid, heapTid);
828  if (result <= 0 || !BTreeTupleIsPosting(itup))
829  return result;
830  else
831  {
832  result = ItemPointerCompare(key->scantid,
833  BTreeTupleGetMaxHeapTID(itup));
834  if (result > 0)
835  return 1;
836  }
837 
838  return 0;
839 }
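/*
 * Editor's sketch (not part of nbtsearch.c): _bt_compare()'s per-attribute
 * NULL handling, reduced to a nullable int column.  NULL is just another
 * sortable value placed at whichever end NULLS FIRST/LAST dictates, and
 * the ordinary-datum result is sign-flipped for ASC columns to turn
 * cmp(item, key) into cmp(key, item), mirroring INVERT_COMPARE_RESULT().
 */
#ifdef BTSEARCH_EDITOR_SKETCH
static int
toy_nullable_cmp(bool key_null, int key_val,
                 bool item_null, int item_val,
                 bool nulls_first, bool desc)
{
    int         result;

    if (key_null)
    {
        if (item_null)
            result = 0;             /* NULL "=" NULL */
        else
            result = nulls_first ? -1 : 1;  /* NULL vs NOT_NULL */
    }
    else if (item_null)
        result = nulls_first ? 1 : -1;      /* NOT_NULL vs NULL */
    else
    {
        result = (item_val > key_val) - (item_val < key_val);
        if (!desc)
            result = -result;       /* flip sign unless DESC column */
    }
    return result;                  /* <0, 0, >0: scankey vs index item */
}
#endif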
840 
841 /*
842  * _bt_first() -- Find the first item in a scan.
843  *
844  * We need to be clever about the direction of scan, the search
845  * conditions, and the tree ordering. We find the first item (or,
846  * if backwards scan, the last item) in the tree that satisfies the
847  * qualifications in the scan key. On success exit, the page containing
848  * the current index tuple is pinned but not locked, and data about
849  * the matching tuple(s) on the page has been loaded into so->currPos.
850  * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
851  * and if requested, scan->xs_itup points to a copy of the index tuple.
852  *
853  * If there are no matching items in the index, we return false, with no
854  * pins or locks held.
855  *
856  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
857  * are both search-type scankeys (see nbtree/README for more about this).
858  * Within this routine, we build a temporary insertion-type scankey to use
859  * in locating the scan start position.
860  */
861 bool
862 _bt_first(IndexScanDesc scan, ScanDirection dir)
863 {
864  Relation rel = scan->indexRelation;
865  Relation heaprel = scan->heapRelation;
866  BTScanOpaque so = (BTScanOpaque) scan->opaque;
867  Buffer buf;
868  BTStack stack;
869  OffsetNumber offnum;
870  StrategyNumber strat;
871  bool nextkey;
872  bool goback;
873  BTScanInsertData inskey;
874  ScanKey startKeys[INDEX_MAX_KEYS];
875  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
876  int keysCount = 0;
877  int i;
878  bool status;
879  StrategyNumber strat_total;
880  BTScanPosItem *currItem;
881  BlockNumber blkno;
882 
883  Assert(!BTScanPosIsValid(so->currPos));
884 
885  pgstat_count_index_scan(rel);
886 
887  /*
888  * Examine the scan keys and eliminate any redundant keys; also mark the
889  * keys that must be matched to continue the scan.
890  */
891  _bt_preprocess_keys(scan);
892 
893  /*
894  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
895  * never be satisfied (eg, x == 1 AND x > 2).
896  */
897  if (!so->qual_ok)
898  {
899  /* Notify any other workers that we're done with this scan key. */
900  _bt_parallel_done(scan);
901  return false;
902  }
903 
904  /*
905  * For parallel scans, get the starting page from shared state. If the
906  * scan has not started, proceed to find the first leaf page in the usual
907  * way while keeping other participating processes waiting. If the scan
908  * has already begun, use the page number from the shared structure.
909  */
910  if (scan->parallel_scan != NULL)
911  {
912  status = _bt_parallel_seize(scan, &blkno);
913  if (!status)
914  return false;
915  else if (blkno == P_NONE)
916  {
917  _bt_parallel_done(scan);
918  return false;
919  }
920  else if (blkno != InvalidBlockNumber)
921  {
922  if (!_bt_parallel_readpage(scan, blkno, dir))
923  return false;
924  goto readcomplete;
925  }
926  }
927 
928  /*----------
929  * Examine the scan keys to discover where we need to start the scan.
930  *
931  * We want to identify the keys that can be used as starting boundaries;
932  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
933  * a backwards scan. We can use keys for multiple attributes so long as
934  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
935  * a > or < boundary or find an attribute with no boundary (which can be
936  * thought of as the same as "> -infinity"), we can't use keys for any
937  * attributes to its right, because it would break our simplistic notion
938  * of what initial positioning strategy to use.
939  *
940  * When the scan keys include cross-type operators, _bt_preprocess_keys
941  * may not be able to eliminate redundant keys; in such cases we will
942  * arbitrarily pick a usable one for each attribute. This is correct
943  * but possibly not optimal behavior. (For example, with keys like
944  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
945  * x=5 would be more efficient.) Since the situation only arises given
946  * a poorly-worded query plus an incomplete opfamily, live with it.
947  *
948  * When both equality and inequality keys appear for a single attribute
949  * (again, only possible when cross-type operators appear), we *must*
950  * select one of the equality keys for the starting point, because
951  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
952  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
953  * start at x=4, we will fail and stop before reaching x=10. If multiple
954  * equality quals survive preprocessing, however, it doesn't matter which
955  * one we use --- by definition, they are either redundant or
956  * contradictory.
957  *
958  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
959  * If the index stores nulls at the end of the index we'll be starting
960  * from, and we have no boundary key for the column (which means the key
961  * we deduced NOT NULL from is an inequality key that constrains the other
962  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
963  * use as a boundary key. If we didn't do this, we might find ourselves
964  * traversing a lot of null entries at the start of the scan.
965  *
966  * In this loop, row-comparison keys are treated the same as keys on their
967  * first (leftmost) columns. We'll add on lower-order columns of the row
968  * comparison below, if possible.
969  *
970  * The selected scan keys (at most one per index column) are remembered by
971  * storing their addresses into the local startKeys[] array.
972  *----------
973  */
974  strat_total = BTEqualStrategyNumber;
975  if (so->numberOfKeys > 0)
976  {
977  AttrNumber curattr;
978  ScanKey chosen;
979  ScanKey impliesNN;
980  ScanKey cur;
981 
982  /*
983  * chosen is the so-far-chosen key for the current attribute, if any.
984  * We don't cast the decision in stone until we reach keys for the
985  * next attribute.
986  */
987  curattr = 1;
988  chosen = NULL;
989  /* Also remember any scankey that implies a NOT NULL constraint */
990  impliesNN = NULL;
991 
992  /*
993  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
994  * pass to handle after-last-key processing. Actual exit from the
995  * loop is at one of the "break" statements below.
996  */
997  for (cur = so->keyData, i = 0;; cur++, i++)
998  {
999  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1000  {
1001  /*
1002  * Done looking at keys for curattr. If we didn't find a
1003  * usable boundary key, see if we can deduce a NOT NULL key.
1004  */
1005  if (chosen == NULL && impliesNN != NULL &&
1006  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1007  ScanDirectionIsForward(dir) :
1008  ScanDirectionIsBackward(dir)))
1009  {
1010  /* Yes, so build the key in notnullkeys[keysCount] */
1011  chosen = &notnullkeys[keysCount];
1012  ScanKeyEntryInitialize(chosen,
1013  (SK_SEARCHNOTNULL | SK_ISNULL |
1014  (impliesNN->sk_flags &
1015  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1016  curattr,
1017  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1018  BTGreaterStrategyNumber :
1019  BTLessStrategyNumber),
1020  InvalidOid,
1021  InvalidOid,
1022  InvalidOid,
1023  (Datum) 0);
1024  }
1025 
1026  /*
1027  * If we still didn't find a usable boundary key, quit; else
1028  * save the boundary key pointer in startKeys.
1029  */
1030  if (chosen == NULL)
1031  break;
1032  startKeys[keysCount++] = chosen;
1033 
1034  /*
1035  * Adjust strat_total, and quit if we have stored a > or <
1036  * key.
1037  */
1038  strat = chosen->sk_strategy;
1039  if (strat != BTEqualStrategyNumber)
1040  {
1041  strat_total = strat;
1042  if (strat == BTGreaterStrategyNumber ||
1043  strat == BTLessStrategyNumber)
1044  break;
1045  }
1046 
1047  /*
1048  * Done if that was the last attribute, or if next key is not
1049  * in sequence (implying no boundary key is available for the
1050  * next attribute).
1051  */
1052  if (i >= so->numberOfKeys ||
1053  cur->sk_attno != curattr + 1)
1054  break;
1055 
1056  /*
1057  * Reset for next attr.
1058  */
1059  curattr = cur->sk_attno;
1060  chosen = NULL;
1061  impliesNN = NULL;
1062  }
1063 
1064  /*
1065  * Can we use this key as a starting boundary for this attr?
1066  *
1067  * If not, does it imply a NOT NULL constraint? (Because
1068  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1069  * *any* inequality key works for that; we need not test.)
1070  */
1071  switch (cur->sk_strategy)
1072  {
1073  case BTLessStrategyNumber:
1074  case BTLessEqualStrategyNumber:
1075  if (chosen == NULL)
1076  {
1077  if (ScanDirectionIsBackward(dir))
1078  chosen = cur;
1079  else
1080  impliesNN = cur;
1081  }
1082  break;
1083  case BTEqualStrategyNumber:
1084  /* override any non-equality choice */
1085  chosen = cur;
1086  break;
1087  case BTGreaterEqualStrategyNumber:
1088  case BTGreaterStrategyNumber:
1089  if (chosen == NULL)
1090  {
1091  if (ScanDirectionIsForward(dir))
1092  chosen = cur;
1093  else
1094  impliesNN = cur;
1095  }
1096  break;
1097  }
1098  }
1099  }
1100 
1101  /*
1102  * If we found no usable boundary keys, we have to start from one end of
1103  * the tree. Walk down that edge to the first or last key, and scan from
1104  * there.
1105  */
1106  if (keysCount == 0)
1107  {
1108  bool match;
1109 
1110  match = _bt_endpoint(scan, dir);
1111 
1112  if (!match)
1113  {
1114  /* No match, so mark (parallel) scan finished */
1115  _bt_parallel_done(scan);
1116  }
1117 
1118  return match;
1119  }
1120 
1121  /*
1122  * We want to start the scan somewhere within the index. Set up an
1123  * insertion scankey we can use to search for the boundary point we
1124  * identified above. The insertion scankey is built using the keys
1125  * identified by startKeys[]. (Remaining insertion scankey fields are
1126  * initialized after initial-positioning strategy is finalized.)
1127  */
1128  Assert(keysCount <= INDEX_MAX_KEYS);
1129  for (i = 0; i < keysCount; i++)
1130  {
1131  ScanKey cur = startKeys[i];
1132 
1133  Assert(cur->sk_attno == i + 1);
1134 
1135  if (cur->sk_flags & SK_ROW_HEADER)
1136  {
1137  /*
1138  * Row comparison header: look to the first row member instead.
1139  *
1140  * The member scankeys are already in insertion format (ie, they
1141  * have sk_func = 3-way-comparison function), but we have to watch
1142  * out for nulls, which _bt_preprocess_keys didn't check. A null
1143  * in the first row member makes the condition unmatchable, just
1144  * like qual_ok = false.
1145  */
1146  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1147 
1148  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1149  if (subkey->sk_flags & SK_ISNULL)
1150  {
1151  _bt_parallel_done(scan);
1152  return false;
1153  }
1154  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1155 
1156  /*
1157  * If the row comparison is the last positioning key we accepted,
1158  * try to add additional keys from the lower-order row members.
1159  * (If we accepted independent conditions on additional index
1160  * columns, we use those instead --- doesn't seem worth trying to
1161  * determine which is more restrictive.) Note that this is OK
1162  * even if the row comparison is of ">" or "<" type, because the
1163  * condition applied to all but the last row member is effectively
1164  * ">=" or "<=", and so the extra keys don't break the positioning
1165  * scheme. But, by the same token, if we aren't able to use all
1166  * the row members, then the part of the row comparison that we
1167  * did use has to be treated as just a ">=" or "<=" condition, and
1168  * so we'd better adjust strat_total accordingly.
1169  */
1170  if (i == keysCount - 1)
1171  {
1172  bool used_all_subkeys = false;
1173 
1174  Assert(!(subkey->sk_flags & SK_ROW_END));
1175  for (;;)
1176  {
1177  subkey++;
1178  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1179  if (subkey->sk_attno != keysCount + 1)
1180  break; /* out-of-sequence, can't use it */
1181  if (subkey->sk_strategy != cur->sk_strategy)
1182  break; /* wrong direction, can't use it */
1183  if (subkey->sk_flags & SK_ISNULL)
1184  break; /* can't use null keys */
1185  Assert(keysCount < INDEX_MAX_KEYS);
1186  memcpy(inskey.scankeys + keysCount, subkey,
1187  sizeof(ScanKeyData));
1188  keysCount++;
1189  if (subkey->sk_flags & SK_ROW_END)
1190  {
1191  used_all_subkeys = true;
1192  break;
1193  }
1194  }
1195  if (!used_all_subkeys)
1196  {
1197  switch (strat_total)
1198  {
1199  case BTLessStrategyNumber:
1200  strat_total = BTLessEqualStrategyNumber;
1201  break;
1202  case BTGreaterStrategyNumber:
1203  strat_total = BTGreaterEqualStrategyNumber;
1204  break;
1205  }
1206  }
1207  break; /* done with outer loop */
1208  }
1209  }
1210  else
1211  {
1212  /*
1213  * Ordinary comparison key. Transform the search-style scan key
1214  * to an insertion scan key by replacing the sk_func with the
1215  * appropriate btree comparison function.
1216  *
1217  * If scankey operator is not a cross-type comparison, we can use
1218  * the cached comparison function; otherwise gotta look it up in
1219  * the catalogs. (That can't lead to infinite recursion, since no
1220  * indexscan initiated by syscache lookup will use cross-data-type
1221  * operators.)
1222  *
1223  * We support the convention that sk_subtype == InvalidOid means
1224  * the opclass input type; this is a hack to simplify life for
1225  * ScanKeyInit().
1226  */
1227  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1228  cur->sk_subtype == InvalidOid)
1229  {
1230  FmgrInfo *procinfo;
1231 
1232  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1233  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1234  cur->sk_flags,
1235  cur->sk_attno,
1236  InvalidStrategy,
1237  cur->sk_subtype,
1238  cur->sk_collation,
1239  procinfo,
1240  cur->sk_argument);
1241  }
1242  else
1243  {
1244  RegProcedure cmp_proc;
1245 
1246  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1247  rel->rd_opcintype[i],
1248  cur->sk_subtype,
1249  BTORDER_PROC);
1250  if (!RegProcedureIsValid(cmp_proc))
1251  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1252  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1253  cur->sk_attno, RelationGetRelationName(rel));
1254  ScanKeyEntryInitialize(inskey.scankeys + i,
1255  cur->sk_flags,
1256  cur->sk_attno,
1257  InvalidStrategy,
1258  cur->sk_subtype,
1259  cur->sk_collation,
1260  cmp_proc,
1261  cur->sk_argument);
1262  }
1263  }
1264  }
1265 
1266  /*----------
1267  * Examine the selected initial-positioning strategy to determine exactly
1268  * where we need to start the scan, and set flag variables to control the
1269  * code below.
1270  *
1271  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1272  * item >= scan key. If nextkey = true, they will locate the first
1273  * item > scan key.
1274  *
1275  * If goback = true, we will then step back one item, while if
1276  * goback = false, we will start the scan on the located item.
1277  *----------
1278  */
1279  switch (strat_total)
1280  {
1281  case BTLessStrategyNumber:
1282 
1283  /*
1284  * Find first item >= scankey, then back up one to arrive at last
1285  * item < scankey. (Note: this positioning strategy is only used
1286  * for a backward scan, so that is always the correct starting
1287  * position.)
1288  */
1289  nextkey = false;
1290  goback = true;
1291  break;
1292 
1293  case BTLessEqualStrategyNumber:
1294 
1295  /*
1296  * Find first item > scankey, then back up one to arrive at last
1297  * item <= scankey. (Note: this positioning strategy is only used
1298  * for a backward scan, so that is always the correct starting
1299  * position.)
1300  */
1301  nextkey = true;
1302  goback = true;
1303  break;
1304 
1305  case BTEqualStrategyNumber:
1306 
1307  /*
1308  * If a backward scan was specified, need to start with last equal
1309  * item not first one.
1310  */
1311  if (ScanDirectionIsBackward(dir))
1312  {
1313  /*
1314  * This is the same as the <= strategy. We will check at the
1315  * end whether the found item is actually =.
1316  */
1317  nextkey = true;
1318  goback = true;
1319  }
1320  else
1321  {
1322  /*
1323  * This is the same as the >= strategy. We will check at the
1324  * end whether the found item is actually =.
1325  */
1326  nextkey = false;
1327  goback = false;
1328  }
1329  break;
1330 
1331  case BTGreaterEqualStrategyNumber:
1332 
1333  /*
1334  * Find first item >= scankey. (This is only used for forward
1335  * scans.)
1336  */
1337  nextkey = false;
1338  goback = false;
1339  break;
1340 
1341  case BTGreaterStrategyNumber:
1342 
1343  /*
1344  * Find first item > scankey. (This is only used for forward
1345  * scans.)
1346  */
1347  nextkey = true;
1348  goback = false;
1349  break;
1350 
1351  default:
1352  /* can't get here, but keep compiler quiet */
1353  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1354  return false;
1355  }
1356 
1357  /* Initialize remaining insertion scan key fields */
1358  _bt_metaversion(rel, heaprel, &inskey.heapkeyspace, &inskey.allequalimage);
1359  inskey.anynullkeys = false; /* unused */
1360  inskey.nextkey = nextkey;
1361  inskey.pivotsearch = false;
1362  inskey.scantid = NULL;
1363  inskey.keysz = keysCount;
1364 
1365  /*
1366  * Use the manufactured insertion scan key to descend the tree and
1367  * position ourselves on the target leaf page.
1368  */
1369  stack = _bt_search(rel, heaprel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1370 
1371  /* don't need to keep the stack around... */
1372  _bt_freestack(stack);
1373 
1374  if (!BufferIsValid(buf))
1375  {
1376  /*
1377  * We only get here if the index is completely empty. Lock relation
1378  * because nothing finer to lock exists.
1379  */
1380  PredicateLockRelation(rel, scan->xs_snapshot);
1381 
1382  /*
1383  * mark parallel scan as done, so that all the workers can finish
1384  * their scan
1385  */
1386  _bt_parallel_done(scan);
1387  BTScanPosInvalidate(so->currPos);
1388 
1389  return false;
1390  }
1391  else
1392  PredicateLockPage(rel, BufferGetBlockNumber(buf),
1393  scan->xs_snapshot);
1394 
1395  _bt_initialize_more_data(so, dir);
1396 
1397  /* position to the precise item on the page */
1398  offnum = _bt_binsrch(rel, &inskey, buf);
1399 
1400  /*
1401  * If nextkey = false, we are positioned at the first item >= scan key, or
1402  * possibly at the end of a page on which all the existing items are less
1403  * than the scan key and we know that everything on later pages is greater
1404  * than or equal to scan key.
1405  *
1406  * If nextkey = true, we are positioned at the first item > scan key, or
1407  * possibly at the end of a page on which all the existing items are less
1408  * than or equal to the scan key and we know that everything on later
1409  * pages is greater than scan key.
1410  *
1411  * The actually desired starting point is either this item or the prior
1412  * one, or in the end-of-page case it's the first item on the next page or
1413  * the last item on this page. Adjust the starting offset if needed. (If
1414  * this results in an offset before the first item or after the last one,
1415  * _bt_readpage will report no items found, and then we'll step to the
1416  * next page as needed.)
1417  */
1418  if (goback)
1419  offnum = OffsetNumberPrev(offnum);
1420 
1421  /* remember which buffer we have pinned, if any */
1422  Assert(!BTScanPosIsValid(so->currPos));
1423  so->currPos.buf = buf;
1424 
1425  /*
1426  * Now load data from the first page of the scan.
1427  */
1428  if (!_bt_readpage(scan, dir, offnum))
1429  {
1430  /*
1431  * There's no actually-matching data on this page. Try to advance to
1432  * the next page. Return false if there's no matching data at all.
1433  */
1434  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1435  if (!_bt_steppage(scan, dir))
1436  return false;
1437  }
1438  else
1439  {
1440  /* Drop the lock, and maybe the pin, on the current page */
1441  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1442  }
1443 
1444 readcomplete:
1445  /* OK, itemIndex says what to return */
1446  currItem = &so->currPos.items[so->currPos.itemIndex];
1447  scan->xs_heaptid = currItem->heapTid;
1448  if (scan->xs_want_itup)
1449  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1450 
1451  return true;
1452 }
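/*
 * Editor's summary (not part of nbtsearch.c): the strat_total switch above
 * reduces to this table.  nextkey selects "first item >= key" vs "first
 * item > key"; goback then optionally steps back one item from wherever
 * the descent landed.
 *
 *   strategy          scan dir   nextkey  goback  starting position
 *   ----------------  --------   -------  ------  ----------------------------
 *   <   Less          backward   false    true    last item  <  key
 *   <=  LessEqual     backward   true     true    last item  <= key
 *   =   Equal         backward   true     true    last item  <= key (check =)
 *   =   Equal         forward    false    false   first item >= key (check =)
 *   >=  GreaterEqual  forward    false    false   first item >= key
 *   >   Greater       forward    true     false   first item >  key
 */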
1453 
1454 /*
1455  * _bt_next() -- Get the next item in a scan.
1456  *
1457  * On entry, so->currPos describes the current page, which may be pinned
1458  * but is not locked, and so->currPos.itemIndex identifies which item was
1459  * previously returned.
1460  *
1461  * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1462  * next heap tuple, and if requested, scan->xs_itup points to a copy of
1463  * the index tuple. so->currPos is updated as needed.
1464  *
1465  * On failure exit (no more tuples), we release pin and set
1466  * so->currPos.buf to InvalidBuffer.
1467  */
1468 bool
1469 _bt_next(IndexScanDesc scan, ScanDirection dir)
1470 {
1471  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1472  BTScanPosItem *currItem;
1473 
1474  /*
1475  * Advance to next tuple on current page; or if there's no more, try to
1476  * step to the next page with data.
1477  */
1478  if (ScanDirectionIsForward(dir))
1479  {
1480  if (++so->currPos.itemIndex > so->currPos.lastItem)
1481  {
1482  if (!_bt_steppage(scan, dir))
1483  return false;
1484  }
1485  }
1486  else
1487  {
1488  if (--so->currPos.itemIndex < so->currPos.firstItem)
1489  {
1490  if (!_bt_steppage(scan, dir))
1491  return false;
1492  }
1493  }
1494 
1495  /* OK, itemIndex says what to return */
1496  currItem = &so->currPos.items[so->currPos.itemIndex];
1497  scan->xs_heaptid = currItem->heapTid;
1498  if (scan->xs_want_itup)
1499  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1500 
1501  return true;
1502 }
1503 
1504 /*
1505  * _bt_readpage() -- Load data from current index page into so->currPos
1506  *
1507  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1508  * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1509  * they are updated as appropriate. All other fields of so->currPos are
1510  * initialized from scratch here.
1511  *
1512  * We scan the current page starting at offnum and moving in the indicated
1513  * direction. All items matching the scan keys are loaded into currPos.items.
1514  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1515  * that there can be no more matching tuples in the current scan direction.
1516  *
1517  * In the case of a parallel scan, caller must have called _bt_parallel_seize
1518  * prior to calling this function; this function will invoke
1519  * _bt_parallel_release before returning.
1520  *
1521  * Returns true if any matching items found on the page, false if none.
1522  */
1523 static bool
1524 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1525 {
1526  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1527  Page page;
1528  BTPageOpaque opaque;
1529  OffsetNumber minoff;
1530  OffsetNumber maxoff;
1531  int itemIndex;
1532  bool continuescan;
1533  int indnatts;
1534 
1535  /*
1536  * We must have the buffer pinned and locked, but the usual macro can't be
1537  * used here; this function is what makes it good for currPos.
1538  */
1539  Assert(BufferIsValid(so->currPos.buf));
1540 
1541  page = BufferGetPage(so->currPos.buf);
1542  opaque = BTPageGetOpaque(page);
1543 
1544  /* allow next page to be processed by parallel worker */
1545  if (scan->parallel_scan)
1546  {
1547  if (ScanDirectionIsForward(dir))
1548  _bt_parallel_release(scan, opaque->btpo_next);
1549  else
1550  _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1551  }
1552 
1553  continuescan = true; /* default assumption */
1554  indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1555  minoff = P_FIRSTDATAKEY(opaque);
1556  maxoff = PageGetMaxOffsetNumber(page);
1557 
1558  /*
1559  * We note the buffer's block number so that we can release the pin later.
1560  * This allows us to re-read the buffer if it is needed again for hinting.
1561  */
1562  so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1563 
1564  /*
1565  * We save the LSN of the page as we read it, so that we know whether it
1566  * is safe to apply LP_DEAD hints to the page later. This allows us to
1567  * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
1568  */
1569  so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1570 
1571  /*
1572  * we must save the page's right-link while scanning it; this tells us
1573  * where to step right to after we're done with these items. There is no
1574  * corresponding need for the left-link, since splits always go right.
1575  */
1576  so->currPos.nextPage = opaque->btpo_next;
1577 
1578  /* initialize tuple workspace to empty */
1579  so->currPos.nextTupleOffset = 0;
1580 
1581  /*
1582  * Now that the current page has been made consistent, the macro should be
1583  * good.
1584  */
1585  Assert(BTScanPosIsPinned(so->currPos));
1586 
1587  if (ScanDirectionIsForward(dir))
1588  {
1589  /* load items[] in ascending order */
1590  itemIndex = 0;
1591 
1592  offnum = Max(offnum, minoff);
1593 
1594  while (offnum <= maxoff)
1595  {
1596  ItemId iid = PageGetItemId(page, offnum);
1597  IndexTuple itup;
1598 
1599  /*
1600  * If the scan specifies not to return killed tuples, then we
1601  * treat a killed tuple as not passing the qual
1602  */
1603  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1604  {
1605  offnum = OffsetNumberNext(offnum);
1606  continue;
1607  }
1608 
1609  itup = (IndexTuple) PageGetItem(page, iid);
1610 
1611  if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1612  {
1613  /* tuple passes all scan key conditions */
1614  if (!BTreeTupleIsPosting(itup))
1615  {
1616  /* Remember it */
1617  _bt_saveitem(so, itemIndex, offnum, itup);
1618  itemIndex++;
1619  }
1620  else
1621  {
1622  int tupleOffset;
1623 
1624  /*
1625  * Set up state to return posting list, and remember first
1626  * TID
1627  */
1628  tupleOffset =
1629  _bt_setuppostingitems(so, itemIndex, offnum,
1630  BTreeTupleGetPostingN(itup, 0),
1631  itup);
1632  itemIndex++;
1633  /* Remember additional TIDs */
1634  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1635  {
1636  _bt_savepostingitem(so, itemIndex, offnum,
1637  BTreeTupleGetPostingN(itup, i),
1638  tupleOffset);
1639  itemIndex++;
1640  }
1641  }
1642  }
1643  /* When !continuescan, there can't be any more matches, so stop */
1644  if (!continuescan)
1645  break;
1646 
1647  offnum = OffsetNumberNext(offnum);
1648  }
1649 
1650  /*
1651  * We don't need to visit page to the right when the high key
1652  * indicates that no more matches will be found there.
1653  *
1654  * Checking the high key like this works out more often than you might
1655  * think. Leaf page splits pick a split point between the two most
1656  * dissimilar tuples (this is weighed against the need to evenly share
1657  * free space). Leaf pages with high key attribute values that can
1658  * only appear on non-pivot tuples on the right sibling page are
1659  * common.
1660  */
1661  if (continuescan && !P_RIGHTMOST(opaque))
1662  {
1663  ItemId iid = PageGetItemId(page, P_HIKEY);
1664  IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1665  int truncatt;
1666 
1667  truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1668  _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1669  }
1670 
1671  if (!continuescan)
1672  so->currPos.moreRight = false;
1673 
1674  Assert(itemIndex <= MaxTIDsPerBTreePage);
1675  so->currPos.firstItem = 0;
1676  so->currPos.lastItem = itemIndex - 1;
1677  so->currPos.itemIndex = 0;
1678  }
1679  else
1680  {
1681  /* load items[] in descending order */
1682  itemIndex = MaxTIDsPerBTreePage;
1683 
1684  offnum = Min(offnum, maxoff);
1685 
1686  while (offnum >= minoff)
1687  {
1688  ItemId iid = PageGetItemId(page, offnum);
1689  IndexTuple itup;
1690  bool tuple_alive;
1691  bool passes_quals;
1692 
1693  /*
1694  * If the scan specifies not to return killed tuples, then we
1695  * treat a killed tuple as not passing the qual. Most of the
1696  * time, it's a win to not bother examining the tuple's index
1697  * keys, but just skip to the next tuple (previous, actually,
1698  * since we're scanning backwards). However, if this is the first
1699  * tuple on the page, we do check the index keys, to prevent
1700  * uselessly advancing to the page to the left. This is similar
1701  * to the high key optimization used by forward scans.
1702  */
1703  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1704  {
1705  Assert(offnum >= P_FIRSTDATAKEY(opaque));
1706  if (offnum > P_FIRSTDATAKEY(opaque))
1707  {
1708  offnum = OffsetNumberPrev(offnum);
1709  continue;
1710  }
1711 
1712  tuple_alive = false;
1713  }
1714  else
1715  tuple_alive = true;
1716 
1717  itup = (IndexTuple) PageGetItem(page, iid);
1718 
1719  passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1720  &continuescan);
1721  if (passes_quals && tuple_alive)
1722  {
1723  /* tuple passes all scan key conditions */
1724  if (!BTreeTupleIsPosting(itup))
1725  {
1726  /* Remember it */
1727  itemIndex--;
1728  _bt_saveitem(so, itemIndex, offnum, itup);
1729  }
1730  else
1731  {
1732  int tupleOffset;
1733 
1734  /*
1735  * Set up state to return posting list, and remember first
1736  * TID.
1737  *
1738  * Note that we deliberately save/return items from
1739  * posting lists in ascending heap TID order for backwards
1740  * scans. This allows _bt_killitems() to make a
1741  * consistent assumption about the order of items
1742  * associated with the same posting list tuple.
1743  */
1744  itemIndex--;
1745  tupleOffset =
1746  _bt_setuppostingitems(so, itemIndex, offnum,
1747  BTreeTupleGetPostingN(itup, 0),
1748  itup);
1749  /* Remember additional TIDs */
1750  for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1751  {
1752  itemIndex--;
1753  _bt_savepostingitem(so, itemIndex, offnum,
1754  BTreeTupleGetPostingN(itup, i),
1755  tupleOffset);
1756  }
1757  }
1758  }
1759  if (!continuescan)
1760  {
1761  /* there can't be any more matches, so stop */
1762  so->currPos.moreLeft = false;
1763  break;
1764  }
1765 
1766  offnum = OffsetNumberPrev(offnum);
1767  }
1768 
1769  Assert(itemIndex >= 0);
1770  so->currPos.firstItem = itemIndex;
1771  so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1772  so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1773  }
1774 
1775  return (so->currPos.firstItem <= so->currPos.lastItem);
1776 }
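
The backward-scan branch of _bt_readpage() above fills items[] from the top of the array down, while still placing the TIDs of each posting-list tuple in ascending heap-TID order, as its comment explains. The standalone sketch below (illustrative names and values only, not PostgreSQL code) shows why a reader that consumes items[] from the highest index down still sees each posting list's TIDs ascending:

#include <stdio.h>

#define MAX_ITEMS 16

int
main(void)
{
	int			items[MAX_ITEMS];	/* stand-in for so->currPos.items[] */
	int			itemIndex = MAX_ITEMS;
	int			postings[2][3] = {{10, 11, 12}, {20, 21, 22}};

	/* scan the two "posting tuples" backwards, TIDs within each ascending */
	for (int t = 1; t >= 0; t--)
		for (int i = 0; i < 3; i++)
			items[--itemIndex] = postings[t][i];

	/* a backward scan consumes items[] from the top down */
	for (int i = MAX_ITEMS - 1; i >= itemIndex; i--)
		printf("%d ", items[i]);	/* prints: 20 21 22 10 11 12 */
	printf("\n");
	return 0;
}

The "posting tuples" come out in backward scan order, yet the three TIDs within each come out ascending, which is the invariant _bt_killitems() relies on.
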
1777 
1778 /* Save an index item into so->currPos.items[itemIndex] */
1779 static void
1780 _bt_saveitem(BTScanOpaque so, int itemIndex,
1781  OffsetNumber offnum, IndexTuple itup)
1782 {
1783  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1784 
1785  Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1786 
1787  currItem->heapTid = itup->t_tid;
1788  currItem->indexOffset = offnum;
1789  if (so->currTuples)
1790  {
1791  Size itupsz = IndexTupleSize(itup);
1792 
1793  currItem->tupleOffset = so->currPos.nextTupleOffset;
1794  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1795  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1796  }
1797 }
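
_bt_saveitem() appends each tuple to the so->currTuples workspace at nextTupleOffset, advancing the offset by the MAXALIGN'd size so the next tuple starts on a maximally aligned boundary. A minimal sketch of that append pattern, assuming an 8-byte alignment macro as a stand-in for MAXALIGN (names here are illustrative, not PostgreSQL code):

#include <stdio.h>
#include <string.h>

#define ALIGN8(x) (((x) + 7) & ~(size_t) 7)	/* stand-in for MAXALIGN */

int
main(void)
{
	char		workspace[1024];
	size_t		nextTupleOffset = 0;
	const char *tuples[] = {"abc", "defghij"};

	for (int i = 0; i < 2; i++)
	{
		size_t		sz = strlen(tuples[i]) + 1;
		size_t		off = nextTupleOffset;

		memcpy(workspace + off, tuples[i], sz);
		nextTupleOffset += ALIGN8(sz);	/* keep the next slot aligned */
		printf("tuple %d at offset %zu, next free %zu\n", i, off,
			   nextTupleOffset);
	}
	return 0;
}
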
1798 
1799 /*
1800  * Setup state to save TIDs/items from a single posting list tuple.
1801  *
1802  * Saves an index item into so->currPos.items[itemIndex] for the TID that
1803  * is returned to the scan first. Second and subsequent TIDs from the
1804  * posting list should be saved by calling _bt_savepostingitem().
1805  *
1806  * Returns the offset into tuple storage space at which the main tuple is
1807  * stored, if one is needed.
1808  */
1809 static int
1810 _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1811  ItemPointer heapTid, IndexTuple itup)
1812 {
1813  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1814 
1815  Assert(BTreeTupleIsPosting(itup));
1816 
1817  currItem->heapTid = *heapTid;
1818  currItem->indexOffset = offnum;
1819  if (so->currTuples)
1820  {
1821  /* Save base IndexTuple (truncate posting list) */
1822  IndexTuple base;
1823  Size itupsz = BTreeTupleGetPostingOffset(itup);
1824 
1825  itupsz = MAXALIGN(itupsz);
1826  currItem->tupleOffset = so->currPos.nextTupleOffset;
1827  base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1828  memcpy(base, itup, itupsz);
1829  /* Defensively reduce work area index tuple header size */
1830  base->t_info &= ~INDEX_SIZE_MASK;
1831  base->t_info |= itupsz;
1832  so->currPos.nextTupleOffset += itupsz;
1833 
1834  return currItem->tupleOffset;
1835  }
1836 
1837  return 0;
1838 }
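
The t_info fix-up above relies on the index tuple header packing the tuple size into its low-order bits (INDEX_SIZE_MASK), so truncating away the posting list amounts to clearing the old size and OR-ing in the new MAXALIGN'd one. A minimal sketch of that bit manipulation, assuming the 0x1FFF mask value from itup.h (treat the constant as an assumption here; this is not PostgreSQL code):

#include <stdio.h>

#define SIZE_MASK 0x1FFF		/* assumed value of INDEX_SIZE_MASK */

int
main(void)
{
	unsigned short t_info = 0x8000 | 96;	/* a flag bit set, old size 96 */
	unsigned short newsz = 32;				/* truncated, MAXALIGN'd size */

	t_info &= ~SIZE_MASK;		/* clear the old size, keep flag bits */
	t_info |= newsz;			/* install the smaller size */

	printf("size=%d flags=0x%x\n",
		   t_info & SIZE_MASK, (unsigned) (t_info & ~SIZE_MASK));
	return 0;
}
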
1839 
1840 /*
1841  * Save an index item into so->currPos.items[itemIndex] for the current
1842  * posting tuple.
1843  *
1844  * Assumes that _bt_setuppostingitems() has already been called for the
1845  * current posting list tuple. Caller passes its return value as tupleOffset.
1846  */
1847 static inline void
1848 _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1849  ItemPointer heapTid, int tupleOffset)
1850 {
1851  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1852 
1853  currItem->heapTid = *heapTid;
1854  currItem->indexOffset = offnum;
1855 
1856  /*
1857  * Have index-only scans return the same base IndexTuple for every TID
1858  * that originates from the same posting list
1859  */
1860  if (so->currTuples)
1861  currItem->tupleOffset = tupleOffset;
1862 }
1863 
1864 /*
1865  * _bt_steppage() -- Step to next page containing valid data for scan
1866  *
1867  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1868  * if pinned, we'll drop the pin before moving to the next page. The
1869  * buffer is not locked on entry.
1870  *
1871  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1872  * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1873  * to InvalidBuffer. We return true to indicate success.
1874  */
1875 static bool
1876 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1877 {
1878  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1879  BlockNumber blkno = InvalidBlockNumber;
1880  bool status;
1881 
1882  Assert(BTScanPosIsValid(so->currPos));
1883 
1884  /* Before leaving current page, deal with any killed items */
1885  if (so->numKilled > 0)
1886  _bt_killitems(scan);
1887 
1888  /*
1889  * Before we modify currPos, make a copy of the page data if there was a
1890  * mark position that needs it.
1891  */
1892  if (so->markItemIndex >= 0)
1893  {
1894  /* bump pin on current buffer for assignment to mark buffer */
1895  if (BTScanPosIsPinned(so->currPos))
1896  IncrBufferRefCount(so->currPos.buf);
1897  memcpy(&so->markPos, &so->currPos,
1898  offsetof(BTScanPosData, items[1]) +
1899  so->currPos.lastItem * sizeof(BTScanPosItem));
1900  if (so->markTuples)
1901  memcpy(so->markTuples, so->currTuples,
1902  so->currPos.nextTupleOffset);
1903  so->markPos.itemIndex = so->markItemIndex;
1904  so->markItemIndex = -1;
1905  }
1906 
1907  if (ScanDirectionIsForward(dir))
1908  {
1909  /* Walk right to the next page with data */
1910  if (scan->parallel_scan != NULL)
1911  {
1912  /*
1913  * Seize the scan to get the next block number; if the scan has
1914  * ended already, bail out.
1915  */
1916  status = _bt_parallel_seize(scan, &blkno);
1917  if (!status)
1918  {
1919  /* release the previous buffer, if pinned */
1920  BTScanPosUnpinIfPinned(so->currPos);
1921  BTScanPosInvalidate(so->currPos);
1922  return false;
1923  }
1924  }
1925  else
1926  {
1927  /* Not parallel, so use the previously-saved nextPage link. */
1928  blkno = so->currPos.nextPage;
1929  }
1930 
1931  /* Remember we left a page with data */
1932  so->currPos.moreLeft = true;
1933 
1934  /* release the previous buffer, if pinned */
1935  BTScanPosUnpinIfPinned(so->currPos);
1936  }
1937  else
1938  {
1939  /* Remember we left a page with data */
1940  so->currPos.moreRight = true;
1941 
1942  if (scan->parallel_scan != NULL)
1943  {
1944  /*
1945  * Seize the scan to get the current block number; if the scan has
1946  * ended already, bail out.
1947  */
1948  status = _bt_parallel_seize(scan, &blkno);
1949  BTScanPosUnpinIfPinned(so->currPos);
1950  if (!status)
1951  {
1952  BTScanPosInvalidate(so->currPos);
1953  return false;
1954  }
1955  }
1956  else
1957  {
1958  /* Not parallel, so just use our own notion of the current page */
1959  blkno = so->currPos.currPage;
1960  }
1961  }
1962 
1963  if (!_bt_readnextpage(scan, blkno, dir))
1964  return false;
1965 
1966  /* Drop the lock, and maybe the pin, on the current page */
1967  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1968 
1969  return true;
1970 }
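
The mark-position copy in _bt_steppage() avoids copying the whole fixed-size items[] array: offsetof(BTScanPosData, items[1]) is the size of the struct with exactly one array element counted, so adding lastItem * sizeof(BTScanPosItem) copies only the header plus the items actually in use. A standalone sketch of that idiom (illustrative struct, not PostgreSQL code):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef struct
{
	int			firstItem;
	int			lastItem;
	int			itemIndex;
	int			items[256];		/* stand-in for items[MaxTIDsPerBTreePage] */
} pos_t;

int
main(void)
{
	pos_t		cur = {0, 2, 0, {101, 102, 103}};
	pos_t		mark;

	/* copy the header plus items[0..lastItem], nothing more */
	memcpy(&mark, &cur,
		   offsetof(pos_t, items[1]) + cur.lastItem * sizeof(int));

	printf("%d %d %d\n", mark.items[0], mark.items[1], mark.items[2]);
	return 0;
}
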
1971 
1972 /*
1973  * _bt_readnextpage() -- Read next page containing valid data for scan
1974  *
1975  * On success exit, so->currPos is updated to contain data from the next
1976  * interesting page. Caller is responsible for releasing the lock and
1977  * pin on the buffer on success. We return true to indicate success.
1978  *
1979  * If there are no more matching records in the given direction, we drop all
1980  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1981  */
1982 static bool
1983 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1984 {
1985  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1986  Relation rel;
1987  Page page;
1988  BTPageOpaque opaque;
1989  bool status;
1990 
1991  rel = scan->indexRelation;
1992 
1993  if (ScanDirectionIsForward(dir))
1994  {
1995  for (;;)
1996  {
1997  /*
1998  * if we're at the end of the scan, give up and mark the parallel
1999  * scan as done, so that all the workers can finish their scans
2000  */
2001  if (blkno == P_NONE || !so->currPos.moreRight)
2002  {
2003  _bt_parallel_done(scan);
2004  BTScanPosInvalidate(so->currPos);
2005  return false;
2006  }
2007  /* check for interrupts while we're not holding any buffer lock */
2008  CHECK_FOR_INTERRUPTS();
2009  /* step right one page */
2010  so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, BT_READ);
2011  page = BufferGetPage(so->currPos.buf);
2012  TestForOldSnapshot(scan->xs_snapshot, rel, page);
2013  opaque = BTPageGetOpaque(page);
2014  /* check for deleted page */
2015  if (!P_IGNORE(opaque))
2016  {
2017  PredicateLockPage(rel, blkno, scan->xs_snapshot);
2018  /* see if there are any matches on this page */
2019  /* note that this will clear moreRight if we can stop */
2020  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
2021  break;
2022  }
2023  else if (scan->parallel_scan != NULL)
2024  {
2025  /* allow the next page to be processed by a parallel worker */
2026  _bt_parallel_release(scan, opaque->btpo_next);
2027  }
2028 
2029  /* nope, keep going */
2030  if (scan->parallel_scan != NULL)
2031  {
2032  _bt_relbuf(rel, so->currPos.buf);
2033  status = _bt_parallel_seize(scan, &blkno);
2034  if (!status)
2035  {
2036  BTScanPosInvalidate(so->currPos);
2037  return false;
2038  }
2039  }
2040  else
2041  {
2042  blkno = opaque->btpo_next;
2043  _bt_relbuf(rel, so->currPos.buf);
2044  }
2045  }
2046  }
2047  else
2048  {
2049  /*
2050  * Should only happen in parallel cases, when some other backend
2051  * advanced the scan.
2052  */
2053  if (so->currPos.currPage != blkno)
2054  {
2055  BTScanPosUnpinIfPinned(so->currPos);
2056  so->currPos.currPage = blkno;
2057  }
2058 
2059  /*
2060  * Walk left to the next page with data. This is much more complex
2061  * than the walk-right case because of the possibility that the page
2062  * to our left splits while we are in flight to it, plus the
2063  * possibility that the page we were on gets deleted after we leave
2064  * it. See nbtree/README for details.
2065  *
2066  * It might be possible to rearrange this code to have less overhead
2067  * in pinning and locking, but that would require capturing the left
2068  * pointer when the page is initially read, and using it here, along
2069  * with big changes to _bt_walk_left() and the code below. It is not
2070  * clear whether this would be a win, since if the page immediately to
2071  * the left splits after we read this page and before we step left, we
2072  * would need to visit more pages than with the current code.
2073  *
2074  * Note that if we change the code so that we drop the pin for a scan
2075  * which uses a non-MVCC snapshot, we will need to modify the code for
2076  * walking left, to allow for the possibility that a referenced page
2077  * has been deleted. As long as the buffer is pinned or the snapshot
2078  * is MVCC the page cannot move past the half-dead state to fully
2079  * deleted.
2080  */
2081  if (BTScanPosIsPinned(so->currPos))
2082  _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2083  else
2084  so->currPos.buf = _bt_getbuf(rel, scan->heapRelation,
2085  so->currPos.currPage, BT_READ);
2086 
2087  for (;;)
2088  {
2089  /* Done if we know there are no matching keys to the left */
2090  if (!so->currPos.moreLeft)
2091  {
2092  _bt_relbuf(rel, so->currPos.buf);
2093  _bt_parallel_done(scan);
2094  BTScanPosInvalidate(so->currPos);
2095  return false;
2096  }
2097 
2098  /* Step to next physical page */
2099  so->currPos.buf = _bt_walk_left(rel, scan->heapRelation,
2100  so->currPos.buf, scan->xs_snapshot);
2101 
2102  /* if we're physically at end of index, return failure */
2103  if (so->currPos.buf == InvalidBuffer)
2104  {
2105  _bt_parallel_done(scan);
2106  BTScanPosInvalidate(so->currPos);
2107  return false;
2108  }
2109 
2110  /*
2111  * Okay, we managed to move left to a non-deleted page. Done if
2112  * it's not half-dead and contains matching tuples. Else loop back
2113  * and do it all again.
2114  */
2115  page = BufferGetPage(so->currPos.buf);
2116  TestForOldSnapshot(scan->xs_snapshot, rel, page);
2117  opaque = BTPageGetOpaque(page);
2118  if (!P_IGNORE(opaque))
2119  {
2120  PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2121  /* see if there are any matches on this page */
2122  /* note that this will clear moreLeft if we can stop */
2123  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
2124  break;
2125  }
2126  else if (scan->parallel_scan != NULL)
2127  {
2128  /* allow the next page to be processed by a parallel worker */
2129  _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2130  }
2131 
2132  /*
2133  * For parallel scans, get the last page scanned as it is quite
2134  * possible that by the time we try to seize the scan, some other
2135  * worker has already advanced the scan to a different page. We
2136  * must continue based on the latest page scanned by any worker.
2137  */
2138  if (scan->parallel_scan != NULL)
2139  {
2140  _bt_relbuf(rel, so->currPos.buf);
2141  status = _bt_parallel_seize(scan, &blkno);
2142  if (!status)
2143  {
2144  BTScanPosInvalidate(so->currPos);
2145  return false;
2146  }
2147  so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno,
2148  BT_READ);
2149  }
2150  }
2151  }
2152 
2153  return true;
2154 }
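
The forward branch of _bt_readnextpage() above illustrates the parallel-scan handshake: a worker seizes the scan to learn which block to read, and releases the scan with the following block number before it finishes processing its own page, so another worker can proceed in the meantime. A toy single-process sketch of that seize/release bookkeeping (illustrative only; the real protocol lives in nbtree.c and uses shared memory):

#include <stdio.h>

/* toy shared state: the next page any worker may claim; -1 means done */
static int	shared_next = 1;

static int
seize(void)
{
	return shared_next;			/* real code serializes access here */
}

static void
release(int next_page)
{
	shared_next = next_page;	/* advance before finishing our own page */
}

int
main(void)
{
	int			page;

	while ((page = seize()) != -1)
	{
		release(page < 3 ? page + 1 : -1);	/* let others move on */
		printf("processing page %d\n", page);	/* done after releasing */
	}
	return 0;
}
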
2155 
2156 /*
2157  * _bt_parallel_readpage() -- Read current page containing valid data for scan
2158  *
2159  * On success, releases the lock, and maybe the pin, on the buffer. We
2160  * return true to indicate success.
2161  */
2162 static bool
2163 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2164 {
2165  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2166 
2167  _bt_initialize_more_data(so, dir);
2168 
2169  if (!_bt_readnextpage(scan, blkno, dir))
2170  return false;
2171 
2172  /* Drop the lock, and maybe the pin, on the current page */
2173  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2174 
2175  return true;
2176 }
2177 
2178 /*
2179  * _bt_walk_left() -- step left one page, if possible
2180  *
2181  * The given buffer must be pinned and read-locked. This will be dropped
2182  * before stepping left. On return, we have pin and read lock on the
2183  * returned page, instead.
2184  *
2185  * Returns InvalidBuffer if there is no page to the left (no lock is held
2186  * in that case).
2187  *
2188  * When working on a non-leaf level, it is possible for the returned page
2189  * to be half-dead; the caller should check that condition and step left
2190  * again if it's important.
2191  */
2192 static Buffer
2193 _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot)
2194 {
2195  Page page;
2196  BTPageOpaque opaque;
2197 
2198  page = BufferGetPage(buf);
2199  opaque = BTPageGetOpaque(page);
2200 
2201  for (;;)
2202  {
2203  BlockNumber obknum;
2204  BlockNumber lblkno;
2205  BlockNumber blkno;
2206  int tries;
2207 
2208  /* if we're at end of tree, release buf and return failure */
2209  if (P_LEFTMOST(opaque))
2210  {
2211  _bt_relbuf(rel, buf);
2212  break;
2213  }
2214  /* remember original page we are stepping left from */
2215  obknum = BufferGetBlockNumber(buf);
2216  /* step left */
2217  blkno = lblkno = opaque->btpo_prev;
2218  _bt_relbuf(rel, buf);
2219  /* check for interrupts while we're not holding any buffer lock */
2220  CHECK_FOR_INTERRUPTS();
2221  buf = _bt_getbuf(rel, heaprel, blkno, BT_READ);
2222  page = BufferGetPage(buf);
2223  TestForOldSnapshot(snapshot, rel, page);
2224  opaque = BTPageGetOpaque(page);
2225 
2226  /*
2227  * If this isn't the page we want, walk right till we find what we
2228  * want --- but go no more than four hops (an arbitrary limit). If we
2229  * don't find the correct page by then, the most likely bet is that
2230  * the original page got deleted and isn't in the sibling chain at all
2231  * anymore, not that its left sibling got split more than four times.
2232  *
2233  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2234  * because half-dead pages are still in the sibling chain. Caller
2235  * must reject half-dead pages if wanted.
2236  */
2237  tries = 0;
2238  for (;;)
2239  {
2240  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2241  {
2242  /* Found desired page, return it */
2243  return buf;
2244  }
2245  if (P_RIGHTMOST(opaque) || ++tries > 4)
2246  break;
2247  blkno = opaque->btpo_next;
2248  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2249  page = BufferGetPage(buf);
2250  TestForOldSnapshot(snapshot, rel, page);
2251  opaque = BTPageGetOpaque(page);
2252  }
2253 
2254  /* Return to the original page to see what's up */
2255  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2256  page = BufferGetPage(buf);
2257  TestForOldSnapshot(snapshot, rel, page);
2258  opaque = BTPageGetOpaque(page);
2259  if (P_ISDELETED(opaque))
2260  {
2261  /*
2262  * It was deleted. Move right to first nondeleted page (there
2263  * must be one); that is the page that has acquired the deleted
2264  * one's keyspace, so stepping left from it will take us where we
2265  * want to be.
2266  */
2267  for (;;)
2268  {
2269  if (P_RIGHTMOST(opaque))
2270  elog(ERROR, "fell off the end of index \"%s\"",
2271  RelationGetRelationName(rel));
2272  blkno = opaque->btpo_next;
2273  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2274  page = BufferGetPage(buf);
2275  TestForOldSnapshot(snapshot, rel, page);
2276  opaque = BTPageGetOpaque(page);
2277  if (!P_ISDELETED(opaque))
2278  break;
2279  }
2280 
2281  /*
2282  * Now return to top of loop, resetting obknum to point to this
2283  * nondeleted page, and try again.
2284  */
2285  }
2286  else
2287  {
2288  /*
2289  * It wasn't deleted; the explanation had better be that the page
2290  * to the left got split or deleted. Without this check, we'd go
2291  * into an infinite loop if there's anything wrong.
2292  */
2293  if (opaque->btpo_prev == lblkno)
2294  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2295  obknum, RelationGetRelationName(rel));
2296  /* Okay to try again with new lblkno value */
2297  }
2298  }
2299 
2300  return InvalidBuffer;
2301 }
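
_bt_walk_left() compensates for stale left links: after a concurrent split, the remembered left sibling may no longer be the page whose right link points back at us, so it walks right a bounded number of hops looking for that back-pointer. A toy simulation of the same recovery idea on an in-memory doubly linked list (illustrative only; real pages also need the deleted-page handling shown above):

#include <stdio.h>

typedef struct page
{
	int			id;
	struct page *left;
	struct page *right;
} page;

static page *
walk_left(page *cur)
{
	page	   *p = cur->left;	/* possibly stale after a split */

	for (int tries = 0; p != NULL && tries <= 4; tries++)
	{
		if (p->right == cur)
			return p;			/* found the true left sibling */
		p = p->right;			/* a split inserted pages; walk right */
	}
	return NULL;				/* give up; caller must retry */
}

int
main(void)
{
	/* pages A <-> B <-> C, where C's left link still (stale) points at A */
	page		a = {1, NULL, NULL};
	page		b = {2, NULL, NULL};
	page		c = {3, NULL, NULL};

	a.right = &b;
	b.left = &a;
	b.right = &c;
	c.left = &a;				/* stale: B was split off after C linked */

	page	   *l = walk_left(&c);

	printf("left of %d is %d\n", c.id, l ? l->id : -1);	/* prints 2 */
	return 0;
}
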
2302 
2303 /*
2304  * _bt_get_endpoint() -- Find the first or last page on a given tree level
2305  *
2306  * If the index is empty, we will return InvalidBuffer; any other failure
2307  * condition causes ereport(). We will not return a dead page.
2308  *
2309  * The returned buffer is pinned and read-locked.
2310  */
2311 Buffer
2312 _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost,
2313  Snapshot snapshot)
2314 {
2315  Buffer buf;
2316  Page page;
2317  BTPageOpaque opaque;
2318  OffsetNumber offnum;
2319  BlockNumber blkno;
2320  IndexTuple itup;
2321 
2322  /*
2323  * If we are looking for a leaf page, okay to descend from fast root;
2324  * otherwise better descend from true root. (There is no point in being
2325  * smarter about intermediate levels.)
2326  */
2327  if (level == 0)
2328  buf = _bt_getroot(rel, heaprel, BT_READ);
2329  else
2330  buf = _bt_gettrueroot(rel, heaprel);
2331 
2332  if (!BufferIsValid(buf))
2333  return InvalidBuffer;
2334 
2335  page = BufferGetPage(buf);
2336  TestForOldSnapshot(snapshot, rel, page);
2337  opaque = BTPageGetOpaque(page);
2338 
2339  for (;;)
2340  {
2341  /*
2342  * If we landed on a deleted page, step right to find a live page
2343  * (there must be one). Also, if we want the rightmost page, step
2344  * right if needed to get to it (this could happen if the page split
2345  * since we obtained a pointer to it).
2346  */
2347  while (P_IGNORE(opaque) ||
2348  (rightmost && !P_RIGHTMOST(opaque)))
2349  {
2350  blkno = opaque->btpo_next;
2351  if (blkno == P_NONE)
2352  elog(ERROR, "fell off the end of index \"%s\"",
2353  RelationGetRelationName(rel));
2354  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2355  page = BufferGetPage(buf);
2356  TestForOldSnapshot(snapshot, rel, page);
2357  opaque = BTPageGetOpaque(page);
2358  }
2359 
2360  /* Done? */
2361  if (opaque->btpo_level == level)
2362  break;
2363  if (opaque->btpo_level < level)
2364  ereport(ERROR,
2365  (errcode(ERRCODE_INDEX_CORRUPTED),
2366  errmsg_internal("btree level %u not found in index \"%s\"",
2367  level, RelationGetRelationName(rel))));
2368 
2369  /* Descend to leftmost or rightmost child page */
2370  if (rightmost)
2371  offnum = PageGetMaxOffsetNumber(page);
2372  else
2373  offnum = P_FIRSTDATAKEY(opaque);
2374 
2375  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2376  blkno = BTreeTupleGetDownLink(itup);
2377 
2378  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2379  page = BufferGetPage(buf);
2380  opaque = BTPageGetOpaque(page);
2381  }
2382 
2383  return buf;
2384 }
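
_bt_get_endpoint() reaches the leftmost or rightmost page of a level by repeatedly following the first or last downlink, stepping right past deleted pages and recent splits as needed. Stripped of the buffer locking and the step-right loop, the descent itself reduces to the following sketch (illustrative structures, not PostgreSQL code):

#include <stdio.h>

typedef struct node
{
	int			level;			/* 0 is the leaf level */
	int			nkids;
	struct node *kids[4];
} node;

static node *
endpoint(node *root, int level, int rightmost)
{
	node	   *n = root;

	/* follow the first or last downlink until the target level */
	while (n->level > level)
		n = rightmost ? n->kids[n->nkids - 1] : n->kids[0];
	return n;
}

int
main(void)
{
	node		a = {0, 0, {NULL}};
	node		b = {0, 0, {NULL}};
	node		c = {0, 0, {NULL}};
	node		root = {1, 3, {&a, &b, &c}};

	printf("%s %s\n",
		   endpoint(&root, 0, 0) == &a ? "leftmost=a" : "?",
		   endpoint(&root, 0, 1) == &c ? "rightmost=c" : "?");
	return 0;
}
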
2385 
2386 /*
2387  * _bt_endpoint() -- Find the first or last page in the index, and scan
2388  * from there to the first key satisfying all the quals.
2389  *
2390  * This is used by _bt_first() to set up a scan when we've determined
2391  * that the scan must start at the beginning or end of the index (for
2392  * a forward or backward scan respectively). Exit conditions are the
2393  * same as for _bt_first().
2394  */
2395 static bool
2396 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2397 {
2398  Relation rel = scan->indexRelation;
2399  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2400  Buffer buf;
2401  Page page;
2402  BTPageOpaque opaque;
2403  OffsetNumber start;
2404  BTScanPosItem *currItem;
2405 
2406  /*
2407  * Scan down to the leftmost or rightmost leaf page. This is a simplified
2408  * version of _bt_search(). We don't maintain a stack since we know we
2409  * won't need it.
2410  */
2411  buf = _bt_get_endpoint(rel, scan->heapRelation, 0,
2412  ScanDirectionIsBackward(dir), scan->xs_snapshot);
2413 
2414  if (!BufferIsValid(buf))
2415  {
2416  /*
2417  * Empty index. Lock the whole relation, as nothing finer to lock
2418  * exists.
2419  */
2420  PredicateLockRelation(rel, scan->xs_snapshot);
2421  BTScanPosInvalidate(so->currPos);
2422  return false;
2423  }
2424 
2425  PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2426  page = BufferGetPage(buf);
2427  opaque = BTPageGetOpaque(page);
2428  Assert(P_ISLEAF(opaque));
2429 
2430  if (ScanDirectionIsForward(dir))
2431  {
2432  /* There could be dead pages to the left, so not this: */
2433  /* Assert(P_LEFTMOST(opaque)); */
2434 
2435  start = P_FIRSTDATAKEY(opaque);
2436  }
2437  else if (ScanDirectionIsBackward(dir))
2438  {
2439  Assert(P_RIGHTMOST(opaque));
2440 
2441  start = PageGetMaxOffsetNumber(page);
2442  }
2443  else
2444  {
2445  elog(ERROR, "invalid scan direction: %d", (int) dir);
2446  start = 0; /* keep compiler quiet */
2447  }
2448 
2449  /* remember which buffer we have pinned */
2450  so->currPos.buf = buf;
2451 
2452  _bt_initialize_more_data(so, dir);
2453 
2454  /*
2455  * Now load data from the first page of the scan.
2456  */
2457  if (!_bt_readpage(scan, dir, start))
2458  {
2459  /*
2460  * There's no actually-matching data on this page. Try to advance to
2461  * the next page. Return false if there's no matching data at all.
2462  */
2463  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2464  if (!_bt_steppage(scan, dir))
2465  return false;
2466  }
2467  else
2468  {
2469  /* Drop the lock, and maybe the pin, on the current page */
2470  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2471  }
2472 
2473  /* OK, itemIndex says what to return */
2474  currItem = &so->currPos.items[so->currPos.itemIndex];
2475  scan->xs_heaptid = currItem->heapTid;
2476  if (scan->xs_want_itup)
2477  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2478 
2479  return true;
2480 }
2481 
2482 /*
2483  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2484  * for scan direction
2485  */
2486 static inline void
2487 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2488 {
2489  /* initialize moreLeft/moreRight appropriately for scan direction */
2490  if (ScanDirectionIsForward(dir))
2491  {
2492  so->currPos.moreLeft = false;
2493  so->currPos.moreRight = true;
2494  }
2495  else
2496  {
2497  so->currPos.moreLeft = true;
2498  so->currPos.moreRight = false;
2499  }
2500  so->numKilled = 0; /* just paranoia */
2501  so->markItemIndex = -1; /* ditto */
2502 }