PostgreSQL Source Code  git master
nbtsearch.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtsearch.c
4  * Search code for postgres btrees.
5  *
6  *
7  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 #include "utils/tqual.h"
26 
27 
28 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
29  OffsetNumber offnum);
30 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
31  OffsetNumber offnum, IndexTuple itup);
32 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
33 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
34 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
35  ScanDirection dir);
36 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
37 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40 
41 
42 /*
43  * _bt_drop_lock_and_maybe_pin()
44  *
45  * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46  * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47  * This will prevent vacuum from stalling in a blocked state trying to read a
48  * page when a cursor is sitting on it -- at least in many important cases.
49  *
50  * Set the buffer to invalid if the pin is released, since the buffer may be
51  * re-used. If we need to go back to this block (for example, to apply
52  * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53  * will remain in shared memory for as long as it takes to scan the index
54  * buffer page.
55  */
56 static void
58 {
60 
61  if (IsMVCCSnapshot(scan->xs_snapshot) &&
63  !scan->xs_want_itup)
64  {
65  ReleaseBuffer(sp->buf);
66  sp->buf = InvalidBuffer;
67  }
68 }
69 
70 
71 /*
72  * _bt_search() -- Search the tree for a particular scankey,
73  * or more precisely for the first leaf page it could be on.
74  *
75  * The passed scankey must be an insertion-type scankey (see nbtree/README),
76  * but it can omit the rightmost column(s) of the index.
77  *
78  * When nextkey is false (the usual case), we are looking for the first
79  * item >= scankey. When nextkey is true, we are looking for the first
80  * item strictly greater than scankey.
81  *
82  * Return value is a stack of parent-page pointers. *bufP is set to the
83  * address of the leaf-page buffer, which is read-locked and pinned.
84  * No locks are held on the parent pages, however!
85  *
86  * If the snapshot parameter is not NULL, "old snapshot" checking will take
87  * place during the descent through the tree. This is not needed when
88  * positioning for an insert or delete, so NULL is used for those cases.
89  *
90  * NOTE that the returned buffer is read-locked regardless of the access
91  * parameter. However, access = BT_WRITE will allow an empty root page
92  * to be created and returned. When access = BT_READ, an empty index
93  * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
94  * any incomplete splits encountered during the search will be finished.
95  */
96 BTStack
97 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
98  Buffer *bufP, int access, Snapshot snapshot)
99 {
100  BTStack stack_in = NULL;
101 
102  /* Get the root page to start with */
103  *bufP = _bt_getroot(rel, access);
104 
105  /* If index is empty and access = BT_READ, no root page is created. */
106  if (!BufferIsValid(*bufP))
107  return (BTStack) NULL;
108 
109  /* Loop iterates once per level descended in the tree */
110  for (;;)
111  {
112  Page page;
113  BTPageOpaque opaque;
114  OffsetNumber offnum;
115  ItemId itemid;
116  IndexTuple itup;
117  BlockNumber blkno;
118  BlockNumber par_blkno;
119  BTStack new_stack;
120 
121  /*
122  * Race -- the page we just grabbed may have split since we read its
123  * pointer in the parent (or metapage). If it has, we may need to
124  * move right to its new sibling. Do that.
125  *
126  * In write-mode, allow _bt_moveright to finish any incomplete splits
127  * along the way. Strictly speaking, we'd only need to finish an
128  * incomplete split on the leaf page we're about to insert to, not on
129  * any of the upper levels (they are taken care of in _bt_getstackbuf,
130  * if the leaf page is split and we insert to the parent page). But
131  * this is a good opportunity to finish splits of internal pages too.
132  */
133  *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
134  (access == BT_WRITE), stack_in,
135  BT_READ, snapshot);
136 
137  /* if this is a leaf page, we're done */
138  page = BufferGetPage(*bufP);
139  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
140  if (P_ISLEAF(opaque))
141  break;
142 
143  /*
144  * Find the appropriate item on the internal page, and get the child
145  * page that it points to.
146  */
147  offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
148  itemid = PageGetItemId(page, offnum);
149  itup = (IndexTuple) PageGetItem(page, itemid);
150  blkno = BTreeInnerTupleGetDownLink(itup);
151  par_blkno = BufferGetBlockNumber(*bufP);
152 
153  /*
154  * We need to save the location of the index entry we chose in the
155  * parent page on a stack. In case we split the tree, we'll use the
156  * stack to work back up to the parent page. We also save the actual
157  * downlink (block) to uniquely identify the index entry, in case it
158  * moves right while we're working lower in the tree. See the paper
159  * by Lehman and Yao for how this is detected and handled. (We use the
160  * child link to disambiguate duplicate keys in the index -- Lehman
161  * and Yao disallow duplicate keys.)
162  */
163  new_stack = (BTStack) palloc(sizeof(BTStackData));
164  new_stack->bts_blkno = par_blkno;
165  new_stack->bts_offset = offnum;
166  new_stack->bts_btentry = blkno;
167  new_stack->bts_parent = stack_in;
168 
169  /* drop the read lock on the parent page, acquire one on the child */
170  *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
171 
172  /* okay, all set to move down a level */
173  stack_in = new_stack;
174  }
175 
176  return stack_in;
177 }
178 
179 /*
180  * _bt_moveright() -- move right in the btree if necessary.
181  *
182  * When we follow a pointer to reach a page, it is possible that
183  * the page has changed in the meanwhile. If this happens, we're
184  * guaranteed that the page has "split right" -- that is, that any
185  * data that appeared on the page originally is either on the page
186  * or strictly to the right of it.
187  *
188  * This routine decides whether or not we need to move right in the
189  * tree by examining the high key entry on the page. If that entry
190  * is strictly less than the scankey, or <= the scankey in the nextkey=true
191  * case, then we followed the wrong link and we need to move right.
192  *
193  * The passed scankey must be an insertion-type scankey (see nbtree/README),
194  * but it can omit the rightmost column(s) of the index.
195  *
196  * When nextkey is false (the usual case), we are looking for the first
197  * item >= scankey. When nextkey is true, we are looking for the first
198  * item strictly greater than scankey.
199  *
200  * If forupdate is true, we will attempt to finish any incomplete splits
201  * that we encounter. This is required when locking a target page for an
202  * insertion, because we don't allow inserting on a page before the split
203  * is completed. 'stack' is only used if forupdate is true.
204  *
205  * On entry, we have the buffer pinned and a lock of the type specified by
206  * 'access'. If we move right, we release the buffer and lock and acquire
207  * the same on the right sibling. Return value is the buffer we stop at.
208  *
209  * If the snapshot parameter is not NULL, "old snapshot" checking will take
210  * place during the descent through the tree. This is not needed when
211  * positioning for an insert or delete, so NULL is used for those cases.
212  */
213 Buffer
215  Buffer buf,
216  int keysz,
217  ScanKey scankey,
218  bool nextkey,
219  bool forupdate,
220  BTStack stack,
221  int access,
222  Snapshot snapshot)
223 {
224  Page page;
225  BTPageOpaque opaque;
226  int32 cmpval;
227 
228  /*
229  * When nextkey = false (normal case): if the scan key that brought us to
230  * this page is > the high key stored on the page, then the page has split
231  * and we need to move right. (If the scan key is equal to the high key,
232  * we might or might not need to move right; have to scan the page first
233  * anyway.)
234  *
235  * When nextkey = true: move right if the scan key is >= page's high key.
236  *
237  * The page could even have split more than once, so scan as far as
238  * needed.
239  *
240  * We also have to move right if we followed a link that brought us to a
241  * dead page.
242  */
243  cmpval = nextkey ? 0 : 1;
244 
245  for (;;)
246  {
247  page = BufferGetPage(buf);
248  TestForOldSnapshot(snapshot, rel, page);
249  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
250 
251  if (P_RIGHTMOST(opaque))
252  break;
253 
254  /*
255  * Finish any incomplete splits we encounter along the way.
256  */
257  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
258  {
259  BlockNumber blkno = BufferGetBlockNumber(buf);
260 
261  /* upgrade our lock if necessary */
262  if (access == BT_READ)
263  {
265  LockBuffer(buf, BT_WRITE);
266  }
267 
268  if (P_INCOMPLETE_SPLIT(opaque))
269  _bt_finish_split(rel, buf, stack);
270  else
271  _bt_relbuf(rel, buf);
272 
273  /* re-acquire the lock in the right mode, and re-check */
274  buf = _bt_getbuf(rel, blkno, access);
275  continue;
276  }
277 
278  if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
279  {
280  /* step right one page */
281  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
282  continue;
283  }
284  else
285  break;
286  }
287 
288  if (P_IGNORE(opaque))
289  elog(ERROR, "fell off the end of index \"%s\"",
291 
292  return buf;
293 }
294 
295 /*
296  * _bt_binsrch() -- Do a binary search for a key on a particular page.
297  *
298  * The passed scankey must be an insertion-type scankey (see nbtree/README),
299  * but it can omit the rightmost column(s) of the index.
300  *
301  * When nextkey is false (the usual case), we are looking for the first
302  * item >= scankey. When nextkey is true, we are looking for the first
303  * item strictly greater than scankey.
304  *
305  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
306  * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
307  * particular, this means it is possible to return a value 1 greater than the
308  * number of keys on the page, if the scankey is > all keys on the page.)
309  *
310  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
311  * of the last key < given scankey, or last key <= given scankey if nextkey
312  * is true. (Since _bt_compare treats the first data key of such a page as
313  * minus infinity, there will be at least one key < scankey, so the result
314  * always points at one of the keys on the page.) This key indicates the
315  * right place to descend to be sure we find all leaf keys >= given scankey
316  * (or leaf keys > given scankey when nextkey is true).
317  *
318  * This procedure is not responsible for walking right, it just examines
319  * the given page. _bt_binsrch() has no lock or refcount side effects
320  * on the buffer.
321  */
324  Buffer buf,
325  int keysz,
326  ScanKey scankey,
327  bool nextkey)
328 {
329  Page page;
330  BTPageOpaque opaque;
331  OffsetNumber low,
332  high;
333  int32 result,
334  cmpval;
335 
336  page = BufferGetPage(buf);
337  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
338 
339  low = P_FIRSTDATAKEY(opaque);
340  high = PageGetMaxOffsetNumber(page);
341 
342  /*
343  * If there are no keys on the page, return the first available slot. Note
344  * this covers two cases: the page is really empty (no keys), or it
345  * contains only a high key. The latter case is possible after vacuuming.
346  * This can never happen on an internal page, however, since they are
347  * never empty (an internal page must have children).
348  */
349  if (high < low)
350  return low;
351 
352  /*
353  * Binary search to find the first key on the page >= scan key, or first
354  * key > scankey when nextkey is true.
355  *
356  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
357  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
358  *
359  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
360  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
361  *
362  * We can fall out when high == low.
363  */
364  high++; /* establish the loop invariant for high */
365 
366  cmpval = nextkey ? 0 : 1; /* select comparison value */
367 
368  while (high > low)
369  {
370  OffsetNumber mid = low + ((high - low) / 2);
371 
372  /* We have low <= mid < high, so mid points at a real slot */
373 
374  result = _bt_compare(rel, keysz, scankey, page, mid);
375 
376  if (result >= cmpval)
377  low = mid + 1;
378  else
379  high = mid;
380  }
381 
382  /*
383  * At this point we have high == low, but be careful: they could point
384  * past the last slot on the page.
385  *
386  * On a leaf page, we always return the first key >= scan key (resp. >
387  * scan key), which could be the last slot + 1.
388  */
389  if (P_ISLEAF(opaque))
390  return low;
391 
392  /*
393  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
394  * There must be one if _bt_compare() is playing by the rules.
395  */
396  Assert(low > P_FIRSTDATAKEY(opaque));
397 
398  return OffsetNumberPrev(low);
399 }
400 
401 /*----------
402  * _bt_compare() -- Compare scankey to a particular tuple on the page.
403  *
404  * The passed scankey must be an insertion-type scankey (see nbtree/README),
405  * but it can omit the rightmost column(s) of the index.
406  *
407  * keysz: number of key conditions to be checked (might be less than the
408  * number of index columns!)
409  * page/offnum: location of btree item to be compared to.
410  *
411  * This routine returns:
412  * <0 if scankey < tuple at offnum;
413  * 0 if scankey == tuple at offnum;
414  * >0 if scankey > tuple at offnum.
415  * NULLs in the keys are treated as sortable values. Therefore
416  * "equality" does not necessarily mean that the item should be
417  * returned to the caller as a matching key!
418  *
419  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
420  * "minus infinity": this routine will always claim it is less than the
421  * scankey. The actual key value stored (if any, which there probably isn't)
422  * does not matter. This convention allows us to implement the Lehman and
423  * Yao convention that the first down-link pointer is before the first key.
424  * See backend/access/nbtree/README for details.
425  *----------
426  */
427 int32
429  int keysz,
430  ScanKey scankey,
431  Page page,
432  OffsetNumber offnum)
433 {
434  TupleDesc itupdesc = RelationGetDescr(rel);
436  IndexTuple itup;
437  int i;
438 
439  Assert(_bt_check_natts(rel, page, offnum));
440 
441  /*
442  * Force result ">" if target item is first data item on an internal page
443  * --- see NOTE above.
444  */
445  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
446  return 1;
447 
448  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
449 
450  /*
451  * The scan key is set up with the attribute number associated with each
452  * term in the key. It is important that, if the index is multi-key, the
453  * scan contain the first k key attributes, and that they be in order. If
454  * you think about how multi-key ordering works, you'll understand why
455  * this is.
456  *
457  * We don't test for violation of this condition here, however. The
458  * initial setup for the index scan had better have gotten it right (see
459  * _bt_first).
460  */
461 
462  for (i = 1; i <= keysz; i++)
463  {
464  Datum datum;
465  bool isNull;
466  int32 result;
467 
468  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
469 
470  /* see comments about NULLs handling in btbuild */
471  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
472  {
473  if (isNull)
474  result = 0; /* NULL "=" NULL */
475  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
476  result = -1; /* NULL "<" NOT_NULL */
477  else
478  result = 1; /* NULL ">" NOT_NULL */
479  }
480  else if (isNull) /* key is NOT_NULL and item is NULL */
481  {
482  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
483  result = 1; /* NOT_NULL ">" NULL */
484  else
485  result = -1; /* NOT_NULL "<" NULL */
486  }
487  else
488  {
489  /*
490  * The sk_func needs to be passed the index value as left arg and
491  * the sk_argument as right arg (they might be of different
492  * types). Since it is convenient for callers to think of
493  * _bt_compare as comparing the scankey to the index item, we have
494  * to flip the sign of the comparison result. (Unless it's a DESC
495  * column, in which case we *don't* flip the sign.)
496  */
497  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
498  scankey->sk_collation,
499  datum,
500  scankey->sk_argument));
501 
502  if (!(scankey->sk_flags & SK_BT_DESC))
503  result = -result;
504  }
505 
506  /* if the keys are unequal, return the difference */
507  if (result != 0)
508  return result;
509 
510  scankey++;
511  }
512 
513  /* if we get here, the keys are equal */
514  return 0;
515 }
516 
517 /*
518  * _bt_first() -- Find the first item in a scan.
519  *
520  * We need to be clever about the direction of scan, the search
521  * conditions, and the tree ordering. We find the first item (or,
522  * if backwards scan, the last item) in the tree that satisfies the
523  * qualifications in the scan key. On success exit, the page containing
524  * the current index tuple is pinned but not locked, and data about
525  * the matching tuple(s) on the page has been loaded into so->currPos.
526  * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
527  * and if requested, scan->xs_itup points to a copy of the index tuple.
528  *
529  * If there are no matching items in the index, we return false, with no
530  * pins or locks held.
531  *
532  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
533  * are both search-type scankeys (see nbtree/README for more about this).
534  * Within this routine, we build a temporary insertion-type scankey to use
535  * in locating the scan start position.
536  */
537 bool
539 {
540  Relation rel = scan->indexRelation;
541  BTScanOpaque so = (BTScanOpaque) scan->opaque;
542  Buffer buf;
543  BTStack stack;
544  OffsetNumber offnum;
545  StrategyNumber strat;
546  bool nextkey;
547  bool goback;
548  ScanKey startKeys[INDEX_MAX_KEYS];
549  ScanKeyData scankeys[INDEX_MAX_KEYS];
550  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
551  int keysCount = 0;
552  int i;
553  bool status = true;
554  StrategyNumber strat_total;
555  BTScanPosItem *currItem;
556  BlockNumber blkno;
557 
559 
561 
562  /*
563  * Examine the scan keys and eliminate any redundant keys; also mark the
564  * keys that must be matched to continue the scan.
565  */
566  _bt_preprocess_keys(scan);
567 
568  /*
569  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
570  * never be satisfied (eg, x == 1 AND x > 2).
571  */
572  if (!so->qual_ok)
573  return false;
574 
575  /*
576  * For parallel scans, get the starting page from shared state. If the
577  * scan has not started, proceed to find out first leaf page in the usual
578  * way while keeping other participating processes waiting. If the scan
579  * has already begun, use the page number from the shared structure.
580  */
581  if (scan->parallel_scan != NULL)
582  {
583  status = _bt_parallel_seize(scan, &blkno);
584  if (!status)
585  return false;
586  else if (blkno == P_NONE)
587  {
588  _bt_parallel_done(scan);
589  return false;
590  }
591  else if (blkno != InvalidBlockNumber)
592  {
593  if (!_bt_parallel_readpage(scan, blkno, dir))
594  return false;
595  goto readcomplete;
596  }
597  }
598 
599  /*----------
600  * Examine the scan keys to discover where we need to start the scan.
601  *
602  * We want to identify the keys that can be used as starting boundaries;
603  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
604  * a backwards scan. We can use keys for multiple attributes so long as
605  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
606  * a > or < boundary or find an attribute with no boundary (which can be
607  * thought of as the same as "> -infinity"), we can't use keys for any
608  * attributes to its right, because it would break our simplistic notion
609  * of what initial positioning strategy to use.
610  *
611  * When the scan keys include cross-type operators, _bt_preprocess_keys
612  * may not be able to eliminate redundant keys; in such cases we will
613  * arbitrarily pick a usable one for each attribute. This is correct
614  * but possibly not optimal behavior. (For example, with keys like
615  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
616  * x=5 would be more efficient.) Since the situation only arises given
617  * a poorly-worded query plus an incomplete opfamily, live with it.
618  *
619  * When both equality and inequality keys appear for a single attribute
620  * (again, only possible when cross-type operators appear), we *must*
621  * select one of the equality keys for the starting point, because
622  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
623  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
624  * start at x=4, we will fail and stop before reaching x=10. If multiple
625  * equality quals survive preprocessing, however, it doesn't matter which
626  * one we use --- by definition, they are either redundant or
627  * contradictory.
628  *
629  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
630  * If the index stores nulls at the end of the index we'll be starting
631  * from, and we have no boundary key for the column (which means the key
632  * we deduced NOT NULL from is an inequality key that constrains the other
633  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
634  * use as a boundary key. If we didn't do this, we might find ourselves
635  * traversing a lot of null entries at the start of the scan.
636  *
637  * In this loop, row-comparison keys are treated the same as keys on their
638  * first (leftmost) columns. We'll add on lower-order columns of the row
639  * comparison below, if possible.
640  *
641  * The selected scan keys (at most one per index column) are remembered by
642  * storing their addresses into the local startKeys[] array.
643  *----------
644  */
645  strat_total = BTEqualStrategyNumber;
646  if (so->numberOfKeys > 0)
647  {
648  AttrNumber curattr;
649  ScanKey chosen;
650  ScanKey impliesNN;
651  ScanKey cur;
652 
653  /*
654  * chosen is the so-far-chosen key for the current attribute, if any.
655  * We don't cast the decision in stone until we reach keys for the
656  * next attribute.
657  */
658  curattr = 1;
659  chosen = NULL;
660  /* Also remember any scankey that implies a NOT NULL constraint */
661  impliesNN = NULL;
662 
663  /*
664  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
665  * pass to handle after-last-key processing. Actual exit from the
666  * loop is at one of the "break" statements below.
667  */
668  for (cur = so->keyData, i = 0;; cur++, i++)
669  {
670  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
671  {
672  /*
673  * Done looking at keys for curattr. If we didn't find a
674  * usable boundary key, see if we can deduce a NOT NULL key.
675  */
676  if (chosen == NULL && impliesNN != NULL &&
677  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
680  {
681  /* Yes, so build the key in notnullkeys[keysCount] */
682  chosen = &notnullkeys[keysCount];
683  ScanKeyEntryInitialize(chosen,
685  (impliesNN->sk_flags &
687  curattr,
688  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
691  InvalidOid,
692  InvalidOid,
693  InvalidOid,
694  (Datum) 0);
695  }
696 
697  /*
698  * If we still didn't find a usable boundary key, quit; else
699  * save the boundary key pointer in startKeys.
700  */
701  if (chosen == NULL)
702  break;
703  startKeys[keysCount++] = chosen;
704 
705  /*
706  * Adjust strat_total, and quit if we have stored a > or <
707  * key.
708  */
709  strat = chosen->sk_strategy;
710  if (strat != BTEqualStrategyNumber)
711  {
712  strat_total = strat;
713  if (strat == BTGreaterStrategyNumber ||
714  strat == BTLessStrategyNumber)
715  break;
716  }
717 
718  /*
719  * Done if that was the last attribute, or if next key is not
720  * in sequence (implying no boundary key is available for the
721  * next attribute).
722  */
723  if (i >= so->numberOfKeys ||
724  cur->sk_attno != curattr + 1)
725  break;
726 
727  /*
728  * Reset for next attr.
729  */
730  curattr = cur->sk_attno;
731  chosen = NULL;
732  impliesNN = NULL;
733  }
734 
735  /*
736  * Can we use this key as a starting boundary for this attr?
737  *
738  * If not, does it imply a NOT NULL constraint? (Because
739  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
740  * *any* inequality key works for that; we need not test.)
741  */
742  switch (cur->sk_strategy)
743  {
746  if (chosen == NULL)
747  {
748  if (ScanDirectionIsBackward(dir))
749  chosen = cur;
750  else
751  impliesNN = cur;
752  }
753  break;
755  /* override any non-equality choice */
756  chosen = cur;
757  break;
760  if (chosen == NULL)
761  {
762  if (ScanDirectionIsForward(dir))
763  chosen = cur;
764  else
765  impliesNN = cur;
766  }
767  break;
768  }
769  }
770  }
771 
772  /*
773  * If we found no usable boundary keys, we have to start from one end of
774  * the tree. Walk down that edge to the first or last key, and scan from
775  * there.
776  */
777  if (keysCount == 0)
778  {
779  bool match;
780 
781  match = _bt_endpoint(scan, dir);
782 
783  if (!match)
784  {
785  /* No match, so mark (parallel) scan finished */
786  _bt_parallel_done(scan);
787  }
788 
789  return match;
790  }
791 
792  /*
793  * We want to start the scan somewhere within the index. Set up an
794  * insertion scankey we can use to search for the boundary point we
795  * identified above. The insertion scankey is built in the local
796  * scankeys[] array, using the keys identified by startKeys[].
797  */
798  Assert(keysCount <= INDEX_MAX_KEYS);
799  for (i = 0; i < keysCount; i++)
800  {
801  ScanKey cur = startKeys[i];
802 
803  Assert(cur->sk_attno == i + 1);
804 
805  if (cur->sk_flags & SK_ROW_HEADER)
806  {
807  /*
808  * Row comparison header: look to the first row member instead.
809  *
810  * The member scankeys are already in insertion format (ie, they
811  * have sk_func = 3-way-comparison function), but we have to watch
812  * out for nulls, which _bt_preprocess_keys didn't check. A null
813  * in the first row member makes the condition unmatchable, just
814  * like qual_ok = false.
815  */
816  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
817 
818  Assert(subkey->sk_flags & SK_ROW_MEMBER);
819  if (subkey->sk_flags & SK_ISNULL)
820  {
821  _bt_parallel_done(scan);
822  return false;
823  }
824  memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
825 
826  /*
827  * If the row comparison is the last positioning key we accepted,
828  * try to add additional keys from the lower-order row members.
829  * (If we accepted independent conditions on additional index
830  * columns, we use those instead --- doesn't seem worth trying to
831  * determine which is more restrictive.) Note that this is OK
832  * even if the row comparison is of ">" or "<" type, because the
833  * condition applied to all but the last row member is effectively
834  * ">=" or "<=", and so the extra keys don't break the positioning
835  * scheme. But, by the same token, if we aren't able to use all
836  * the row members, then the part of the row comparison that we
837  * did use has to be treated as just a ">=" or "<=" condition, and
838  * so we'd better adjust strat_total accordingly.
839  */
840  if (i == keysCount - 1)
841  {
842  bool used_all_subkeys = false;
843 
844  Assert(!(subkey->sk_flags & SK_ROW_END));
845  for (;;)
846  {
847  subkey++;
848  Assert(subkey->sk_flags & SK_ROW_MEMBER);
849  if (subkey->sk_attno != keysCount + 1)
850  break; /* out-of-sequence, can't use it */
851  if (subkey->sk_strategy != cur->sk_strategy)
852  break; /* wrong direction, can't use it */
853  if (subkey->sk_flags & SK_ISNULL)
854  break; /* can't use null keys */
855  Assert(keysCount < INDEX_MAX_KEYS);
856  memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
857  keysCount++;
858  if (subkey->sk_flags & SK_ROW_END)
859  {
860  used_all_subkeys = true;
861  break;
862  }
863  }
864  if (!used_all_subkeys)
865  {
866  switch (strat_total)
867  {
869  strat_total = BTLessEqualStrategyNumber;
870  break;
872  strat_total = BTGreaterEqualStrategyNumber;
873  break;
874  }
875  }
876  break; /* done with outer loop */
877  }
878  }
879  else
880  {
881  /*
882  * Ordinary comparison key. Transform the search-style scan key
883  * to an insertion scan key by replacing the sk_func with the
884  * appropriate btree comparison function.
885  *
886  * If scankey operator is not a cross-type comparison, we can use
887  * the cached comparison function; otherwise gotta look it up in
888  * the catalogs. (That can't lead to infinite recursion, since no
889  * indexscan initiated by syscache lookup will use cross-data-type
890  * operators.)
891  *
892  * We support the convention that sk_subtype == InvalidOid means
893  * the opclass input type; this is a hack to simplify life for
894  * ScanKeyInit().
895  */
896  if (cur->sk_subtype == rel->rd_opcintype[i] ||
897  cur->sk_subtype == InvalidOid)
898  {
899  FmgrInfo *procinfo;
900 
901  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
903  cur->sk_flags,
904  cur->sk_attno,
906  cur->sk_subtype,
907  cur->sk_collation,
908  procinfo,
909  cur->sk_argument);
910  }
911  else
912  {
913  RegProcedure cmp_proc;
914 
915  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
916  rel->rd_opcintype[i],
917  cur->sk_subtype,
918  BTORDER_PROC);
919  if (!RegProcedureIsValid(cmp_proc))
920  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
921  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
922  cur->sk_attno, RelationGetRelationName(rel));
923  ScanKeyEntryInitialize(scankeys + i,
924  cur->sk_flags,
925  cur->sk_attno,
927  cur->sk_subtype,
928  cur->sk_collation,
929  cmp_proc,
930  cur->sk_argument);
931  }
932  }
933  }
934 
935  /*----------
936  * Examine the selected initial-positioning strategy to determine exactly
937  * where we need to start the scan, and set flag variables to control the
938  * code below.
939  *
940  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
941  * item >= scan key. If nextkey = true, they will locate the first
942  * item > scan key.
943  *
944  * If goback = true, we will then step back one item, while if
945  * goback = false, we will start the scan on the located item.
946  *----------
947  */
948  switch (strat_total)
949  {
951 
952  /*
953  * Find first item >= scankey, then back up one to arrive at last
954  * item < scankey. (Note: this positioning strategy is only used
955  * for a backward scan, so that is always the correct starting
956  * position.)
957  */
958  nextkey = false;
959  goback = true;
960  break;
961 
963 
964  /*
965  * Find first item > scankey, then back up one to arrive at last
966  * item <= scankey. (Note: this positioning strategy is only used
967  * for a backward scan, so that is always the correct starting
968  * position.)
969  */
970  nextkey = true;
971  goback = true;
972  break;
973 
975 
976  /*
977  * If a backward scan was specified, need to start with last equal
978  * item not first one.
979  */
980  if (ScanDirectionIsBackward(dir))
981  {
982  /*
983  * This is the same as the <= strategy. We will check at the
984  * end whether the found item is actually =.
985  */
986  nextkey = true;
987  goback = true;
988  }
989  else
990  {
991  /*
992  * This is the same as the >= strategy. We will check at the
993  * end whether the found item is actually =.
994  */
995  nextkey = false;
996  goback = false;
997  }
998  break;
999 
1001 
1002  /*
1003  * Find first item >= scankey. (This is only used for forward
1004  * scans.)
1005  */
1006  nextkey = false;
1007  goback = false;
1008  break;
1009 
1011 
1012  /*
1013  * Find first item > scankey. (This is only used for forward
1014  * scans.)
1015  */
1016  nextkey = true;
1017  goback = false;
1018  break;
1019 
1020  default:
1021  /* can't get here, but keep compiler quiet */
1022  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1023  return false;
1024  }
1025 
1026  /*
1027  * Use the manufactured insertion scan key to descend the tree and
1028  * position ourselves on the target leaf page.
1029  */
1030  stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
1031  scan->xs_snapshot);
1032 
1033  /* don't need to keep the stack around... */
1034  _bt_freestack(stack);
1035 
1036  if (!BufferIsValid(buf))
1037  {
1038  /*
1039  * We only get here if the index is completely empty. Lock relation
1040  * because nothing finer to lock exists.
1041  */
1042  PredicateLockRelation(rel, scan->xs_snapshot);
1043 
1044  /*
1045  * mark parallel scan as done, so that all the workers can finish
1046  * their scan
1047  */
1048  _bt_parallel_done(scan);
1050 
1051  return false;
1052  }
1053  else
1055  scan->xs_snapshot);
1056 
1057  _bt_initialize_more_data(so, dir);
1058 
1059  /* position to the precise item on the page */
1060  offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1061 
1062  /*
1063  * If nextkey = false, we are positioned at the first item >= scan key, or
1064  * possibly at the end of a page on which all the existing items are less
1065  * than the scan key and we know that everything on later pages is greater
1066  * than or equal to scan key.
1067  *
1068  * If nextkey = true, we are positioned at the first item > scan key, or
1069  * possibly at the end of a page on which all the existing items are less
1070  * than or equal to the scan key and we know that everything on later
1071  * pages is greater than scan key.
1072  *
1073  * The actually desired starting point is either this item or the prior
1074  * one, or in the end-of-page case it's the first item on the next page or
1075  * the last item on this page. Adjust the starting offset if needed. (If
1076  * this results in an offset before the first item or after the last one,
1077  * _bt_readpage will report no items found, and then we'll step to the
1078  * next page as needed.)
1079  */
1080  if (goback)
1081  offnum = OffsetNumberPrev(offnum);
1082 
1083  /* remember which buffer we have pinned, if any */
1085  so->currPos.buf = buf;
1086 
1087  /*
1088  * Now load data from the first page of the scan.
1089  */
1090  if (!_bt_readpage(scan, dir, offnum))
1091  {
1092  /*
1093  * There's no actually-matching data on this page. Try to advance to
1094  * the next page. Return false if there's no matching data at all.
1095  */
1097  if (!_bt_steppage(scan, dir))
1098  return false;
1099  }
1100  else
1101  {
1102  /* Drop the lock, and maybe the pin, on the current page */
1104  }
1105 
1106 readcomplete:
1107  /* OK, itemIndex says what to return */
1108  currItem = &so->currPos.items[so->currPos.itemIndex];
1109  scan->xs_ctup.t_self = currItem->heapTid;
1110  if (scan->xs_want_itup)
1111  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1112 
1113  return true;
1114 }
1115 
1116 /*
1117  * _bt_next() -- Get the next item in a scan.
1118  *
1119  * On entry, so->currPos describes the current page, which may be pinned
1120  * but is not locked, and so->currPos.itemIndex identifies which item was
1121  * previously returned.
1122  *
1123  * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1124  * next heap tuple, and if requested, scan->xs_itup points to a copy of
1125  * the index tuple. so->currPos is updated as needed.
1126  *
1127  * On failure exit (no more tuples), we release pin and set
1128  * so->currPos.buf to InvalidBuffer.
1129  */
1130 bool
1132 {
1133  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1134  BTScanPosItem *currItem;
1135 
1136  /*
1137  * Advance to next tuple on current page; or if there's no more, try to
1138  * step to the next page with data.
1139  */
1140  if (ScanDirectionIsForward(dir))
1141  {
1142  if (++so->currPos.itemIndex > so->currPos.lastItem)
1143  {
1144  if (!_bt_steppage(scan, dir))
1145  return false;
1146  }
1147  }
1148  else
1149  {
1150  if (--so->currPos.itemIndex < so->currPos.firstItem)
1151  {
1152  if (!_bt_steppage(scan, dir))
1153  return false;
1154  }
1155  }
1156 
1157  /* OK, itemIndex says what to return */
1158  currItem = &so->currPos.items[so->currPos.itemIndex];
1159  scan->xs_ctup.t_self = currItem->heapTid;
1160  if (scan->xs_want_itup)
1161  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1162 
1163  return true;
1164 }
1165 
1166 /*
1167  * _bt_readpage() -- Load data from current index page into so->currPos
1168  *
1169  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1170  * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1171  * they are updated as appropriate. All other fields of so->currPos are
1172  * initialized from scratch here.
1173  *
1174  * We scan the current page starting at offnum and moving in the indicated
1175  * direction. All items matching the scan keys are loaded into currPos.items.
1176  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1177  * that there can be no more matching tuples in the current scan direction.
1178  *
1179  * In the case of a parallel scan, caller must have called _bt_parallel_seize
1180  * prior to calling this function; this function will invoke
1181  * _bt_parallel_release before returning.
1182  *
1183  * Returns true if any matching items found on the page, false if none.
1184  */
1185 static bool
1187 {
1188  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1189  Page page;
1190  BTPageOpaque opaque;
1191  OffsetNumber minoff;
1192  OffsetNumber maxoff;
1193  int itemIndex;
1194  IndexTuple itup;
1195  bool continuescan;
1196 
1197  /*
1198  * We must have the buffer pinned and locked, but the usual macro can't be
1199  * used here; this function is what makes it good for currPos.
1200  */
1202 
1203  page = BufferGetPage(so->currPos.buf);
1204  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1205 
1206  /* allow next page be processed by parallel worker */
1207  if (scan->parallel_scan)
1208  {
1209  if (ScanDirectionIsForward(dir))
1210  _bt_parallel_release(scan, opaque->btpo_next);
1211  else
1213  }
1214 
1215  minoff = P_FIRSTDATAKEY(opaque);
1216  maxoff = PageGetMaxOffsetNumber(page);
1217 
1218  /*
1219  * We note the buffer's block number so that we can release the pin later.
1220  * This allows us to re-read the buffer if it is needed again for hinting.
1221  */
1223 
1224  /*
1225  * We save the LSN of the page as we read it, so that we know whether it
1226  * safe to apply LP_DEAD hints to the page later. This allows us to drop
1227  * the pin for MVCC scans, which allows vacuum to avoid blocking.
1228  */
1230 
1231  /*
1232  * we must save the page's right-link while scanning it; this tells us
1233  * where to step right to after we're done with these items. There is no
1234  * corresponding need for the left-link, since splits always go right.
1235  */
1236  so->currPos.nextPage = opaque->btpo_next;
1237 
1238  /* initialize tuple workspace to empty */
1239  so->currPos.nextTupleOffset = 0;
1240 
1241  /*
1242  * Now that the current page has been made consistent, the macro should be
1243  * good.
1244  */
1246 
1247  if (ScanDirectionIsForward(dir))
1248  {
1249  /* load items[] in ascending order */
1250  itemIndex = 0;
1251 
1252  offnum = Max(offnum, minoff);
1253 
1254  while (offnum <= maxoff)
1255  {
1256  itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1257  if (itup != NULL)
1258  {
1259  /* tuple passes all scan key conditions, so remember it */
1260  _bt_saveitem(so, itemIndex, offnum, itup);
1261  itemIndex++;
1262  }
1263  if (!continuescan)
1264  {
1265  /* there can't be any more matches, so stop */
1266  so->currPos.moreRight = false;
1267  break;
1268  }
1269 
1270  offnum = OffsetNumberNext(offnum);
1271  }
1272 
1273  Assert(itemIndex <= MaxIndexTuplesPerPage);
1274  so->currPos.firstItem = 0;
1275  so->currPos.lastItem = itemIndex - 1;
1276  so->currPos.itemIndex = 0;
1277  }
1278  else
1279  {
1280  /* load items[] in descending order */
1281  itemIndex = MaxIndexTuplesPerPage;
1282 
1283  offnum = Min(offnum, maxoff);
1284 
1285  while (offnum >= minoff)
1286  {
1287  itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1288  if (itup != NULL)
1289  {
1290  /* tuple passes all scan key conditions, so remember it */
1291  itemIndex--;
1292  _bt_saveitem(so, itemIndex, offnum, itup);
1293  }
1294  if (!continuescan)
1295  {
1296  /* there can't be any more matches, so stop */
1297  so->currPos.moreLeft = false;
1298  break;
1299  }
1300 
1301  offnum = OffsetNumberPrev(offnum);
1302  }
1303 
1304  Assert(itemIndex >= 0);
1305  so->currPos.firstItem = itemIndex;
1308  }
1309 
1310  return (so->currPos.firstItem <= so->currPos.lastItem);
1311 }
1312 
1313 /* Save an index item into so->currPos.items[itemIndex] */
1314 static void
1315 _bt_saveitem(BTScanOpaque so, int itemIndex,
1316  OffsetNumber offnum, IndexTuple itup)
1317 {
1318  BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1319 
1320  currItem->heapTid = itup->t_tid;
1321  currItem->indexOffset = offnum;
1322  if (so->currTuples)
1323  {
1324  Size itupsz = IndexTupleSize(itup);
1325 
1326  currItem->tupleOffset = so->currPos.nextTupleOffset;
1327  memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1328  so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1329  }
1330 }
1331 
1332 /*
1333  * _bt_steppage() -- Step to next page containing valid data for scan
1334  *
1335  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1336  * if pinned, we'll drop the pin before moving to next page. The buffer is
1337  * not locked on entry.
1338  *
1339  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1340  * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1341  * to InvalidBuffer. We return true to indicate success.
1342  */
1343 static bool
1345 {
1346  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1348  bool status = true;
1349 
1351 
1352  /* Before leaving current page, deal with any killed items */
1353  if (so->numKilled > 0)
1354  _bt_killitems(scan);
1355 
1356  /*
1357  * Before we modify currPos, make a copy of the page data if there was a
1358  * mark position that needs it.
1359  */
1360  if (so->markItemIndex >= 0)
1361  {
1362  /* bump pin on current buffer for assignment to mark buffer */
1363  if (BTScanPosIsPinned(so->currPos))
1365  memcpy(&so->markPos, &so->currPos,
1366  offsetof(BTScanPosData, items[1]) +
1367  so->currPos.lastItem * sizeof(BTScanPosItem));
1368  if (so->markTuples)
1369  memcpy(so->markTuples, so->currTuples,
1370  so->currPos.nextTupleOffset);
1371  so->markPos.itemIndex = so->markItemIndex;
1372  so->markItemIndex = -1;
1373  }
1374 
1375  if (ScanDirectionIsForward(dir))
1376  {
1377  /* Walk right to the next page with data */
1378  if (scan->parallel_scan != NULL)
1379  {
1380  /*
1381  * Seize the scan to get the next block number; if the scan has
1382  * ended already, bail out.
1383  */
1384  status = _bt_parallel_seize(scan, &blkno);
1385  if (!status)
1386  {
1387  /* release the previous buffer, if pinned */
1390  return false;
1391  }
1392  }
1393  else
1394  {
1395  /* Not parallel, so use the previously-saved nextPage link. */
1396  blkno = so->currPos.nextPage;
1397  }
1398 
1399  /* Remember we left a page with data */
1400  so->currPos.moreLeft = true;
1401 
1402  /* release the previous buffer, if pinned */
1404  }
1405  else
1406  {
1407  /* Remember we left a page with data */
1408  so->currPos.moreRight = true;
1409 
1410  if (scan->parallel_scan != NULL)
1411  {
1412  /*
1413  * Seize the scan to get the current block number; if the scan has
1414  * ended already, bail out.
1415  */
1416  status = _bt_parallel_seize(scan, &blkno);
1418  if (!status)
1419  {
1421  return false;
1422  }
1423  }
1424  else
1425  {
1426  /* Not parallel, so just use our own notion of the current page */
1427  blkno = so->currPos.currPage;
1428  }
1429  }
1430 
1431  if (!_bt_readnextpage(scan, blkno, dir))
1432  return false;
1433 
1434  /* Drop the lock, and maybe the pin, on the current page */
1436 
1437  return true;
1438 }
1439 
1440 /*
1441  * _bt_readnextpage() -- Read next page containing valid data for scan
1442  *
1443  * On success exit, so->currPos is updated to contain data from the next
1444  * interesting page. Caller is responsible to release lock and pin on
1445  * buffer on success. We return true to indicate success.
1446  *
1447  * If there are no more matching records in the given direction, we drop all
1448  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1449  */
1450 static bool
1452 {
1453  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1454  Relation rel;
1455  Page page;
1456  BTPageOpaque opaque;
1457  bool status = true;
1458 
1459  rel = scan->indexRelation;
1460 
1461  if (ScanDirectionIsForward(dir))
1462  {
1463  for (;;)
1464  {
1465  /*
1466  * if we're at end of scan, give up and mark parallel scan as
1467  * done, so that all the workers can finish their scan
1468  */
1469  if (blkno == P_NONE || !so->currPos.moreRight)
1470  {
1471  _bt_parallel_done(scan);
1473  return false;
1474  }
1475  /* check for interrupts while we're not holding any buffer lock */
1477  /* step right one page */
1478  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1479  page = BufferGetPage(so->currPos.buf);
1480  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1481  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1482  /* check for deleted page */
1483  if (!P_IGNORE(opaque))
1484  {
1485  PredicateLockPage(rel, blkno, scan->xs_snapshot);
1486  /* see if there are any matches on this page */
1487  /* note that this will clear moreRight if we can stop */
1488  if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1489  break;
1490  }
1491  else if (scan->parallel_scan != NULL)
1492  {
1493  /* allow next page be processed by parallel worker */
1494  _bt_parallel_release(scan, opaque->btpo_next);
1495  }
1496 
1497  /* nope, keep going */
1498  if (scan->parallel_scan != NULL)
1499  {
1500  status = _bt_parallel_seize(scan, &blkno);
1501  if (!status)
1502  {
1503  _bt_relbuf(rel, so->currPos.buf);
1505  return false;
1506  }
1507  }
1508  else
1509  blkno = opaque->btpo_next;
1510  _bt_relbuf(rel, so->currPos.buf);
1511  }
1512  }
1513  else
1514  {
1515  /*
1516  * Should only happen in parallel cases, when some other backend
1517  * advanced the scan.
1518  */
1519  if (so->currPos.currPage != blkno)
1520  {
1522  so->currPos.currPage = blkno;
1523  }
1524 
1525  /*
1526  * Walk left to the next page with data. This is much more complex
1527  * than the walk-right case because of the possibility that the page
1528  * to our left splits while we are in flight to it, plus the
1529  * possibility that the page we were on gets deleted after we leave
1530  * it. See nbtree/README for details.
1531  *
1532  * It might be possible to rearrange this code to have less overhead
1533  * in pinning and locking, but that would require capturing the left
1534  * pointer when the page is initially read, and using it here, along
1535  * with big changes to _bt_walk_left() and the code below. It is not
1536  * clear whether this would be a win, since if the page immediately to
1537  * the left splits after we read this page and before we step left, we
1538  * would need to visit more pages than with the current code.
1539  *
1540  * Note that if we change the code so that we drop the pin for a scan
1541  * which uses a non-MVCC snapshot, we will need to modify the code for
1542  * walking left, to allow for the possibility that a referenced page
1543  * has been deleted. As long as the buffer is pinned or the snapshot
1544  * is MVCC the page cannot move past the half-dead state to fully
1545  * deleted.
1546  */
1547  if (BTScanPosIsPinned(so->currPos))
1548  LockBuffer(so->currPos.buf, BT_READ);
1549  else
1550  so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1551 
1552  for (;;)
1553  {
1554  /* Done if we know there are no matching keys to the left */
1555  if (!so->currPos.moreLeft)
1556  {
1557  _bt_relbuf(rel, so->currPos.buf);
1558  _bt_parallel_done(scan);
1560  return false;
1561  }
1562 
1563  /* Step to next physical page */
1564  so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1565  scan->xs_snapshot);
1566 
1567  /* if we're physically at end of index, return failure */
1568  if (so->currPos.buf == InvalidBuffer)
1569  {
1570  _bt_parallel_done(scan);
1572  return false;
1573  }
1574 
1575  /*
1576  * Okay, we managed to move left to a non-deleted page. Done if
1577  * it's not half-dead and contains matching tuples. Else loop back
1578  * and do it all again.
1579  */
1580  page = BufferGetPage(so->currPos.buf);
1581  TestForOldSnapshot(scan->xs_snapshot, rel, page);
1582  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1583  if (!P_IGNORE(opaque))
1584  {
1586  /* see if there are any matches on this page */
1587  /* note that this will clear moreLeft if we can stop */
1588  if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1589  break;
1590  }
1591  else if (scan->parallel_scan != NULL)
1592  {
1593  /* allow next page be processed by parallel worker */
1595  }
1596 
1597  /*
1598  * For parallel scans, get the last page scanned as it is quite
1599  * possible that by the time we try to seize the scan, some other
1600  * worker has already advanced the scan to a different page. We
1601  * must continue based on the latest page scanned by any worker.
1602  */
1603  if (scan->parallel_scan != NULL)
1604  {
1605  _bt_relbuf(rel, so->currPos.buf);
1606  status = _bt_parallel_seize(scan, &blkno);
1607  if (!status)
1608  {
1610  return false;
1611  }
1612  so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1613  }
1614  }
1615  }
1616 
1617  return true;
1618 }
1619 
1620 /*
1621  * _bt_parallel_readpage() -- Read current page containing valid data for scan
1622  *
1623  * On success, release lock and maybe pin on buffer. We return true to
1624  * indicate success.
1625  */
1626 static bool
1628 {
1629  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1630 
1631  _bt_initialize_more_data(so, dir);
1632 
1633  if (!_bt_readnextpage(scan, blkno, dir))
1634  return false;
1635 
1636  /* Drop the lock, and maybe the pin, on the current page */
1638 
1639  return true;
1640 }
1641 
1642 /*
1643  * _bt_walk_left() -- step left one page, if possible
1644  *
1645  * The given buffer must be pinned and read-locked. This will be dropped
1646  * before stepping left. On return, we have pin and read lock on the
1647  * returned page, instead.
1648  *
1649  * Returns InvalidBuffer if there is no page to the left (no lock is held
1650  * in that case).
1651  *
1652  * When working on a non-leaf level, it is possible for the returned page
1653  * to be half-dead; the caller should check that condition and step left
1654  * again if it's important.
1655  */
1656 static Buffer
1658 {
1659  Page page;
1660  BTPageOpaque opaque;
1661 
1662  page = BufferGetPage(buf);
1663  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1664 
1665  for (;;)
1666  {
1667  BlockNumber obknum;
1668  BlockNumber lblkno;
1669  BlockNumber blkno;
1670  int tries;
1671 
1672  /* if we're at end of tree, release buf and return failure */
1673  if (P_LEFTMOST(opaque))
1674  {
1675  _bt_relbuf(rel, buf);
1676  break;
1677  }
1678  /* remember original page we are stepping left from */
1679  obknum = BufferGetBlockNumber(buf);
1680  /* step left */
1681  blkno = lblkno = opaque->btpo_prev;
1682  _bt_relbuf(rel, buf);
1683  /* check for interrupts while we're not holding any buffer lock */
1685  buf = _bt_getbuf(rel, blkno, BT_READ);
1686  page = BufferGetPage(buf);
1687  TestForOldSnapshot(snapshot, rel, page);
1688  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1689 
1690  /*
1691  * If this isn't the page we want, walk right till we find what we
1692  * want --- but go no more than four hops (an arbitrary limit). If we
1693  * don't find the correct page by then, the most likely bet is that
1694  * the original page got deleted and isn't in the sibling chain at all
1695  * anymore, not that its left sibling got split more than four times.
1696  *
1697  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1698  * because half-dead pages are still in the sibling chain. Caller
1699  * must reject half-dead pages if wanted.
1700  */
1701  tries = 0;
1702  for (;;)
1703  {
1704  if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1705  {
1706  /* Found desired page, return it */
1707  return buf;
1708  }
1709  if (P_RIGHTMOST(opaque) || ++tries > 4)
1710  break;
1711  blkno = opaque->btpo_next;
1712  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1713  page = BufferGetPage(buf);
1714  TestForOldSnapshot(snapshot, rel, page);
1715  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1716  }
1717 
1718  /* Return to the original page to see what's up */
1719  buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1720  page = BufferGetPage(buf);
1721  TestForOldSnapshot(snapshot, rel, page);
1722  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1723  if (P_ISDELETED(opaque))
1724  {
1725  /*
1726  * It was deleted. Move right to first nondeleted page (there
1727  * must be one); that is the page that has acquired the deleted
1728  * one's keyspace, so stepping left from it will take us where we
1729  * want to be.
1730  */
1731  for (;;)
1732  {
1733  if (P_RIGHTMOST(opaque))
1734  elog(ERROR, "fell off the end of index \"%s\"",
1736  blkno = opaque->btpo_next;
1737  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1738  page = BufferGetPage(buf);
1739  TestForOldSnapshot(snapshot, rel, page);
1740  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1741  if (!P_ISDELETED(opaque))
1742  break;
1743  }
1744 
1745  /*
1746  * Now return to top of loop, resetting obknum to point to this
1747  * nondeleted page, and try again.
1748  */
1749  }
1750  else
1751  {
1752  /*
1753  * It wasn't deleted; the explanation had better be that the page
1754  * to the left got split or deleted. Without this check, we'd go
1755  * into an infinite loop if there's anything wrong.
1756  */
1757  if (opaque->btpo_prev == lblkno)
1758  elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1759  obknum, RelationGetRelationName(rel));
1760  /* Okay to try again with new lblkno value */
1761  }
1762  }
1763 
1764  return InvalidBuffer;
1765 }
1766 
1767 /*
1768  * _bt_get_endpoint() -- Find the first or last page on a given tree level
1769  *
1770  * If the index is empty, we will return InvalidBuffer; any other failure
1771  * condition causes ereport(). We will not return a dead page.
1772  *
1773  * The returned buffer is pinned and read-locked.
1774  */
1775 Buffer
1776 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
1777  Snapshot snapshot)
1778 {
1779  Buffer buf;
1780  Page page;
1781  BTPageOpaque opaque;
1782  OffsetNumber offnum;
1783  BlockNumber blkno;
1784  IndexTuple itup;
1785 
1786  /*
1787  * If we are looking for a leaf page, okay to descend from fast root;
1788  * otherwise better descend from true root. (There is no point in being
1789  * smarter about intermediate levels.)
1790  */
1791  if (level == 0)
1792  buf = _bt_getroot(rel, BT_READ);
1793  else
1794  buf = _bt_gettrueroot(rel);
1795 
1796  if (!BufferIsValid(buf))
1797  return InvalidBuffer;
1798 
1799  page = BufferGetPage(buf);
1800  TestForOldSnapshot(snapshot, rel, page);
1801  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1802 
1803  for (;;)
1804  {
1805  /*
1806  * If we landed on a deleted page, step right to find a live page
1807  * (there must be one). Also, if we want the rightmost page, step
1808  * right if needed to get to it (this could happen if the page split
1809  * since we obtained a pointer to it).
1810  */
1811  while (P_IGNORE(opaque) ||
1812  (rightmost && !P_RIGHTMOST(opaque)))
1813  {
1814  blkno = opaque->btpo_next;
1815  if (blkno == P_NONE)
1816  elog(ERROR, "fell off the end of index \"%s\"",
1818  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1819  page = BufferGetPage(buf);
1820  TestForOldSnapshot(snapshot, rel, page);
1821  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1822  }
1823 
1824  /* Done? */
1825  if (opaque->btpo.level == level)
1826  break;
1827  if (opaque->btpo.level < level)
1828  elog(ERROR, "btree level %u not found in index \"%s\"",
1829  level, RelationGetRelationName(rel));
1830 
1831  /* Descend to leftmost or rightmost child page */
1832  if (rightmost)
1833  offnum = PageGetMaxOffsetNumber(page);
1834  else
1835  offnum = P_FIRSTDATAKEY(opaque);
1836 
1837  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1838  blkno = BTreeInnerTupleGetDownLink(itup);
1839 
1840  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1841  page = BufferGetPage(buf);
1842  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1843  }
1844 
1845  return buf;
1846 }
1847 
1848 /*
1849  * _bt_endpoint() -- Find the first or last page in the index, and scan
1850  * from there to the first key satisfying all the quals.
1851  *
1852  * This is used by _bt_first() to set up a scan when we've determined
1853  * that the scan must start at the beginning or end of the index (for
1854  * a forward or backward scan respectively). Exit conditions are the
1855  * same as for _bt_first().
1856  */
1857 static bool
1859 {
1860  Relation rel = scan->indexRelation;
1861  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1862  Buffer buf;
1863  Page page;
1864  BTPageOpaque opaque;
1865  OffsetNumber start;
1866  BTScanPosItem *currItem;
1867 
1868  /*
1869  * Scan down to the leftmost or rightmost leaf page. This is a simplified
1870  * version of _bt_search(). We don't maintain a stack since we know we
1871  * won't need it.
1872  */
1874 
1875  if (!BufferIsValid(buf))
1876  {
1877  /*
1878  * Empty index. Lock the whole relation, as nothing finer to lock
1879  * exists.
1880  */
1881  PredicateLockRelation(rel, scan->xs_snapshot);
1883  return false;
1884  }
1885 
1887  page = BufferGetPage(buf);
1888  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1889  Assert(P_ISLEAF(opaque));
1890 
1891  if (ScanDirectionIsForward(dir))
1892  {
1893  /* There could be dead pages to the left, so not this: */
1894  /* Assert(P_LEFTMOST(opaque)); */
1895 
1896  start = P_FIRSTDATAKEY(opaque);
1897  }
1898  else if (ScanDirectionIsBackward(dir))
1899  {
1900  Assert(P_RIGHTMOST(opaque));
1901 
1902  start = PageGetMaxOffsetNumber(page);
1903  }
1904  else
1905  {
1906  elog(ERROR, "invalid scan direction: %d", (int) dir);
1907  start = 0; /* keep compiler quiet */
1908  }
1909 
1910  /* remember which buffer we have pinned */
1911  so->currPos.buf = buf;
1912 
1913  _bt_initialize_more_data(so, dir);
1914 
1915  /*
1916  * Now load data from the first page of the scan.
1917  */
1918  if (!_bt_readpage(scan, dir, start))
1919  {
1920  /*
1921  * There's no actually-matching data on this page. Try to advance to
1922  * the next page. Return false if there's no matching data at all.
1923  */
1925  if (!_bt_steppage(scan, dir))
1926  return false;
1927  }
1928  else
1929  {
1930  /* Drop the lock, and maybe the pin, on the current page */
1932  }
1933 
1934  /* OK, itemIndex says what to return */
1935  currItem = &so->currPos.items[so->currPos.itemIndex];
1936  scan->xs_ctup.t_self = currItem->heapTid;
1937  if (scan->xs_want_itup)
1938  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1939 
1940  return true;
1941 }
1942 
1943 /*
1944  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
1945  * for scan direction
1946  */
1947 static inline void
1949 {
1950  /* initialize moreLeft/moreRight appropriately for scan direction */
1951  if (ScanDirectionIsForward(dir))
1952  {
1953  so->currPos.moreLeft = false;
1954  so->currPos.moreRight = true;
1955  }
1956  else
1957  {
1958  so->currPos.moreLeft = true;
1959  so->currPos.moreRight = false;
1960  }
1961  so->numKilled = 0; /* just paranoia */
1962  so->markItemIndex = -1; /* ditto */
1963 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:141
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2475
BlockNumber btpo_next
Definition: nbtree.h:58
#define BTORDER_PROC
Definition: nbtree.h:290
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:855
#define P_IGNORE(opaque)
Definition: nbtree.h:163
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:224
bool moreRight
Definition: nbtree.h:371
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:115
IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1370
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:390
#define DatumGetInt32(X)
Definition: postgres.h:457
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:860
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:720
#define RelationGetDescr(relation)
Definition: rel.h:433
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:729
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1627
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:388
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2452
char * currTuples
Definition: nbtree.h:462
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
ItemPointerData t_tid
Definition: itup.h:37
regproc RegProcedure
Definition: c.h:472
#define Min(x, y)
Definition: c.h:857
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:150
Snapshot xs_snapshot
Definition: relscan.h:92
#define InvalidBuffer
Definition: buf.h:25
BlockNumber currPage
Definition: nbtree.h:361
bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2134
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
BTStackData * BTStack
Definition: nbtree.h:321
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:412
OffsetNumber indexOffset
Definition: nbtree.h:352
uint32 BlockNumber
Definition: block.h:31
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3309
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1133
bool moreLeft
Definition: nbtree.h:370
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:1776
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:538
signed int int32
Definition: c.h:313
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:91
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
int nextTupleOffset
Definition: nbtree.h:377
#define BT_READ
Definition: nbtree.h:300
int lastItem
Definition: nbtree.h:387
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:1744
BTScanPosData markPos
Definition: nbtree.h:476
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1938
Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:214
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:479
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData t_self
Definition: htup.h:65
BlockNumber btpo_prev
Definition: nbtree.h:57
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:2832
OffsetNumber bts_offset
Definition: nbtree.h:316
static char * buf
Definition: pg_test_fsync.c:67
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1858
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:395
#define RegProcedureIsValid(p)
Definition: c.h:607
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1315
ScanDirection
Definition: sdir.h:22
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:323
char * markTuples
Definition: nbtree.h:463
#define RelationGetRelationName(relation)
Definition: rel.h:441
#define P_LEFTMOST(opaque)
Definition: nbtree.h:156
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1291
unsigned int uint32
Definition: c.h:325
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:639
Oid * rd_opfamily
Definition: rel.h:154
int firstItem
Definition: nbtree.h:386
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:252
#define SK_ISNULL
Definition: skey.h:115
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define P_ISDELETED(opaque)
Definition: nbtree.h:160
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:756
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:490
uint32 level
Definition: nbtree.h:61
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:1948
uintptr_t Datum
Definition: postgres.h:367
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:498
BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:97
#define InvalidOid
Definition: postgres_ext.h:36
BlockNumber bts_blkno
Definition: nbtree.h:315
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
int markItemIndex
Definition: nbtree.h:472
BlockNumber bts_btentry
Definition: nbtree.h:317
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1451
bool xs_want_itup
Definition: relscan.h:97
#define Max(x, y)
Definition: c.h:851
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:699
#define IsMVCCSnapshot(snapshot)
Definition: tqual.h:31
#define SK_BT_DESC
Definition: nbtree.h:489
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
HeapTupleData xs_ctup
Definition: relscan.h:121
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:418
#define INDEX_MAX_KEYS
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int numberOfKeys
Definition: nbtree.h:441
#define MAXALIGN(LEN)
Definition: c.h:652
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
struct BTStackData * bts_parent
Definition: nbtree.h:318
#define RelationNeedsWAL(relation)
Definition: rel.h:510
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:697
ItemPointerData heapTid
Definition: nbtree.h:351
#define DatumGetPointer(X)
Definition: postgres.h:534
#define P_HIKEY
Definition: nbtree.h:185
BlockNumber nextPage
Definition: nbtree.h:362
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
BTScanPosData currPos
Definition: nbtree.h:475
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:924
ScanKey keyData
Definition: nbtree.h:442
Oid sk_collation
Definition: skey.h:70
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1344
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1131
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:98
#define elog
Definition: elog.h:219
XLogRecPtr lsn
Definition: nbtree.h:360
Buffer buf
Definition: nbtree.h:358
Oid * rd_opcintype
Definition: rel.h:155
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:428
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:225
#define BT_WRITE
Definition: nbtree.h:301
#define BTLessStrategyNumber
Definition: stratnum.h:29
LocationIndex tupleOffset
Definition: nbtree.h:353
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1186
int16 AttrNumber
Definition: attnum.h:21
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3347
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define offsetof(type, field)
Definition: c.h:622
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:406
AttrNumber sk_attno
Definition: skey.h:67
Pointer Page
Definition: bufpage.h:74
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:158
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
Definition: nbtsearch.c:1657