PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtsearch.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtsearch.c
4 * Search code for postgres btrees.
5 *
6 *
7 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtsearch.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "access/nbtree.h"
19#include "access/relscan.h"
20#include "access/xact.h"
22#include "miscadmin.h"
23#include "pgstat.h"
24#include "storage/predicate.h"
25#include "utils/lsyscache.h"
26#include "utils/rel.h"
27
28
30static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
31 Buffer buf, bool forupdate, BTStack stack,
32 int access);
34static int _bt_binsrch_posting(BTScanInsert key, Page page,
35 OffsetNumber offnum);
36static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
37static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
38static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
39 ScanDirection dir);
40static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
42 bool seized);
45static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
46
47
48/*
49 * _bt_drop_lock_and_maybe_pin()
50 *
51 * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
52 * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
53 */
54static inline void
56{
57 if (!so->dropPin)
58 {
59 /* Just drop the lock (not the pin) */
60 _bt_unlockbuf(rel, so->currPos.buf);
61 return;
62 }
63
64 /*
65 * Drop both the lock and the pin.
66 *
67 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
68 * when concurrent heap TID recycling by VACUUM might have taken place.
69 */
70 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
71 _bt_relbuf(rel, so->currPos.buf);
72 so->currPos.buf = InvalidBuffer;
73}
74
75/*
76 * _bt_search() -- Search the tree for a particular scankey,
77 * or more precisely for the first leaf page it could be on.
78 *
79 * The passed scankey is an insertion-type scankey (see nbtree/README),
80 * but it can omit the rightmost column(s) of the index.
81 *
82 * If returnstack is true, return value is a stack of parent-page pointers
83 * (i.e. there is no entry for the leaf level/page). If returnstack is false,
84 * we just return NULL. This scheme allows callers that don't need a descent
85 * stack to avoid palloc churn.
86 *
87 * When we return, *bufP is set to the address of the leaf-page buffer, which
88 * is locked and pinned. No locks are held on the parent pages, however!
89 *
90 * The returned buffer is locked according to access parameter. Additionally,
91 * access = BT_WRITE will allow an empty root page to be created and returned.
92 * When access = BT_READ, an empty index will result in *bufP being set to
93 * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
94 * during the search will be finished.
95 *
96 * heaprel must be provided by callers that pass access = BT_WRITE, since we
97 * might need to allocate a new root page for caller -- see _bt_allocbuf.
98 */
101 int access, bool returnstack)
102{
104 int page_access = BT_READ;
105
106 /* heaprel must be set whenever _bt_allocbuf is reachable */
108 Assert(access == BT_READ || heaprel != NULL);
109
110 /* Get the root page to start with */
111 *bufP = _bt_getroot(rel, heaprel, access);
112
113 /* If index is empty and access = BT_READ, no root page is created. */
114 if (!BufferIsValid(*bufP))
115 return (BTStack) NULL;
116
117 /* Loop iterates once per level descended in the tree */
118 for (;;)
119 {
120 Page page;
121 BTPageOpaque opaque;
122 OffsetNumber offnum;
123 ItemId itemid;
124 IndexTuple itup;
125 BlockNumber child;
127
128 /*
129 * Race -- the page we just grabbed may have split since we read its
130 * downlink in its parent page (or the metapage). If it has, we may
131 * need to move right to its new sibling. Do that.
132 *
133 * In write-mode, allow _bt_moveright to finish any incomplete splits
134 * along the way. Strictly speaking, we'd only need to finish an
135 * incomplete split on the leaf page we're about to insert to, not on
136 * any of the upper levels (internal pages with incomplete splits are
137 * also taken care of in _bt_getstackbuf). But this is a good
138 * opportunity to finish splits of internal pages too.
139 */
140 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
142
143 /* if this is a leaf page, we're done */
144 page = BufferGetPage(*bufP);
145 opaque = BTPageGetOpaque(page);
146 if (P_ISLEAF(opaque))
147 break;
148
149 /*
150 * Find the appropriate pivot tuple on this page. Its downlink points
151 * to the child page that we're about to descend to.
152 */
153 offnum = _bt_binsrch(rel, key, *bufP);
154 itemid = PageGetItemId(page, offnum);
155 itup = (IndexTuple) PageGetItem(page, itemid);
156 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
157 child = BTreeTupleGetDownLink(itup);
158
159 /*
160 * We need to save the location of the pivot tuple we chose in a new
161 * stack entry for this page/level. If caller ends up splitting a
162 * page one level down, it usually ends up inserting a new pivot
163 * tuple/downlink immediately after the location recorded here.
164 */
165 if (returnstack)
166 {
168 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169 new_stack->bts_offset = offnum;
170 new_stack->bts_parent = stack_in;
172 }
173
174 /*
175 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
176 * we're on the level 1 and asked to lock leaf page in write mode,
177 * then lock next page in write mode, because it must be a leaf.
178 */
179 if (opaque->btpo_level == 1 && access == BT_WRITE)
181
182 /* drop the read lock on the page, then acquire one on its child */
183 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
184
185 /* okay, all set to move down a level */
186 }
187
188 /*
189 * If we're asked to lock leaf in write mode, but didn't manage to, then
190 * relock. This should only happen when the root page is a leaf page (and
191 * the only page in the index other than the metapage).
192 */
193 if (access == BT_WRITE && page_access == BT_READ)
194 {
195 /* trade in our read lock for a write lock */
196 _bt_unlockbuf(rel, *bufP);
197 _bt_lockbuf(rel, *bufP, BT_WRITE);
198
199 /*
200 * Race -- the leaf page may have split after we dropped the read lock
201 * but before we acquired a write lock. If it has, we may need to
202 * move right to its new sibling. Do that.
203 */
204 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
205 }
206
207 return stack_in;
208}
209
210/*
211 * _bt_moveright() -- move right in the btree if necessary.
212 *
213 * When we follow a pointer to reach a page, it is possible that
214 * the page has changed in the meanwhile. If this happens, we're
215 * guaranteed that the page has "split right" -- that is, that any
216 * data that appeared on the page originally is either on the page
217 * or strictly to the right of it.
218 *
219 * This routine decides whether or not we need to move right in the
220 * tree by examining the high key entry on the page. If that entry is
221 * strictly less than the scankey, or <= the scankey in the
222 * key.nextkey=true case, then we followed the wrong link and we need
223 * to move right.
224 *
225 * The passed insertion-type scankey can omit the rightmost column(s) of the
226 * index. (see nbtree/README)
227 *
228 * When key.nextkey is false (the usual case), we are looking for the first
229 * item >= key. When key.nextkey is true, we are looking for the first item
230 * strictly greater than key.
231 *
232 * If forupdate is true, we will attempt to finish any incomplete splits
233 * that we encounter. This is required when locking a target page for an
234 * insertion, because we don't allow inserting on a page before the split is
235 * completed. 'heaprel' and 'stack' are only used if forupdate is true.
236 *
237 * On entry, we have the buffer pinned and a lock of the type specified by
238 * 'access'. If we move right, we release the buffer and lock and acquire
239 * the same on the right sibling. Return value is the buffer we stop at.
240 */
241static Buffer
243 Relation heaprel,
244 BTScanInsert key,
245 Buffer buf,
246 bool forupdate,
247 BTStack stack,
248 int access)
249{
250 Page page;
251 BTPageOpaque opaque;
253
254 Assert(!forupdate || heaprel != NULL);
255
256 /*
257 * When nextkey = false (normal case): if the scan key that brought us to
258 * this page is > the high key stored on the page, then the page has split
259 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
260 * have some duplicates to the right as well as the left, but that's
261 * something that's only ever dealt with on the leaf level, after
262 * _bt_search has found an initial leaf page.)
263 *
264 * When nextkey = true: move right if the scan key is >= page's high key.
265 * (Note that key.scantid cannot be set in this case.)
266 *
267 * The page could even have split more than once, so scan as far as
268 * needed.
269 *
270 * We also have to move right if we followed a link that brought us to a
271 * dead page.
272 */
273 cmpval = key->nextkey ? 0 : 1;
274
275 for (;;)
276 {
277 page = BufferGetPage(buf);
278 opaque = BTPageGetOpaque(page);
279
280 if (P_RIGHTMOST(opaque))
281 break;
282
283 /*
284 * Finish any incomplete splits we encounter along the way.
285 */
286 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
287 {
289
290 /* upgrade our lock if necessary */
291 if (access == BT_READ)
292 {
293 _bt_unlockbuf(rel, buf);
294 _bt_lockbuf(rel, buf, BT_WRITE);
295 }
296
297 if (P_INCOMPLETE_SPLIT(opaque))
298 _bt_finish_split(rel, heaprel, buf, stack);
299 else
300 _bt_relbuf(rel, buf);
301
302 /* re-acquire the lock in the right mode, and re-check */
303 buf = _bt_getbuf(rel, blkno, access);
304 continue;
305 }
306
307 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
308 {
309 /* step right one page */
310 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
311 continue;
312 }
313 else
314 break;
315 }
316
317 if (P_IGNORE(opaque))
318 elog(ERROR, "fell off the end of index \"%s\"",
320
321 return buf;
322}
323
324/*
325 * _bt_binsrch() -- Do a binary search for a key on a particular page.
326 *
327 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
328 * of the last key < given scankey, or last key <= given scankey if nextkey
329 * is true. (Since _bt_compare treats the first data key of such a page as
330 * minus infinity, there will be at least one key < scankey, so the result
331 * always points at one of the keys on the page.)
332 *
333 * On a leaf page, _bt_binsrch() returns the final result of the initial
334 * positioning process that started with _bt_first's call to _bt_search.
335 * We're returning a non-pivot tuple offset, so things are a little different.
336 * It is possible that we'll return an offset that's either past the last
337 * non-pivot slot, or (in the case of a backward scan) before the first slot.
338 *
339 * This procedure is not responsible for walking right, it just examines
340 * the given page. _bt_binsrch() has no lock or refcount side effects
341 * on the buffer.
342 */
343static OffsetNumber
345 BTScanInsert key,
346 Buffer buf)
347{
348 Page page;
349 BTPageOpaque opaque;
350 OffsetNumber low,
351 high;
352 int32 result,
353 cmpval;
354
355 page = BufferGetPage(buf);
356 opaque = BTPageGetOpaque(page);
357
358 /* Requesting nextkey semantics while using scantid seems nonsensical */
359 Assert(!key->nextkey || key->scantid == NULL);
360 /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
361 Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
362
363 low = P_FIRSTDATAKEY(opaque);
364 high = PageGetMaxOffsetNumber(page);
365
366 /*
367 * If there are no keys on the page, return the first available slot. Note
368 * this covers two cases: the page is really empty (no keys), or it
369 * contains only a high key. The latter case is possible after vacuuming.
370 * This can never happen on an internal page, however, since they are
371 * never empty (an internal page must have at least one child).
372 */
373 if (unlikely(high < low))
374 return low;
375
376 /*
377 * Binary search to find the first key on the page >= scan key, or first
378 * key > scankey when nextkey is true.
379 *
380 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
382 *
383 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
385 *
386 * We can fall out when high == low.
387 */
388 high++; /* establish the loop invariant for high */
389
390 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
391
392 while (high > low)
393 {
394 OffsetNumber mid = low + ((high - low) / 2);
395
396 /* We have low <= mid < high, so mid points at a real slot */
397
398 result = _bt_compare(rel, key, page, mid);
399
400 if (result >= cmpval)
401 low = mid + 1;
402 else
403 high = mid;
404 }
405
406 /*
407 * At this point we have high == low.
408 *
409 * On a leaf page we always return the first non-pivot tuple >= scan key
410 * (resp. > scan key) for forward scan callers. For backward scans, it's
411 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
412 */
413 if (P_ISLEAF(opaque))
414 {
415 /*
416 * In the backward scan case we're supposed to locate the last
417 * matching tuple on the leaf level -- not the first matching tuple
418 * (the last tuple will be the first one returned by the scan).
419 *
420 * At this point we've located the first non-pivot tuple immediately
421 * after the last matching tuple (which might just be maxoff + 1).
422 * Compensate by stepping back.
423 */
424 if (key->backward)
425 return OffsetNumberPrev(low);
426
427 return low;
428 }
429
430 /*
431 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
432 * There must be one if _bt_compare() is playing by the rules.
433 *
434 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
435 * a truncated -inf heap TID is usually enough to prevent it altogether.
436 * Even omitted scan key entries are treated as > truncated attributes.
437 *
438 * However, during backward scans _bt_compare() interprets omitted scan
439 * key attributes as == corresponding truncated -inf attributes instead.
440 * This works just like < would work here. Under this scheme, < strategy
441 * backward scans will always directly descend to the correct leaf page.
442 * In particular, they will never incur an "extra" leaf page access with a
443 * scan key that happens to contain the same prefix of values as some
444 * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
445 * it uses a leaf page high key to "re-find" a page undergoing deletion.
446 */
447 Assert(low > P_FIRSTDATAKEY(opaque));
448
449 return OffsetNumberPrev(low);
450}
451
452/*
453 *
454 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
455 *
456 * Like _bt_binsrch(), but with support for caching the binary search
457 * bounds. Only used during insertion, and only on the leaf page that it
458 * looks like caller will insert tuple on. Exclusive-locked and pinned
459 * leaf page is contained within insertstate.
460 *
461 * Caches the bounds fields in insertstate so that a subsequent call can
462 * reuse the low and strict high bounds of original binary search. Callers
463 * that use these fields directly must be prepared for the case where low
464 * and/or stricthigh are not on the same page (one or both exceed maxoff
465 * for the page). The case where there are no items on the page (high <
466 * low) makes bounds invalid.
467 *
468 * Caller is responsible for invalidating bounds when it modifies the page
469 * before calling here a second time, and for dealing with posting list
470 * tuple matches (callers can use insertstate's postingoff field to
471 * determine which existing heap TID will need to be replaced by a posting
472 * list split).
473 */
476{
477 BTScanInsert key = insertstate->itup_key;
478 Page page;
479 BTPageOpaque opaque;
480 OffsetNumber low,
481 high,
482 stricthigh;
483 int32 result,
484 cmpval;
485
486 page = BufferGetPage(insertstate->buf);
487 opaque = BTPageGetOpaque(page);
488
489 Assert(P_ISLEAF(opaque));
490 Assert(!key->nextkey);
491 Assert(insertstate->postingoff == 0);
492
493 if (!insertstate->bounds_valid)
494 {
495 /* Start new binary search */
496 low = P_FIRSTDATAKEY(opaque);
497 high = PageGetMaxOffsetNumber(page);
498 }
499 else
500 {
501 /* Restore result of previous binary search against same page */
502 low = insertstate->low;
503 high = insertstate->stricthigh;
504 }
505
506 /* If there are no keys on the page, return the first available slot */
507 if (unlikely(high < low))
508 {
509 /* Caller can't reuse bounds */
511 insertstate->stricthigh = InvalidOffsetNumber;
512 insertstate->bounds_valid = false;
513 return low;
514 }
515
516 /*
517 * Binary search to find the first key on the page >= scan key. (nextkey
518 * is always false when inserting).
519 *
520 * The loop invariant is: all slots before 'low' are < scan key, all slots
521 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
522 * maintained to save additional search effort for caller.
523 *
524 * We can fall out when high == low.
525 */
526 if (!insertstate->bounds_valid)
527 high++; /* establish the loop invariant for high */
528 stricthigh = high; /* high initially strictly higher */
529
530 cmpval = 1; /* !nextkey comparison value */
531
532 while (high > low)
533 {
534 OffsetNumber mid = low + ((high - low) / 2);
535
536 /* We have low <= mid < high, so mid points at a real slot */
537
538 result = _bt_compare(rel, key, page, mid);
539
540 if (result >= cmpval)
541 low = mid + 1;
542 else
543 {
544 high = mid;
545 if (result != 0)
546 stricthigh = high;
547 }
548
549 /*
550 * If tuple at offset located by binary search is a posting list whose
551 * TID range overlaps with caller's scantid, perform posting list
552 * binary search to set postingoff for caller. Caller must split the
553 * posting list when postingoff is set. This should happen
554 * infrequently.
555 */
556 if (unlikely(result == 0 && key->scantid != NULL))
557 {
558 /*
559 * postingoff should never be set more than once per leaf page
560 * binary search. That would mean that there are duplicate table
561 * TIDs in the index, which is never okay. Check for that here.
562 */
563 if (insertstate->postingoff != 0)
566 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
567 ItemPointerGetBlockNumber(key->scantid),
568 ItemPointerGetOffsetNumber(key->scantid),
569 low, stricthigh,
572
573 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
574 }
575 }
576
577 /*
578 * On a leaf page, a binary search always returns the first key >= scan
579 * key (at least in !nextkey case), which could be the last slot + 1. This
580 * is also the lower bound of cached search.
581 *
582 * stricthigh may also be the last slot + 1, which prevents caller from
583 * using bounds directly, but is still useful to us if we're called a
584 * second time with cached bounds (cached low will be < stricthigh when
585 * that happens).
586 */
587 insertstate->low = low;
588 insertstate->stricthigh = stricthigh;
589 insertstate->bounds_valid = true;
590
591 return low;
592}
593
594/*----------
595 * _bt_binsrch_posting() -- posting list binary search.
596 *
597 * Helper routine for _bt_binsrch_insert().
598 *
599 * Returns offset into posting list where caller's scantid belongs.
600 *----------
601 */
602static int
604{
605 IndexTuple itup;
606 ItemId itemid;
607 int low,
608 high,
609 mid,
610 res;
611
612 /*
613 * If this isn't a posting tuple, then the index must be corrupt (if it is
614 * an ordinary non-pivot tuple then there must be an existing tuple with a
615 * heap TID that equals inserter's new heap TID/scantid). Defensively
616 * check that tuple is a posting list tuple whose posting list range
617 * includes caller's scantid.
618 *
619 * (This is also needed because contrib/amcheck's rootdescend option needs
620 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
621 */
622 itemid = PageGetItemId(page, offnum);
623 itup = (IndexTuple) PageGetItem(page, itemid);
624 if (!BTreeTupleIsPosting(itup))
625 return 0;
626
627 Assert(key->heapkeyspace && key->allequalimage);
628
629 /*
630 * In the event that posting list tuple has LP_DEAD bit set, indicate this
631 * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
632 * second call to _bt_binsrch_insert() can take place when its caller has
633 * removed the dead item.
634 */
635 if (ItemIdIsDead(itemid))
636 return -1;
637
638 /* "high" is past end of posting list for loop invariant */
639 low = 0;
640 high = BTreeTupleGetNPosting(itup);
641 Assert(high >= 2);
642
643 while (high > low)
644 {
645 mid = low + ((high - low) / 2);
646 res = ItemPointerCompare(key->scantid,
647 BTreeTupleGetPostingN(itup, mid));
648
649 if (res > 0)
650 low = mid + 1;
651 else if (res < 0)
652 high = mid;
653 else
654 return mid;
655 }
656
657 /* Exact match not found */
658 return low;
659}
660
661/*----------
662 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
663 *
664 * page/offnum: location of btree item to be compared to.
665 *
666 * This routine returns:
667 * <0 if scankey < tuple at offnum;
668 * 0 if scankey == tuple at offnum;
669 * >0 if scankey > tuple at offnum.
670 *
671 * NULLs in the keys are treated as sortable values. Therefore
672 * "equality" does not necessarily mean that the item should be returned
673 * to the caller as a matching key. Similarly, an insertion scankey
674 * with its scantid set is treated as equal to a posting tuple whose TID
675 * range overlaps with their scantid. There generally won't be a
676 * matching TID in the posting tuple, which caller must handle
677 * themselves (e.g., by splitting the posting list tuple).
678 *
679 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
680 * "minus infinity": this routine will always claim it is less than the
681 * scankey. The actual key value stored is explicitly truncated to 0
682 * attributes (explicitly minus infinity) with version 3+ indexes, but
683 * that isn't relied upon. This allows us to implement the Lehman and
684 * Yao convention that the first down-link pointer is before the first
685 * key. See backend/access/nbtree/README for details.
686 *----------
687 */
688int32
690 BTScanInsert key,
691 Page page,
692 OffsetNumber offnum)
693{
695 BTPageOpaque opaque = BTPageGetOpaque(page);
696 IndexTuple itup;
697 ItemPointer heapTid;
699 int ncmpkey;
700 int ntupatts;
701 int32 result;
702
703 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
705 Assert(key->heapkeyspace || key->scantid == NULL);
706
707 /*
708 * Force result ">" if target item is first data item on an internal page
709 * --- see NOTE above.
710 */
711 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
712 return 1;
713
714 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
715 ntupatts = BTreeTupleGetNAtts(itup, rel);
716
717 /*
718 * The scan key is set up with the attribute number associated with each
719 * term in the key. It is important that, if the index is multi-key, the
720 * scan contain the first k key attributes, and that they be in order. If
721 * you think about how multi-key ordering works, you'll understand why
722 * this is.
723 *
724 * We don't test for violation of this condition here, however. The
725 * initial setup for the index scan had better have gotten it right (see
726 * _bt_first).
727 */
728
729 ncmpkey = Min(ntupatts, key->keysz);
730 Assert(key->heapkeyspace || ncmpkey == key->keysz);
731 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
732 scankey = key->scankeys;
733 for (int i = 1; i <= ncmpkey; i++)
734 {
735 Datum datum;
736 bool isNull;
737
738 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
739
740 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
741 {
742 if (isNull)
743 result = 0; /* NULL "=" NULL */
744 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
745 result = -1; /* NULL "<" NOT_NULL */
746 else
747 result = 1; /* NULL ">" NOT_NULL */
748 }
749 else if (isNull) /* key is NOT_NULL and item is NULL */
750 {
751 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
752 result = 1; /* NOT_NULL ">" NULL */
753 else
754 result = -1; /* NOT_NULL "<" NULL */
755 }
756 else
757 {
758 /*
759 * The sk_func needs to be passed the index value as left arg and
760 * the sk_argument as right arg (they might be of different
761 * types). Since it is convenient for callers to think of
762 * _bt_compare as comparing the scankey to the index item, we have
763 * to flip the sign of the comparison result. (Unless it's a DESC
764 * column, in which case we *don't* flip the sign.)
765 */
766 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
767 scankey->sk_collation,
768 datum,
769 scankey->sk_argument));
770
771 if (!(scankey->sk_flags & SK_BT_DESC))
772 INVERT_COMPARE_RESULT(result);
773 }
774
775 /* if the keys are unequal, return the difference */
776 if (result != 0)
777 return result;
778
779 scankey++;
780 }
781
782 /*
783 * All non-truncated attributes (other than heap TID) were found to be
784 * equal. Treat truncated attributes as minus infinity when scankey has a
785 * key attribute value that would otherwise be compared directly.
786 *
787 * Note: it doesn't matter if ntupatts includes non-key attributes;
788 * scankey won't, so explicitly excluding non-key attributes isn't
789 * necessary.
790 */
791 if (key->keysz > ntupatts)
792 return 1;
793
794 /*
795 * Use the heap TID attribute and scantid to try to break the tie. The
796 * rules are the same as any other key attribute -- only the
797 * representation differs.
798 */
799 heapTid = BTreeTupleGetHeapTID(itup);
800 if (key->scantid == NULL)
801 {
802 /*
803 * Forward scans have a scankey that is considered greater than a
804 * truncated pivot tuple if and when the scankey has equal values for
805 * attributes up to and including the least significant untruncated
806 * attribute in tuple. Even attributes that were omitted from the
807 * scan key are considered greater than -inf truncated attributes.
808 * (See _bt_binsrch for an explanation of our backward scan behavior.)
809 *
810 * For example, if an index has the minimum two attributes (single
811 * user key attribute, plus heap TID attribute), and a page's high key
812 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
813 * will not descend to the page to the left. The search will descend
814 * right instead. The truncated attribute in pivot tuple means that
815 * all non-pivot tuples on the page to the left are strictly < 'foo',
816 * so it isn't necessary to descend left. In other words, search
817 * doesn't have to descend left because it isn't interested in a match
818 * that has a heap TID value of -inf.
819 *
820 * Note: the heap TID part of the test ensures that scankey is being
821 * compared to a pivot tuple with one or more truncated -inf key
822 * attributes. The heap TID attribute is the last key attribute in
823 * every index, of course, but other than that it isn't special.
824 */
825 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
826 key->heapkeyspace)
827 return 1;
828
829 /* All provided scankey arguments found to be equal */
830 return 0;
831 }
832
833 /*
834 * Treat truncated heap TID as minus infinity, since scankey has a key
835 * attribute value (scantid) that would otherwise be compared directly
836 */
838 if (heapTid == NULL)
839 return 1;
840
841 /*
842 * Scankey must be treated as equal to a posting list tuple if its scantid
843 * value falls within the range of the posting list. In all other cases
844 * there can only be a single heap TID value, which is compared directly
845 * with scantid.
846 */
848 result = ItemPointerCompare(key->scantid, heapTid);
849 if (result <= 0 || !BTreeTupleIsPosting(itup))
850 return result;
851 else
852 {
853 result = ItemPointerCompare(key->scantid,
855 if (result > 0)
856 return 1;
857 }
858
859 return 0;
860}
861
862/*
863 * _bt_first() -- Find the first item in a scan.
864 *
865 * We need to be clever about the direction of scan, the search
866 * conditions, and the tree ordering. We find the first item (or,
867 * if backwards scan, the last item) in the tree that satisfies the
868 * qualifications in the scan key. On success exit, data about the
869 * matching tuple(s) on the page has been loaded into so->currPos. We'll
870 * drop all locks and hold onto a pin on page's buffer, except during
871 * so->dropPin scans, when we drop both the lock and the pin.
872 * _bt_returnitem sets the next item to return to scan on success exit.
873 *
874 * If there are no matching items in the index, we return false, with no
875 * pins or locks held. so->currPos will remain invalid.
876 *
877 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
878 * are both search-type scankeys (see nbtree/README for more about this).
879 * Within this routine, we build a temporary insertion-type scankey to use
880 * in locating the scan start position.
881 */
882bool
884{
885 Relation rel = scan->indexRelation;
887 OffsetNumber offnum;
888 BTScanInsertData inskey;
891 int keysz = 0;
895
896 Assert(!BTScanPosIsValid(so->currPos));
897
898 /*
899 * Examine the scan keys and eliminate any redundant keys; also mark the
900 * keys that must be matched to continue the scan.
901 */
903
904 /*
905 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 * never be satisfied (eg, x == 1 AND x > 2).
907 */
908 if (!so->qual_ok)
909 {
910 Assert(!so->needPrimScan);
911 _bt_parallel_done(scan);
912 return false;
913 }
914
915 /*
916 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 * will likely release the parallel scan later on.
918 */
919 if (scan->parallel_scan != NULL &&
920 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 return false;
922
923 /*
924 * Initialize the scan's arrays (if any) for the current scan direction
925 * (except when they were already set to later values as part of
926 * scheduling the primitive index scan that is now underway)
927 */
928 if (so->numArrayKeys && !so->needPrimScan)
929 _bt_start_array_keys(scan, dir);
930
931 if (blkno != InvalidBlockNumber)
932 {
933 /*
934 * We anticipated calling _bt_search, but another worker bet us to it.
935 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 */
937 Assert(scan->parallel_scan != NULL);
938 Assert(!so->needPrimScan);
939 Assert(blkno != P_NONE);
940
941 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 return false;
943
944 _bt_returnitem(scan, so);
945 return true;
946 }
947
948 /*
949 * Count an indexscan for stats, now that we know that we'll call
950 * _bt_search/_bt_endpoint below
951 */
953 if (scan->instrument)
954 scan->instrument->nsearches++;
955
956 /*----------
957 * Examine the scan keys to discover where we need to start the scan.
958 * The selected scan keys (at most one per index column) are remembered by
959 * storing their addresses into the local startKeys[] array. The final
960 * startKeys[] entry's strategy is set in strat_total. (Actually, there
961 * are a couple of cases where we force a less/more restrictive strategy.)
962 *
963 * We must use the key that was marked required (in the direction opposite
964 * our own scan's) during preprocessing. Each index attribute can only
965 * have one such required key. In general, the keys that we use to find
966 * an initial position when scanning forwards are the same keys that end
967 * the scan on the leaf level when scanning backwards (and vice-versa).
968 *
969 * When the scan keys include cross-type operators, _bt_preprocess_keys
970 * may not be able to eliminate redundant keys; in such cases it will
971 * arbitrarily pick a usable key for each attribute (and scan direction),
972 * ensuring that there is no more than one key required in each direction.
973 * We stop considering further keys once we reach the first nonrequired
974 * key (which must come after all required keys), so this can't affect us.
975 *
976 * The required keys that we use as starting boundaries have to be =, >,
977 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
978 * We can use keys for multiple attributes so long as the prior attributes
979 * had only =, >= (resp. =, <=) keys. These rules are very similar to the
980 * rules that preprocessing used to determine which keys to mark required.
981 * We cannot always use every required key as a positioning key, though.
982 * Skip arrays necessitate independently applying our own rules here.
983 * Skip arrays are always generally considered = array keys, but we'll
984 * nevertheless treat them as inequalities at certain points of the scan.
985 * When that happens, it _might_ have implications for the number of
986 * required keys that we can safely use for initial positioning purposes.
987 *
988 * For example, a forward scan with a skip array on its leading attribute
989 * (with no low_compare/high_compare) will have at least two required scan
990 * keys, but we won't use any of them as boundary keys during the scan's
991 * initial call here. Our positioning key during the first call here can
992 * be thought of as representing "> -infinity". Similarly, if such a skip
993 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
994 * during the scan's initial call here; a lower-order key such as "b = 42"
995 * can't be used until the "a" array advances beyond MINVAL/low_compare.
996 *
997 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
998 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
999 * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
1000 * that we treat = and >= as equivalent when scanning forwards (just as we
1001 * treat = and <= as equivalent when scanning backwards). We effectively
1002 * do the same thing (though with a distinct "a" element/value) each time.
1003 *
1004 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1005 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1006 * If the index stores nulls at the end of the index we'll be starting
1007 * from, and we have no boundary key for the column (which means the key
1008 * we deduced NOT NULL from is an inequality key that constrains the other
1009 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1010 * use as a boundary key. If we didn't do this, we might find ourselves
1011 * traversing a lot of null entries at the start of the scan.
1012 *
1013 * In this loop, row-comparison keys are treated the same as keys on their
1014 * first (leftmost) columns. We'll add all lower-order columns of the row
1015 * comparison that were marked required during preprocessing below.
1016 *
1017 * _bt_advance_array_keys needs to know exactly how we'll reposition the
1018 * scan (should it opt to schedule another primitive index scan). It is
1019 * critical that primscans only be scheduled when they'll definitely make
1020 * some useful progress. _bt_advance_array_keys does this by calling
1021 * _bt_checkkeys routines that report whether a tuple is past the end of
1022 * matches for the scan's keys (given the scan's current array elements).
1023 * If the page's final tuple is "after the end of matches" for a scan that
1024 * uses the *opposite* scan direction, then it must follow that it's also
1025 * "before the start of matches" for the actual current scan direction.
1026 * It is therefore essential that all of our initial positioning rules are
1027 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1028 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1029 * need to be kept in sync.
1030 *----------
1031 */
1032 if (so->numberOfKeys > 0)
1033 {
1035 ScanKey bkey;
1037 ScanKey cur;
1038
1039 /*
1040 * bkey will be set to the key that preprocessing left behind as the
1041 * boundary key for this attribute, in this scan direction (if any)
1042 */
1043 cur = so->keyData;
1044 curattr = 1;
1045 bkey = NULL;
1046 /* Also remember any scankey that implies a NOT NULL constraint */
1047 impliesNN = NULL;
1048
1049 /*
1050 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1051 * pass to handle after-last-key processing. Actual exit from the
1052 * loop is at one of the "break" statements below.
1053 */
1054 for (int i = 0;; cur++, i++)
1055 {
1056 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1057 {
1058 /* Done looking for the curattr boundary key */
1059 Assert(bkey == NULL ||
1060 (bkey->sk_attno == curattr &&
1061 (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1062 Assert(impliesNN == NULL ||
1063 (impliesNN->sk_attno == curattr &&
1064 (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1065
1066 /*
1067 * If this is a scan key for a skip array whose current
1068 * element is MINVAL, choose low_compare (when scanning
1069 * backwards it'll be MAXVAL, and we'll choose high_compare).
1070 *
1071 * Note: if the array's low_compare key makes 'bkey' NULL,
1072 * then we behave as if the array's first element is -inf,
1073 * except when !array->null_elem implies a usable NOT NULL
1074 * constraint.
1075 */
1076 if (bkey != NULL &&
1077 (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1078 {
1079 int ikey = bkey - so->keyData;
1081 BTArrayKeyInfo *array = NULL;
1082
1083 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1084 {
1085 array = &so->arrayKeys[arridx];
1086 if (array->scan_key == ikey)
1087 break;
1088 }
1089
1090 if (ScanDirectionIsForward(dir))
1091 {
1092 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1093 bkey = array->low_compare;
1094 }
1095 else
1096 {
1097 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1098 bkey = array->high_compare;
1099 }
1100
1101 Assert(bkey == NULL ||
1102 bkey->sk_attno == skipequalitykey->sk_attno);
1103
1104 if (!array->null_elem)
1106 else
1107 Assert(bkey == NULL && impliesNN == NULL);
1108 }
1109
1110 /*
1111 * If we didn't find a usable boundary key, see if we can
1112 * deduce a NOT NULL key
1113 */
1114 if (bkey == NULL && impliesNN != NULL &&
1115 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1118 {
1119 /* Final startKeys[] entry will be deduced NOT NULL key */
1120 bkey = &notnullkey;
1123 (impliesNN->sk_flags &
1125 curattr,
1128 InvalidOid,
1129 InvalidOid,
1130 InvalidOid,
1131 (Datum) 0);
1132 }
1133
1134 /*
1135 * If preprocessing didn't leave a usable boundary key, quit;
1136 * else save the boundary key pointer in startKeys[]
1137 */
1138 if (bkey == NULL)
1139 break;
1140 startKeys[keysz++] = bkey;
1141
1142 /*
1143 * We can only consider adding more boundary keys when the one
1144 * that we just chose to add uses either the = or >= strategy
1145 * (during backwards scans we can only do so when the key that
1146 * we just added to startKeys[] uses the = or <= strategy)
1147 */
1148 strat_total = bkey->sk_strategy;
1151 break;
1152
1153 /*
1154 * If the key that we just added to startKeys[] is a skip
1155 * array = key whose current element is marked NEXT or PRIOR,
1156 * make strat_total > or < (and stop adding boundary keys).
1157 * This can only happen with opclasses that lack skip support.
1158 */
1159 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1160 {
1161 Assert(bkey->sk_flags & SK_BT_SKIP);
1163
1164 if (ScanDirectionIsForward(dir))
1165 {
1166 Assert(!(bkey->sk_flags & SK_BT_PRIOR));
1168 }
1169 else
1170 {
1171 Assert(!(bkey->sk_flags & SK_BT_NEXT));
1173 }
1174
1175 /*
1176 * We're done. We'll never find an exact = match for a
1177 * NEXT or PRIOR sentinel sk_argument value. There's no
1178 * sense in trying to add more keys to startKeys[].
1179 */
1180 break;
1181 }
1182
1183 /*
1184 * Done if that was the last scan key output by preprocessing.
1185 * Also done if we've now examined all keys marked required.
1186 */
1187 if (i >= so->numberOfKeys ||
1188 !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1189 break;
1190
1191 /*
1192 * Reset for next attr.
1193 */
1194 Assert(cur->sk_attno == curattr + 1);
1195 curattr = cur->sk_attno;
1196 bkey = NULL;
1197 impliesNN = NULL;
1198 }
1199
1200 /*
1201 * If we've located the starting boundary key for curattr, we have
1202 * no interest in curattr's other required key
1203 */
1204 if (bkey != NULL)
1205 continue;
1206
1207 /*
1208 * Is this key the starting boundary key for curattr?
1209 *
1210 * If not, does it imply a NOT NULL constraint? (Because
1211 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1212 * *any* inequality key works for that; we need not test.)
1213 */
1214 switch (cur->sk_strategy)
1215 {
1218 if (ScanDirectionIsBackward(dir))
1219 bkey = cur;
1220 else if (impliesNN == NULL)
1221 impliesNN = cur;
1222 break;
1224 bkey = cur;
1225 break;
1228 if (ScanDirectionIsForward(dir))
1229 bkey = cur;
1230 else if (impliesNN == NULL)
1231 impliesNN = cur;
1232 break;
1233 }
1234 }
1235 }
1236
1237 /*
1238 * If we found no usable boundary keys, we have to start from one end of
1239 * the tree. Walk down that edge to the first or last key, and scan from
1240 * there.
1241 *
1242 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1243 */
1244 if (keysz == 0)
1245 return _bt_endpoint(scan, dir);
1246
1247 /*
1248 * We want to start the scan somewhere within the index. Set up an
1249 * insertion scankey we can use to search for the boundary point we
1250 * identified above. The insertion scankey is built using the keys
1251 * identified by startKeys[]. (Remaining insertion scankey fields are
1252 * initialized after initial-positioning scan keys are finalized.)
1253 */
1254 Assert(keysz <= INDEX_MAX_KEYS);
1255 for (int i = 0; i < keysz; i++)
1256 {
1258
1259 Assert(bkey->sk_attno == i + 1);
1260
1261 if (bkey->sk_flags & SK_ROW_HEADER)
1262 {
1263 /*
1264 * Row comparison header: look to the first row member instead
1265 */
1266 ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1267 bool loosen_strat = false,
1268 tighten_strat = false;
1269
1270 /*
1271 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1272 * would've marked the qual as unsatisfiable, preventing us from
1273 * ever getting this far
1274 */
1275 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1276 Assert(subkey->sk_attno == bkey->sk_attno);
1277 Assert(!(subkey->sk_flags & SK_ISNULL));
1278
1279 /*
1280 * This is either a > or >= key (during backwards scans it is
1281 * either < or <=) that was marked required during preprocessing.
1282 * Later so->keyData[] keys can't have been marked required, so
1283 * our row compare header key must be the final startKeys[] entry.
1284 */
1285 Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
1286 Assert(subkey->sk_strategy == bkey->sk_strategy);
1287 Assert(subkey->sk_strategy == strat_total);
1288 Assert(i == keysz - 1);
1289
1290 /*
1291 * The member scankeys are already in insertion format (ie, they
1292 * have sk_func = 3-way-comparison function)
1293 */
1294 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1295
1296 /*
1297 * Now look to later row compare members.
1298 *
1299 * If there's an "index attribute gap" between two row compare
1300 * members, the second member won't have been marked required, and
1301 * so can't be used as a starting boundary key here. The part of
1302 * the row comparison that we do still use has to be treated as a
1303 * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1304 * with an omitted intervening index attribute "b" will use an
1305 * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1306 * the leaf level might satisfy the row compare qual.
1307 *
1308 * We're able to use a _more_ restrictive strategy when we reach a
1309 * NULL row compare member, since they're always unsatisfiable.
1310 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1311 * insertion scan key "a > 1". All tuples where "a = 1" cannot
1312 * possibly satisfy the row compare qual, so this is safe.
1313 */
1314 Assert(!(subkey->sk_flags & SK_ROW_END));
1315 for (;;)
1316 {
1317 subkey++;
1318 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1319
1320 if (subkey->sk_flags & SK_ISNULL)
1321 {
1322 /*
1323 * NULL member key, can only use earlier keys.
1324 *
1325 * We deliberately avoid checking if this key is marked
1326 * required. All earlier keys are required, and this key
1327 * is unsatisfiable either way, so we can't miss anything.
1328 */
1329 tighten_strat = true;
1330 break;
1331 }
1332
1333 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1334 {
1335 /* nonrequired member key, can only use earlier keys */
1336 loosen_strat = true;
1337 break;
1338 }
1339
1340 Assert(subkey->sk_attno == keysz + 1);
1341 Assert(subkey->sk_strategy == bkey->sk_strategy);
1342 Assert(keysz < INDEX_MAX_KEYS);
1343
1344 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
1345 keysz++;
1346
1347 if (subkey->sk_flags & SK_ROW_END)
1348 break;
1349 }
1351 if (loosen_strat)
1352 {
1353 /* Use less restrictive strategy (and fewer member keys) */
1354 switch (strat_total)
1355 {
1358 break;
1361 break;
1362 }
1363 }
1364 if (tighten_strat)
1365 {
1366 /* Use more restrictive strategy (and fewer member keys) */
1367 switch (strat_total)
1368 {
1371 break;
1374 break;
1375 }
1376 }
1377
1378 /* Done (row compare header key is always last startKeys[] key) */
1379 break;
1380 }
1381
1382 /*
1383 * Ordinary comparison key/search-style key.
1384 *
1385 * Transform the search-style scan key to an insertion scan key by
1386 * replacing the sk_func with the appropriate btree 3-way-comparison
1387 * function.
1388 *
1389 * If scankey operator is not a cross-type comparison, we can use the
1390 * cached comparison function; otherwise gotta look it up in the
1391 * catalogs. (That can't lead to infinite recursion, since no
1392 * indexscan initiated by syscache lookup will use cross-data-type
1393 * operators.)
1394 *
1395 * We support the convention that sk_subtype == InvalidOid means the
1396 * opclass input type; this hack simplifies life for ScanKeyInit().
1397 */
1398 if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1399 bkey->sk_subtype == InvalidOid)
1400 {
1402
1403 procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1405 bkey->sk_flags,
1406 bkey->sk_attno,
1408 bkey->sk_subtype,
1409 bkey->sk_collation,
1410 procinfo,
1411 bkey->sk_argument);
1412 }
1413 else
1414 {
1415 RegProcedure cmp_proc;
1416
1417 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1418 rel->rd_opcintype[i],
1419 bkey->sk_subtype, BTORDER_PROC);
1420 if (!RegProcedureIsValid(cmp_proc))
1421 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1422 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1423 bkey->sk_attno, RelationGetRelationName(rel));
1425 bkey->sk_flags,
1426 bkey->sk_attno,
1428 bkey->sk_subtype,
1429 bkey->sk_collation,
1430 cmp_proc,
1431 bkey->sk_argument);
1432 }
1433 }
1434
1435 /*----------
1436 * Examine the selected initial-positioning strategy to determine exactly
1437 * where we need to start the scan, and set flag variables to control the
1438 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1439 * page _bt_search returns).
1440 *----------
1441 */
1442 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1443 inskey.anynullkeys = false; /* unused */
1444 inskey.scantid = NULL;
1445 inskey.keysz = keysz;
1446 switch (strat_total)
1447 {
1449
1450 inskey.nextkey = false;
1451 inskey.backward = true;
1452 break;
1453
1455
1456 inskey.nextkey = true;
1457 inskey.backward = true;
1458 break;
1459
1461
1462 /*
1463 * If a backward scan was specified, need to start with last equal
1464 * item not first one.
1465 */
1466 if (ScanDirectionIsBackward(dir))
1467 {
1468 /*
1469 * This is the same as the <= strategy
1470 */
1471 inskey.nextkey = true;
1472 inskey.backward = true;
1473 }
1474 else
1475 {
1476 /*
1477 * This is the same as the >= strategy
1478 */
1479 inskey.nextkey = false;
1480 inskey.backward = false;
1481 }
1482 break;
1483
1485
1486 /*
1487 * Find first item >= scankey
1488 */
1489 inskey.nextkey = false;
1490 inskey.backward = false;
1491 break;
1492
1494
1495 /*
1496 * Find first item > scankey
1497 */
1498 inskey.nextkey = true;
1499 inskey.backward = false;
1500 break;
1501
1502 default:
1503 /* can't get here, but keep compiler quiet */
1504 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1505 return false;
1506 }
1507
1508 /*
1509 * Use the manufactured insertion scan key to descend the tree and
1510 * position ourselves on the target leaf page.
1511 */
1512 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1513 _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
1514
1515 if (!BufferIsValid(so->currPos.buf))
1516 {
1517 Assert(!so->needPrimScan);
1518
1519 /*
1520 * We only get here if the index is completely empty. Lock relation
1521 * because nothing finer to lock exists. Without a buffer lock, it's
1522 * possible for another transaction to insert data between
1523 * _bt_search() and PredicateLockRelation(). We have to try again
1524 * after taking the relation-level predicate lock, to close a narrow
1525 * window where we wouldn't scan concurrently inserted tuples, but the
1526 * writer wouldn't see our predicate lock.
1527 */
1529 {
1531 _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
1532 }
1533
1534 if (!BufferIsValid(so->currPos.buf))
1535 {
1536 _bt_parallel_done(scan);
1537 return false;
1538 }
1539 }
1540
1541 /* position to the precise item on the page */
1542 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1543
1544 /*
1545 * Now load data from the first page of the scan (usually the page
1546 * currently in so->currPos.buf).
1547 *
1548 * If inskey.nextkey = false and inskey.backward = false, offnum is
1549 * positioned at the first non-pivot tuple >= inskey.scankeys.
1550 *
1551 * If inskey.nextkey = false and inskey.backward = true, offnum is
1552 * positioned at the last non-pivot tuple < inskey.scankeys.
1553 *
1554 * If inskey.nextkey = true and inskey.backward = false, offnum is
1555 * positioned at the first non-pivot tuple > inskey.scankeys.
1556 *
1557 * If inskey.nextkey = true and inskey.backward = true, offnum is
1558 * positioned at the last non-pivot tuple <= inskey.scankeys.
1559 *
1560 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1561 * for the page. For example, when inskey is both < the leaf page's high
1562 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1563 */
1564 if (!_bt_readfirstpage(scan, offnum, dir))
1565 return false;
1566
1567 _bt_returnitem(scan, so);
1568 return true;
1569}
1570
1571/*
1572 * _bt_next() -- Get the next item in a scan.
1573 *
1574 * On entry, so->currPos describes the current page, which may be pinned
1575 * but is not locked, and so->currPos.itemIndex identifies which item was
1576 * previously returned.
1577 *
1578 * On success exit, so->currPos is updated as needed, and _bt_returnitem
1579 * sets the next item to return to the scan. so->currPos remains valid.
1580 *
1581 * On failure exit (no more tuples), we invalidate so->currPos. It'll
1582 * still be possible for the scan to return tuples by changing direction,
1583 * though we'll need to call _bt_first anew in that other direction.
1584 */
1585bool
1587{
1589
1590 Assert(BTScanPosIsValid(so->currPos));
1591
1592 /*
1593 * Advance to next tuple on current page; or if there's no more, try to
1594 * step to the next page with data.
1595 */
1596 if (ScanDirectionIsForward(dir))
1597 {
1598 if (++so->currPos.itemIndex > so->currPos.lastItem)
1599 {
1600 if (!_bt_steppage(scan, dir))
1601 return false;
1602 }
1603 }
1604 else
1605 {
1606 if (--so->currPos.itemIndex < so->currPos.firstItem)
1607 {
1608 if (!_bt_steppage(scan, dir))
1609 return false;
1610 }
1611 }
1612
1613 _bt_returnitem(scan, so);
1614 return true;
1615}
1616
1617/*
1618 * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
1619 * index scan by setting the relevant fields in caller's index scan descriptor
1620 */
1621static inline void
1623{
1624 BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1625
1626 /* Most recent _bt_readpage must have succeeded */
1627 Assert(BTScanPosIsValid(so->currPos));
1628 Assert(so->currPos.itemIndex >= so->currPos.firstItem);
1629 Assert(so->currPos.itemIndex <= so->currPos.lastItem);
1630
1631 /* Return next item, per amgettuple contract */
1632 scan->xs_heaptid = currItem->heapTid;
1633 if (so->currTuples)
1634 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1635}
1636
1637/*
1638 * _bt_steppage() -- Step to next page containing valid data for scan
1639 *
1640 * Wrapper on _bt_readnextpage that performs final steps for the current page.
1641 *
1642 * On entry, so->currPos must be valid. Its buffer will be pinned, though
1643 * never locked. (Actually, when so->dropPin there won't even be a pin held,
1644 * though so->currPos.currPage must still be set to a valid block number.)
1645 */
1646static bool
1648{
1650 BlockNumber blkno,
1652
1653 Assert(BTScanPosIsValid(so->currPos));
1654
1655 /* Before leaving current page, deal with any killed items */
1656 if (so->numKilled > 0)
1657 _bt_killitems(scan);
1658
1659 /*
1660 * Before we modify currPos, make a copy of the page data if there was a
1661 * mark position that needs it.
1662 */
1663 if (so->markItemIndex >= 0)
1664 {
1665 /* bump pin on current buffer for assignment to mark buffer */
1666 if (BTScanPosIsPinned(so->currPos))
1667 IncrBufferRefCount(so->currPos.buf);
1668 memcpy(&so->markPos, &so->currPos,
1670 so->currPos.lastItem * sizeof(BTScanPosItem));
1671 if (so->markTuples)
1672 memcpy(so->markTuples, so->currTuples,
1673 so->currPos.nextTupleOffset);
1674 so->markPos.itemIndex = so->markItemIndex;
1675 so->markItemIndex = -1;
1676
1677 /*
1678 * If we're just about to start the next primitive index scan
1679 * (possible with a scan that has arrays keys, and needs to skip to
1680 * continue in the current scan direction), moreLeft/moreRight only
1681 * indicate the end of the current primitive index scan. They must
1682 * never be taken to indicate that the top-level index scan has ended
1683 * (that would be wrong).
1684 *
1685 * We could handle this case by treating the current array keys as
1686 * markPos state. But depending on the current array state like this
1687 * would add complexity. Instead, we just unset markPos's copy of
1688 * moreRight or moreLeft (whichever might be affected), while making
1689 * btrestrpos reset the scan's arrays to their initial scan positions.
1690 * In effect, btrestrpos leaves advancing the arrays up to the first
1691 * _bt_readpage call (that takes place after it has restored markPos).
1692 */
1693 if (so->needPrimScan)
1694 {
1695 if (ScanDirectionIsForward(so->currPos.dir))
1696 so->markPos.moreRight = true;
1697 else
1698 so->markPos.moreLeft = true;
1699 }
1700
1701 /* mark/restore not supported by parallel scans */
1702 Assert(!scan->parallel_scan);
1703 }
1704
1705 BTScanPosUnpinIfPinned(so->currPos);
1706
1707 /* Walk to the next page with data */
1708 if (ScanDirectionIsForward(dir))
1709 blkno = so->currPos.nextPage;
1710 else
1711 blkno = so->currPos.prevPage;
1712 lastcurrblkno = so->currPos.currPage;
1713
1714 /*
1715 * Cancel primitive index scans that were scheduled when the call to
1716 * _bt_readpage for currPos happened to use the opposite direction to the
1717 * one that we're stepping in now. (It's okay to leave the scan's array
1718 * keys as-is, since the next _bt_readpage will advance them.)
1719 */
1720 if (so->currPos.dir != dir)
1721 so->needPrimScan = false;
1722
1723 return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
1724}
1725
1726/*
1727 * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
1728 *
1729 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1730 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1731 * When we're passed an offnum past the end of the page, we might still manage
1732 * to stop the scan on this page by calling _bt_checkkeys against the high
1733 * key. See _bt_readpage for full details.
1734 *
1735 * On entry, so->currPos must be pinned and locked (so offnum stays valid).
1736 * Parallel scan callers must have seized the scan before calling here.
1737 *
1738 * On exit, we'll have updated so->currPos and retained locks and pins
1739 * according to the same rules as those laid out for _bt_readnextpage exit.
1740 * Like _bt_readnextpage, our return value indicates if there are any matching
1741 * records in the given direction.
1742 *
1743 * We always release the scan for a parallel scan caller, regardless of
1744 * success or failure; we'll call _bt_parallel_release as soon as possible.
1745 */
1746static bool
1748{
1750
1751 so->numKilled = 0; /* just paranoia */
1752 so->markItemIndex = -1; /* ditto */
1753
1754 /* Initialize so->currPos for the first page (page in so->currPos.buf) */
1755 if (so->needPrimScan)
1756 {
1757 Assert(so->numArrayKeys);
1758
1759 so->currPos.moreLeft = true;
1760 so->currPos.moreRight = true;
1761 so->needPrimScan = false;
1762 }
1763 else if (ScanDirectionIsForward(dir))
1764 {
1765 so->currPos.moreLeft = false;
1766 so->currPos.moreRight = true;
1767 }
1768 else
1769 {
1770 so->currPos.moreLeft = true;
1771 so->currPos.moreRight = false;
1772 }
1773
1774 /*
1775 * Attempt to load matching tuples from the first page.
1776 *
1777 * Note that _bt_readpage will finish initializing the so->currPos fields.
1778 * _bt_readpage also releases parallel scan (even when it returns false).
1779 */
1780 if (_bt_readpage(scan, dir, offnum, true))
1781 {
1782 Relation rel = scan->indexRelation;
1783
1784 /*
1785 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1786 * so->currPos.buf in preparation for btgettuple returning tuples.
1787 */
1788 Assert(BTScanPosIsPinned(so->currPos));
1790 return true;
1791 }
1792
1793 /* There's no actually-matching data on the page in so->currPos.buf */
1794 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1795
1796 /* Call _bt_readnextpage using its _bt_steppage wrapper function */
1797 if (!_bt_steppage(scan, dir))
1798 return false;
1799
1800 /* _bt_readpage for a later page (now in so->currPos) succeeded */
1801 return true;
1802}
1803
1804/*
1805 * _bt_readnextpage() -- Read next page containing valid data for _bt_next
1806 *
1807 * Caller's blkno is the next interesting page's link, taken from either the
1808 * previously-saved right link or left link. lastcurrblkno is the page that
1809 * was current at the point where the blkno link was saved, which we use to
1810 * reason about concurrent page splits/page deletions during backwards scans.
1811 * In the common case where seized=false, blkno is either so->currPos.nextPage
1812 * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
1813 *
1814 * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
1815 * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
1816 * won't need to be read again in almost all cases). Parallel scan callers
1817 * that seized the scan before calling here should pass seized=true; such a
1818 * caller's blkno and lastcurrblkno arguments come from the seized scan.
1819 * seized=false callers just pass us the blkno/lastcurrblkno taken from their
1820 * so->currPos, which (along with so->currPos itself) can be used to end the
1821 * scan. A seized=false caller's blkno can never be assumed to be the page
1822 * that must be read next during a parallel scan, though. We must figure that
1823 * part out for ourselves by seizing the scan (the correct page to read might
1824 * already be beyond the seized=false caller's blkno during a parallel scan,
1825 * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
1826 * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
1827 *
1828 * On success exit, so->currPos is updated to contain data from the next
1829 * interesting page, and we return true. We hold a pin on the buffer on
1830 * success exit (except during so->dropPin index scans, when we drop the pin
1831 * eagerly to avoid blocking VACUUM).
1832 *
1833 * If there are no more matching records in the given direction, we invalidate
1834 * so->currPos (while ensuring it retains no locks or pins), and return false.
1835 *
1836 * We always release the scan for a parallel scan caller, regardless of
1837 * success or failure; we'll call _bt_parallel_release as soon as possible.
1838 */
1839static bool
1842{
1843 Relation rel = scan->indexRelation;
1845
1846 Assert(so->currPos.currPage == lastcurrblkno || seized);
1847 Assert(!(blkno == P_NONE && seized));
1848 Assert(!BTScanPosIsPinned(so->currPos));
1849
1850 /*
1851 * Remember that the scan already read lastcurrblkno, a page to the left
1852 * of blkno (or remember reading a page to the right, for backwards scans)
1853 */
1854 if (ScanDirectionIsForward(dir))
1855 so->currPos.moreLeft = true;
1856 else
1857 so->currPos.moreRight = true;
1858
1859 for (;;)
1860 {
1861 Page page;
1862 BTPageOpaque opaque;
1863
1864 if (blkno == P_NONE ||
1866 !so->currPos.moreRight : !so->currPos.moreLeft))
1867 {
1868 /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
1869 Assert(so->currPos.currPage == lastcurrblkno && !seized);
1870 BTScanPosInvalidate(so->currPos);
1871 _bt_parallel_done(scan); /* iff !so->needPrimScan */
1872 return false;
1873 }
1874
1875 Assert(!so->needPrimScan);
1876
1877 /* parallel scan must never actually visit so->currPos blkno */
1878 if (!seized && scan->parallel_scan != NULL &&
1879 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
1880 {
1881 /* whole scan is now done (or another primitive scan required) */
1882 BTScanPosInvalidate(so->currPos);
1883 return false;
1884 }
1885
1886 if (ScanDirectionIsForward(dir))
1887 {
1888 /* read blkno, but check for interrupts first */
1890 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1891 }
1892 else
1893 {
1894 /* read blkno, avoiding race (also checks for interrupts) */
1895 so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
1897 if (so->currPos.buf == InvalidBuffer)
1898 {
1899 /* must have been a concurrent deletion of leftmost page */
1900 BTScanPosInvalidate(so->currPos);
1901 _bt_parallel_done(scan);
1902 return false;
1903 }
1904 }
1905
1906 page = BufferGetPage(so->currPos.buf);
1907 opaque = BTPageGetOpaque(page);
1908 lastcurrblkno = blkno;
1909 if (likely(!P_IGNORE(opaque)))
1910 {
1911 /* see if there are any matches on this page */
1912 if (ScanDirectionIsForward(dir))
1913 {
1914 /* note that this will clear moreRight if we can stop */
1915 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
1916 break;
1917 blkno = so->currPos.nextPage;
1918 }
1919 else
1920 {
1921 /* note that this will clear moreLeft if we can stop */
1922 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
1923 break;
1924 blkno = so->currPos.prevPage;
1925 }
1926 }
1927 else
1928 {
1929 /* _bt_readpage not called, so do all this for ourselves */
1930 if (ScanDirectionIsForward(dir))
1931 blkno = opaque->btpo_next;
1932 else
1933 blkno = opaque->btpo_prev;
1934 if (scan->parallel_scan != NULL)
1935 _bt_parallel_release(scan, blkno, lastcurrblkno);
1936 }
1937
1938 /* no matching tuples on this page */
1939 _bt_relbuf(rel, so->currPos.buf);
1940 seized = false; /* released by _bt_readpage (or by us) */
1941 }
1942
1943 /*
1944 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1945 * so->currPos.buf in preparation for btgettuple returning tuples.
1946 */
1947 Assert(so->currPos.currPage == blkno);
1948 Assert(BTScanPosIsPinned(so->currPos));
1950
1951 return true;
1952}
1953
1954/*
1955 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
1956 * recovering from concurrent page splits/page deletions when necessary
1957 *
1958 * Called during backwards scans, to deal with their unique concurrency rules.
1959 *
1960 * blkno points to the block number of the page that we expect to move the
1961 * scan to. We'll successfully move the scan there when we find that its
1962 * right sibling link still points to lastcurrblkno (the page we just read).
1963 * Otherwise, we have to figure out which page is the correct one for the scan
1964 * to now read the hard way, reasoning about concurrent splits and deletions.
1965 * See nbtree/README.
1966 *
1967 * On return, we have both a pin and a read lock on the returned page, whose
1968 * block number will be set in *blkno. Returns InvalidBuffer if there is no
1969 * page to the left (no lock or pin is held in that case).
1970 *
1971 * It is possible for the returned leaf page to be half-dead; caller must
1972 * check that condition and step left again when required.
1973 */
1974static Buffer
1977{
1978 BlockNumber origblkno = *blkno; /* detects circular links */
1979
1980 for (;;)
1981 {
1982 Buffer buf;
1983 Page page;
1984 BTPageOpaque opaque;
1985 int tries;
1986
1987 /* check for interrupts while we're not holding any buffer lock */
1989 buf = _bt_getbuf(rel, *blkno, BT_READ);
1990 page = BufferGetPage(buf);
1991 opaque = BTPageGetOpaque(page);
1992
1993 /*
1994 * If this isn't the page we want, walk right till we find what we
1995 * want --- but go no more than four hops (an arbitrary limit). If we
1996 * don't find the correct page by then, the most likely bet is that
1997 * lastcurrblkno got deleted and isn't in the sibling chain at all
1998 * anymore, not that its left sibling got split more than four times.
1999 *
2000 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2001 * because half-dead pages are still in the sibling chain.
2002 */
2003 tries = 0;
2004 for (;;)
2005 {
2006 if (likely(!P_ISDELETED(opaque) &&
2007 opaque->btpo_next == lastcurrblkno))
2008 {
2009 /* Found desired page, return it */
2010 return buf;
2011 }
2012 if (P_RIGHTMOST(opaque) || ++tries > 4)
2013 break;
2014 /* step right */
2015 *blkno = opaque->btpo_next;
2016 buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2017 page = BufferGetPage(buf);
2018 opaque = BTPageGetOpaque(page);
2019 }
2020
2021 /*
2022 * Return to the original page (usually the page most recently read by
2023 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2024 * what's up with its prev sibling link
2025 */
2027 page = BufferGetPage(buf);
2028 opaque = BTPageGetOpaque(page);
2029 if (P_ISDELETED(opaque))
2030 {
2031 /*
2032 * It was deleted. Move right to first nondeleted page (there
2033 * must be one); that is the page that has acquired the deleted
2034 * one's keyspace, so stepping left from it will take us where we
2035 * want to be.
2036 */
2037 for (;;)
2038 {
2039 if (P_RIGHTMOST(opaque))
2040 elog(ERROR, "fell off the end of index \"%s\"",
2042 lastcurrblkno = opaque->btpo_next;
2044 page = BufferGetPage(buf);
2045 opaque = BTPageGetOpaque(page);
2046 if (!P_ISDELETED(opaque))
2047 break;
2048 }
2049 }
2050 else
2051 {
2052 /*
2053 * Original lastcurrblkno wasn't deleted; the explanation had
2054 * better be that the page to the left got split or deleted.
2055 * Without this check, we risk going into an infinite loop.
2056 */
2057 if (opaque->btpo_prev == origblkno)
2058 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2060 /* Okay to try again, since left sibling link changed */
2061 }
2062
2063 /*
2064 * Original lastcurrblkno from caller was concurrently deleted (could
2065 * also have been a great many concurrent left sibling page splits).
2066 * Found a non-deleted page that should now act as our lastcurrblkno.
2067 */
2068 if (P_LEFTMOST(opaque))
2069 {
2070 /* New lastcurrblkno has no left sibling (concurrently deleted) */
2071 _bt_relbuf(rel, buf);
2072 break;
2073 }
2074
2075 /* Start from scratch with new lastcurrblkno's blkno/prev link */
2076 *blkno = origblkno = opaque->btpo_prev;
2077 _bt_relbuf(rel, buf);
2078 }
2079
2080 return InvalidBuffer;
2081}
2082
2083/*
2084 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2085 *
2086 * If the index is empty, we will return InvalidBuffer; any other failure
2087 * condition causes ereport(). We will not return a dead page.
2088 *
2089 * The returned buffer is pinned and read-locked.
2090 */
2091Buffer
2092_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2093{
2094 Buffer buf;
2095 Page page;
2096 BTPageOpaque opaque;
2097 OffsetNumber offnum;
2098 BlockNumber blkno;
2099 IndexTuple itup;
2100
2101 /*
2102 * If we are looking for a leaf page, okay to descend from fast root;
2103 * otherwise better descend from true root. (There is no point in being
2104 * smarter about intermediate levels.)
2105 */
2106 if (level == 0)
2107 buf = _bt_getroot(rel, NULL, BT_READ);
2108 else
2109 buf = _bt_gettrueroot(rel);
2110
2111 if (!BufferIsValid(buf))
2112 return InvalidBuffer;
2113
2114 page = BufferGetPage(buf);
2115 opaque = BTPageGetOpaque(page);
2116
2117 for (;;)
2118 {
2119 /*
2120 * If we landed on a deleted page, step right to find a live page
2121 * (there must be one). Also, if we want the rightmost page, step
2122 * right if needed to get to it (this could happen if the page split
2123 * since we obtained a pointer to it).
2124 */
2125 while (P_IGNORE(opaque) ||
2126 (rightmost && !P_RIGHTMOST(opaque)))
2127 {
2128 blkno = opaque->btpo_next;
2129 if (blkno == P_NONE)
2130 elog(ERROR, "fell off the end of index \"%s\"",
2132 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2133 page = BufferGetPage(buf);
2134 opaque = BTPageGetOpaque(page);
2135 }
2136
2137 /* Done? */
2138 if (opaque->btpo_level == level)
2139 break;
2140 if (opaque->btpo_level < level)
2141 ereport(ERROR,
2143 errmsg_internal("btree level %u not found in index \"%s\"",
2144 level, RelationGetRelationName(rel))));
2145
2146 /* Descend to leftmost or rightmost child page */
2147 if (rightmost)
2148 offnum = PageGetMaxOffsetNumber(page);
2149 else
2150 offnum = P_FIRSTDATAKEY(opaque);
2151
2153 elog(PANIC, "offnum out of range");
2154
2155 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2156 blkno = BTreeTupleGetDownLink(itup);
2157
2158 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2159 page = BufferGetPage(buf);
2160 opaque = BTPageGetOpaque(page);
2161 }
2162
2163 return buf;
2164}
2165
2166/*
2167 * _bt_endpoint() -- Find the first or last page in the index, and scan
2168 * from there to the first key satisfying all the quals.
2169 *
2170 * This is used by _bt_first() to set up a scan when we've determined
2171 * that the scan must start at the beginning or end of the index (for
2172 * a forward or backward scan respectively).
2173 *
2174 * Parallel scan callers must have seized the scan before calling here.
2175 * Exit conditions are the same as for _bt_first().
2176 */
2177static bool
2179{
2180 Relation rel = scan->indexRelation;
2182 Page page;
2183 BTPageOpaque opaque;
2185
2186 Assert(!BTScanPosIsValid(so->currPos));
2187 Assert(!so->needPrimScan);
2188
2189 /*
2190 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2191 * version of _bt_search().
2192 */
2193 so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2194
2195 if (!BufferIsValid(so->currPos.buf))
2196 {
2197 /*
2198 * Empty index. Lock the whole relation, as nothing finer to lock
2199 * exists.
2200 */
2202 _bt_parallel_done(scan);
2203 return false;
2204 }
2205
2206 page = BufferGetPage(so->currPos.buf);
2207 opaque = BTPageGetOpaque(page);
2208 Assert(P_ISLEAF(opaque));
2209
2210 if (ScanDirectionIsForward(dir))
2211 {
2212 /* There could be dead pages to the left, so not this: */
2213 /* Assert(P_LEFTMOST(opaque)); */
2214
2215 start = P_FIRSTDATAKEY(opaque);
2216 }
2217 else if (ScanDirectionIsBackward(dir))
2218 {
2219 Assert(P_RIGHTMOST(opaque));
2220
2222 }
2223 else
2224 {
2225 elog(ERROR, "invalid scan direction: %d", (int) dir);
2226 start = 0; /* keep compiler quiet */
2227 }
2228
2229 /*
2230 * Now load data from the first page of the scan.
2231 */
2232 if (!_bt_readfirstpage(scan, start, dir))
2233 return false;
2234
2235 _bt_returnitem(scan, so);
2236 return true;
2237}
int16 AttrNumber
Definition attnum.h:21
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5537
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4632
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:379
PageData * Page
Definition bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:397
#define RegProcedureIsValid(p)
Definition c.h:864
#define Min(x, y)
Definition c.h:1093
#define INVERT_COMPARE_RESULT(var)
Definition c.h:1195
#define likely(x)
Definition c.h:431
#define Assert(condition)
Definition c.h:945
regproc RegProcedure
Definition c.h:736
int32_t int32
Definition c.h:614
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
struct cursor * cur
Definition ecpg.c:29
int errcode(int sqlerrcode)
Definition elog.c:874
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define PANIC
Definition elog.h:42
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition fmgr.c:1151
return str start
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition indexam.c:917
int i
Definition isn.c:77
#define ItemIdIsDead(itemId)
Definition itemid.h:113
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
IndexTupleData * IndexTuple
Definition itup.h:53
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition itup.h:131
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition lsyscache.c:915
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition nbtinsert.c:2272
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition nbtpage.c:1008
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1028
Buffer _bt_gettrueroot(Relation rel)
Definition nbtpage.c:585
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition nbtpage.c:744
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition nbtpage.c:850
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1075
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1044
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition nbtpage.c:347
void _bt_preprocess_keys(IndexScanDesc scan)
bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition nbtree.c:869
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1034
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition nbtree.c:1007
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
BTStackData * BTStack
Definition nbtree.h:750
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define SK_BT_SKIP
Definition nbtree.h:1106
#define P_HIKEY
Definition nbtree.h:368
#define BTORDER_PROC
Definition nbtree.h:717
#define P_LEFTMOST(opaque)
Definition nbtree.h:219
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define SK_BT_PRIOR
Definition nbtree.h:1112
#define P_ISDELETED(opaque)
Definition nbtree.h:223
#define SK_BT_NEXT
Definition nbtree.h:1111
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
#define P_NONE
Definition nbtree.h:213
#define SK_BT_REQBKWD
Definition nbtree.h:1105
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define SK_BT_NULLS_FIRST
Definition nbtree.h:1117
#define P_INCOMPLETE_SPLIT(opaque)
Definition nbtree.h:228
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition nbtree.h:545
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define BT_READ
Definition nbtree.h:730
#define SK_BT_REQFWD
Definition nbtree.h:1104
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition nbtree.h:557
#define SK_BT_DESC
Definition nbtree.h:1116
#define P_IGNORE(opaque)
Definition nbtree.h:226
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition nbtree.h:665
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition nbtree.h:639
#define BT_WRITE
Definition nbtree.h:731
#define BTreeTupleGetNAtts(itup, rel)
Definition nbtree.h:578
#define SK_BT_MINVAL
Definition nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition nbtsearch.c:2092
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition nbtsearch.c:242
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:603
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:883
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition nbtsearch.c:1840
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition nbtsearch.c:344
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:2178
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1647
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition nbtsearch.c:1747
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition nbtsearch.c:1975
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition nbtsearch.c:475
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1586
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition nbtsearch.c:1622
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:689
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access, bool returnstack)
Definition nbtsearch.c:100
static void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
Definition nbtsearch.c:55
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:189
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition nbtutils.c:959
#define InvalidOffsetNumber
Definition off.h:26
uint16 OffsetNumber
Definition off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
#define INDEX_MAX_KEYS
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pgstat_count_index_scan(rel)
Definition pgstat.h:708
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define InvalidOid
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2585
static int fb(int x)
short access
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition scankey.c:101
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
ScanDirection
Definition sdir.h:25
#define SK_ROW_HEADER
Definition skey.h:117
#define SK_ROW_MEMBER
Definition skey.h:118
#define SK_SEARCHNOTNULL
Definition skey.h:122
#define SK_ROW_END
Definition skey.h:119
ScanKeyData * ScanKey
Definition skey.h:75
#define SK_ISNULL
Definition skey.h:115
uint16 StrategyNumber
Definition stratnum.h:22
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTEqualStrategyNumber
Definition stratnum.h:31
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32
ScanKey high_compare
Definition nbtree.h:1050
ScanKey low_compare
Definition nbtree.h:1049
BlockNumber btpo_next
Definition nbtree.h:66
BlockNumber btpo_prev
Definition nbtree.h:65
uint32 btpo_level
Definition nbtree.h:67
ItemPointer scantid
Definition nbtree.h:802
bool allequalimage
Definition nbtree.h:798
bool heapkeyspace
Definition nbtree.h:797
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition nbtree.h:804
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:192
struct IndexScanInstrumentation * instrument
Definition relscan.h:160
IndexTuple xs_itup
Definition relscan.h:168
Relation indexRelation
Definition relscan.h:138
ItemPointerData xs_heaptid
Definition relscan.h:173
struct SnapshotData * xs_snapshot
Definition relscan.h:139
Oid * rd_opcintype
Definition rel.h:208
Oid * rd_opfamily
Definition rel.h:207
int sk_flags
Definition skey.h:66
static ItemArray items
#define IsolationIsSerializable()
Definition xact.h:53