PostgreSQL Source Code git master
nbtsearch.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtsearch.c
4 * Search code for postgres btrees.
5 *
6 *
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtsearch.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "access/nbtree.h"
19#include "access/relscan.h"
20#include "access/xact.h"
21#include "miscadmin.h"
22#include "pgstat.h"
23#include "storage/predicate.h"
24#include "utils/lsyscache.h"
25#include "utils/rel.h"
26
27
28static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
30 Buffer buf, bool forupdate, BTStack stack,
31 int access);
34 OffsetNumber offnum);
35static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
36static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
37static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
38 ScanDirection dir);
39static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
40 BlockNumber lastcurrblkno, ScanDirection dir,
41 bool seized);
43 BlockNumber lastcurrblkno);
44static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
45
46
47/*
48 * _bt_drop_lock_and_maybe_pin()
49 *
50 * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
51 * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
52 */
53static inline void
55{
56 if (!so->dropPin)
57 {
58 /* Just drop the lock (not the pin) */
59 _bt_unlockbuf(rel, so->currPos.buf);
60 return;
61 }
62
63 /*
64 * Drop both the lock and the pin.
65 *
66 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
67 * when concurrent heap TID recycling by VACUUM might have taken place.
68 */
71 _bt_relbuf(rel, so->currPos.buf);
73}
74
75/*
76 * _bt_search() -- Search the tree for a particular scankey,
77 * or more precisely for the first leaf page it could be on.
78 *
79 * The passed scankey is an insertion-type scankey (see nbtree/README),
80 * but it can omit the rightmost column(s) of the index.
81 *
82 * Return value is a stack of parent-page pointers (i.e. there is no entry for
83 * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
84 * which is locked and pinned. No locks are held on the parent pages,
85 * however!
86 *
87 * The returned buffer is locked according to access parameter. Additionally,
88 * access = BT_WRITE will allow an empty root page to be created and returned.
89 * When access = BT_READ, an empty index will result in *bufP being set to
90 * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
91 * during the search will be finished.
92 *
93 * heaprel must be provided by callers that pass access = BT_WRITE, since we
94 * might need to allocate a new root page for caller -- see _bt_allocbuf.
95 */
98 int access)
99{
100 BTStack stack_in = NULL;
101 int page_access = BT_READ;
102
103 /* heaprel must be set whenever _bt_allocbuf is reachable */
105 Assert(access == BT_READ || heaprel != NULL);
106
107 /* Get the root page to start with */
108 *bufP = _bt_getroot(rel, heaprel, access);
109
110 /* If index is empty and access = BT_READ, no root page is created. */
111 if (!BufferIsValid(*bufP))
112 return (BTStack) NULL;
113
114 /* Loop iterates once per level descended in the tree */
115 for (;;)
116 {
117 Page page;
118 BTPageOpaque opaque;
119 OffsetNumber offnum;
120 ItemId itemid;
121 IndexTuple itup;
122 BlockNumber child;
123 BTStack new_stack;
124
125 /*
126 * Race -- the page we just grabbed may have split since we read its
127 * downlink in its parent page (or the metapage). If it has, we may
128 * need to move right to its new sibling. Do that.
129 *
130 * In write-mode, allow _bt_moveright to finish any incomplete splits
131 * along the way. Strictly speaking, we'd only need to finish an
132 * incomplete split on the leaf page we're about to insert to, not on
133 * any of the upper levels (internal pages with incomplete splits are
134 * also taken care of in _bt_getstackbuf). But this is a good
135 * opportunity to finish splits of internal pages too.
136 */
137 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
138 stack_in, page_access);
139
140 /* if this is a leaf page, we're done */
141 page = BufferGetPage(*bufP);
142 opaque = BTPageGetOpaque(page);
143 if (P_ISLEAF(opaque))
144 break;
145
146 /*
147 * Find the appropriate pivot tuple on this page. Its downlink points
148 * to the child page that we're about to descend to.
149 */
150 offnum = _bt_binsrch(rel, key, *bufP);
151 itemid = PageGetItemId(page, offnum);
152 itup = (IndexTuple) PageGetItem(page, itemid);
153 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
154 child = BTreeTupleGetDownLink(itup);
155
156 /*
157 * We need to save the location of the pivot tuple we chose in a new
158 * stack entry for this page/level. If caller ends up splitting a
159 * page one level down, it usually ends up inserting a new pivot
160 * tuple/downlink immediately after the location recorded here.
161 */
162 new_stack = (BTStack) palloc_object(BTStackData);
163 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
164 new_stack->bts_offset = offnum;
165 new_stack->bts_parent = stack_in;
166
167 /*
168 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
169 * we're on the level 1 and asked to lock leaf page in write mode,
170 * then lock next page in write mode, because it must be a leaf.
171 */
172 if (opaque->btpo_level == 1 && access == BT_WRITE)
173 page_access = BT_WRITE;
174
175 /* drop the read lock on the page, then acquire one on its child */
176 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
177
178 /* okay, all set to move down a level */
179 stack_in = new_stack;
180 }
181
182 /*
183 * If we're asked to lock leaf in write mode, but didn't manage to, then
184 * relock. This should only happen when the root page is a leaf page (and
185 * the only page in the index other than the metapage).
186 */
187 if (access == BT_WRITE && page_access == BT_READ)
188 {
189 /* trade in our read lock for a write lock */
190 _bt_unlockbuf(rel, *bufP);
191 _bt_lockbuf(rel, *bufP, BT_WRITE);
192
193 /*
194 * Race -- the leaf page may have split after we dropped the read lock
195 * but before we acquired a write lock. If it has, we may need to
196 * move right to its new sibling. Do that.
197 */
198 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
199 }
200
201 return stack_in;
202}
203
204/*
205 * _bt_moveright() -- move right in the btree if necessary.
206 *
207 * When we follow a pointer to reach a page, it is possible that
208 * the page has changed in the meanwhile. If this happens, we're
209 * guaranteed that the page has "split right" -- that is, that any
210 * data that appeared on the page originally is either on the page
211 * or strictly to the right of it.
212 *
213 * This routine decides whether or not we need to move right in the
214 * tree by examining the high key entry on the page. If that entry is
215 * strictly less than the scankey, or <= the scankey in the
216 * key.nextkey=true case, then we followed the wrong link and we need
217 * to move right.
218 *
219 * The passed insertion-type scankey can omit the rightmost column(s) of the
220 * index. (see nbtree/README)
221 *
222 * When key.nextkey is false (the usual case), we are looking for the first
223 * item >= key. When key.nextkey is true, we are looking for the first item
224 * strictly greater than key.
225 *
226 * If forupdate is true, we will attempt to finish any incomplete splits
227 * that we encounter. This is required when locking a target page for an
228 * insertion, because we don't allow inserting on a page before the split is
229 * completed. 'heaprel' and 'stack' are only used if forupdate is true.
230 *
231 * On entry, we have the buffer pinned and a lock of the type specified by
232 * 'access'. If we move right, we release the buffer and lock and acquire
233 * the same on the right sibling. Return value is the buffer we stop at.
234 */
235static Buffer
237 Relation heaprel,
239 Buffer buf,
240 bool forupdate,
241 BTStack stack,
242 int access)
243{
244 Page page;
245 BTPageOpaque opaque;
246 int32 cmpval;
247
248 Assert(!forupdate || heaprel != NULL);
249
250 /*
251 * When nextkey = false (normal case): if the scan key that brought us to
252 * this page is > the high key stored on the page, then the page has split
253 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
254 * have some duplicates to the right as well as the left, but that's
255 * something that's only ever dealt with on the leaf level, after
256 * _bt_search has found an initial leaf page.)
257 *
258 * When nextkey = true: move right if the scan key is >= page's high key.
259 * (Note that key.scantid cannot be set in this case.)
260 *
261 * The page could even have split more than once, so scan as far as
262 * needed.
263 *
264 * We also have to move right if we followed a link that brought us to a
265 * dead page.
266 */
267 cmpval = key->nextkey ? 0 : 1;
268
269 for (;;)
270 {
271 page = BufferGetPage(buf);
272 opaque = BTPageGetOpaque(page);
273
274 if (P_RIGHTMOST(opaque))
275 break;
276
277 /*
278 * Finish any incomplete splits we encounter along the way.
279 */
280 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
281 {
283
284 /* upgrade our lock if necessary */
285 if (access == BT_READ)
286 {
287 _bt_unlockbuf(rel, buf);
288 _bt_lockbuf(rel, buf, BT_WRITE);
289 }
290
291 if (P_INCOMPLETE_SPLIT(opaque))
292 _bt_finish_split(rel, heaprel, buf, stack);
293 else
294 _bt_relbuf(rel, buf);
295
296 /* re-acquire the lock in the right mode, and re-check */
297 buf = _bt_getbuf(rel, blkno, access);
298 continue;
299 }
300
301 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
302 {
303 /* step right one page */
304 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
305 continue;
306 }
307 else
308 break;
309 }
310
311 if (P_IGNORE(opaque))
312 elog(ERROR, "fell off the end of index \"%s\"",
314
315 return buf;
316}
317
318/*
319 * _bt_binsrch() -- Do a binary search for a key on a particular page.
320 *
321 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
322 * of the last key < given scankey, or last key <= given scankey if nextkey
323 * is true. (Since _bt_compare treats the first data key of such a page as
324 * minus infinity, there will be at least one key < scankey, so the result
325 * always points at one of the keys on the page.)
326 *
327 * On a leaf page, _bt_binsrch() returns the final result of the initial
328 * positioning process that started with _bt_first's call to _bt_search.
329 * We're returning a non-pivot tuple offset, so things are a little different.
330 * It is possible that we'll return an offset that's either past the last
331 * non-pivot slot, or (in the case of a backward scan) before the first slot.
332 *
333 * This procedure is not responsible for walking right, it just examines
334 * the given page. _bt_binsrch() has no lock or refcount side effects
335 * on the buffer.
336 */
337static OffsetNumber
340 Buffer buf)
341{
342 Page page;
343 BTPageOpaque opaque;
344 OffsetNumber low,
345 high;
346 int32 result,
347 cmpval;
348
349 page = BufferGetPage(buf);
350 opaque = BTPageGetOpaque(page);
351
352 /* Requesting nextkey semantics while using scantid seems nonsensical */
353 Assert(!key->nextkey || key->scantid == NULL);
354 /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
355 Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
356
357 low = P_FIRSTDATAKEY(opaque);
358 high = PageGetMaxOffsetNumber(page);
359
360 /*
361 * If there are no keys on the page, return the first available slot. Note
362 * this covers two cases: the page is really empty (no keys), or it
363 * contains only a high key. The latter case is possible after vacuuming.
364 * This can never happen on an internal page, however, since they are
365 * never empty (an internal page must have at least one child).
366 */
367 if (unlikely(high < low))
368 return low;
369
370 /*
371 * Binary search to find the first key on the page >= scan key, or first
372 * key > scankey when nextkey is true.
373 *
374 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
375 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
376 *
377 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
378 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
379 *
380 * We can fall out when high == low.
381 */
382 high++; /* establish the loop invariant for high */
383
384 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
385
386 while (high > low)
387 {
388 OffsetNumber mid = low + ((high - low) / 2);
389
390 /* We have low <= mid < high, so mid points at a real slot */
391
392 result = _bt_compare(rel, key, page, mid);
393
394 if (result >= cmpval)
395 low = mid + 1;
396 else
397 high = mid;
398 }
399
400 /*
401 * At this point we have high == low.
402 *
403 * On a leaf page we always return the first non-pivot tuple >= scan key
404 * (resp. > scan key) for forward scan callers. For backward scans, it's
405 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
406 */
407 if (P_ISLEAF(opaque))
408 {
409 /*
410 * In the backward scan case we're supposed to locate the last
411 * matching tuple on the leaf level -- not the first matching tuple
412 * (the last tuple will be the first one returned by the scan).
413 *
414 * At this point we've located the first non-pivot tuple immediately
415 * after the last matching tuple (which might just be maxoff + 1).
416 * Compensate by stepping back.
417 */
418 if (key->backward)
419 return OffsetNumberPrev(low);
420
421 return low;
422 }
423
424 /*
425 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
426 * There must be one if _bt_compare() is playing by the rules.
427 *
428 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
429 * a truncated -inf heap TID is usually enough to prevent it altogether.
430 * Even omitted scan key entries are treated as > truncated attributes.
431 *
432 * However, during backward scans _bt_compare() interprets omitted scan
433 * key attributes as == corresponding truncated -inf attributes instead.
434 * This works just like < would work here. Under this scheme, < strategy
435 * backward scans will always directly descend to the correct leaf page.
436 * In particular, they will never incur an "extra" leaf page access with a
437 * scan key that happens to contain the same prefix of values as some
438 * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
439 * it uses a leaf page high key to "re-find" a page undergoing deletion.
440 */
441 Assert(low > P_FIRSTDATAKEY(opaque));
442
443 return OffsetNumberPrev(low);
444}
445
446/*
447 *
448 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
449 *
450 * Like _bt_binsrch(), but with support for caching the binary search
451 * bounds. Only used during insertion, and only on the leaf page that it
452 * looks like caller will insert tuple on. Exclusive-locked and pinned
453 * leaf page is contained within insertstate.
454 *
455 * Caches the bounds fields in insertstate so that a subsequent call can
456 * reuse the low and strict high bounds of original binary search. Callers
457 * that use these fields directly must be prepared for the case where low
458 * and/or stricthigh are not on the same page (one or both exceed maxoff
459 * for the page). The case where there are no items on the page (high <
460 * low) makes bounds invalid.
461 *
462 * Caller is responsible for invalidating bounds when it modifies the page
463 * before calling here a second time, and for dealing with posting list
464 * tuple matches (callers can use insertstate's postingoff field to
465 * determine which existing heap TID will need to be replaced by a posting
466 * list split).
467 */
470{
471 BTScanInsert key = insertstate->itup_key;
472 Page page;
473 BTPageOpaque opaque;
474 OffsetNumber low,
475 high,
476 stricthigh;
477 int32 result,
478 cmpval;
479
480 page = BufferGetPage(insertstate->buf);
481 opaque = BTPageGetOpaque(page);
482
483 Assert(P_ISLEAF(opaque));
484 Assert(!key->nextkey);
485 Assert(insertstate->postingoff == 0);
486
487 if (!insertstate->bounds_valid)
488 {
489 /* Start new binary search */
490 low = P_FIRSTDATAKEY(opaque);
491 high = PageGetMaxOffsetNumber(page);
492 }
493 else
494 {
495 /* Restore result of previous binary search against same page */
496 low = insertstate->low;
497 high = insertstate->stricthigh;
498 }
499
500 /* If there are no keys on the page, return the first available slot */
501 if (unlikely(high < low))
502 {
503 /* Caller can't reuse bounds */
504 insertstate->low = InvalidOffsetNumber;
505 insertstate->stricthigh = InvalidOffsetNumber;
506 insertstate->bounds_valid = false;
507 return low;
508 }
509
510 /*
511 * Binary search to find the first key on the page >= scan key. (nextkey
512 * is always false when inserting).
513 *
514 * The loop invariant is: all slots before 'low' are < scan key, all slots
515 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
516 * maintained to save additional search effort for caller.
517 *
518 * We can fall out when high == low.
519 */
520 if (!insertstate->bounds_valid)
521 high++; /* establish the loop invariant for high */
522 stricthigh = high; /* high initially strictly higher */
523
524 cmpval = 1; /* !nextkey comparison value */
525
526 while (high > low)
527 {
528 OffsetNumber mid = low + ((high - low) / 2);
529
530 /* We have low <= mid < high, so mid points at a real slot */
531
532 result = _bt_compare(rel, key, page, mid);
533
534 if (result >= cmpval)
535 low = mid + 1;
536 else
537 {
538 high = mid;
539 if (result != 0)
540 stricthigh = high;
541 }
542
543 /*
544 * If tuple at offset located by binary search is a posting list whose
545 * TID range overlaps with caller's scantid, perform posting list
546 * binary search to set postingoff for caller. Caller must split the
547 * posting list when postingoff is set. This should happen
548 * infrequently.
549 */
550 if (unlikely(result == 0 && key->scantid != NULL))
551 {
552 /*
553 * postingoff should never be set more than once per leaf page
554 * binary search. That would mean that there are duplicate table
555 * TIDs in the index, which is never okay. Check for that here.
556 */
557 if (insertstate->postingoff != 0)
559 (errcode(ERRCODE_INDEX_CORRUPTED),
560 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
563 low, stricthigh,
564 BufferGetBlockNumber(insertstate->buf),
566
567 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
568 }
569 }
570
571 /*
572 * On a leaf page, a binary search always returns the first key >= scan
573 * key (at least in !nextkey case), which could be the last slot + 1. This
574 * is also the lower bound of cached search.
575 *
576 * stricthigh may also be the last slot + 1, which prevents caller from
577 * using bounds directly, but is still useful to us if we're called a
578 * second time with cached bounds (cached low will be < stricthigh when
579 * that happens).
580 */
581 insertstate->low = low;
582 insertstate->stricthigh = stricthigh;
583 insertstate->bounds_valid = true;
584
585 return low;
586}
587
588/*----------
589 * _bt_binsrch_posting() -- posting list binary search.
590 *
591 * Helper routine for _bt_binsrch_insert().
592 *
593 * Returns offset into posting list where caller's scantid belongs.
594 *----------
595 */
596static int
598{
599 IndexTuple itup;
600 ItemId itemid;
601 int low,
602 high,
603 mid,
604 res;
605
606 /*
607 * If this isn't a posting tuple, then the index must be corrupt (if it is
608 * an ordinary non-pivot tuple then there must be an existing tuple with a
609 * heap TID that equals inserter's new heap TID/scantid). Defensively
610 * check that tuple is a posting list tuple whose posting list range
611 * includes caller's scantid.
612 *
613 * (This is also needed because contrib/amcheck's rootdescend option needs
614 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
615 */
616 itemid = PageGetItemId(page, offnum);
617 itup = (IndexTuple) PageGetItem(page, itemid);
618 if (!BTreeTupleIsPosting(itup))
619 return 0;
620
621 Assert(key->heapkeyspace && key->allequalimage);
622
623 /*
624 * In the event that posting list tuple has LP_DEAD bit set, indicate this
625 * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
626 * second call to _bt_binsrch_insert() can take place when its caller has
627 * removed the dead item.
628 */
629 if (ItemIdIsDead(itemid))
630 return -1;
631
632 /* "high" is past end of posting list for loop invariant */
633 low = 0;
634 high = BTreeTupleGetNPosting(itup);
635 Assert(high >= 2);
636
637 while (high > low)
638 {
639 mid = low + ((high - low) / 2);
640 res = ItemPointerCompare(key->scantid,
641 BTreeTupleGetPostingN(itup, mid));
642
643 if (res > 0)
644 low = mid + 1;
645 else if (res < 0)
646 high = mid;
647 else
648 return mid;
649 }
650
651 /* Exact match not found */
652 return low;
653}
654
655/*----------
656 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
657 *
658 * page/offnum: location of btree item to be compared to.
659 *
660 * This routine returns:
661 * <0 if scankey < tuple at offnum;
662 * 0 if scankey == tuple at offnum;
663 * >0 if scankey > tuple at offnum.
664 *
665 * NULLs in the keys are treated as sortable values. Therefore
666 * "equality" does not necessarily mean that the item should be returned
667 * to the caller as a matching key. Similarly, an insertion scankey
668 * with its scantid set is treated as equal to a posting tuple whose TID
669 * range overlaps with their scantid. There generally won't be a
670 * matching TID in the posting tuple, which caller must handle
671 * themselves (e.g., by splitting the posting list tuple).
672 *
673 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
674 * "minus infinity": this routine will always claim it is less than the
675 * scankey. The actual key value stored is explicitly truncated to 0
676 * attributes (explicitly minus infinity) with version 3+ indexes, but
677 * that isn't relied upon. This allows us to implement the Lehman and
678 * Yao convention that the first down-link pointer is before the first
679 * key. See backend/access/nbtree/README for details.
680 *----------
681 */
682int32
685 Page page,
686 OffsetNumber offnum)
687{
688 TupleDesc itupdesc = RelationGetDescr(rel);
689 BTPageOpaque opaque = BTPageGetOpaque(page);
690 IndexTuple itup;
691 ItemPointer heapTid;
692 ScanKey scankey;
693 int ncmpkey;
694 int ntupatts;
695 int32 result;
696
697 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
699 Assert(key->heapkeyspace || key->scantid == NULL);
700
701 /*
702 * Force result ">" if target item is first data item on an internal page
703 * --- see NOTE above.
704 */
705 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
706 return 1;
707
708 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
709 ntupatts = BTreeTupleGetNAtts(itup, rel);
710
711 /*
712 * The scan key is set up with the attribute number associated with each
713 * term in the key. It is important that, if the index is multi-key, the
714 * scan contain the first k key attributes, and that they be in order. If
715 * you think about how multi-key ordering works, you'll understand why
716 * this is.
717 *
718 * We don't test for violation of this condition here, however. The
719 * initial setup for the index scan had better have gotten it right (see
720 * _bt_first).
721 */
722
723 ncmpkey = Min(ntupatts, key->keysz);
724 Assert(key->heapkeyspace || ncmpkey == key->keysz);
725 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
726 scankey = key->scankeys;
727 for (int i = 1; i <= ncmpkey; i++)
728 {
729 Datum datum;
730 bool isNull;
731
732 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
733
734 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
735 {
736 if (isNull)
737 result = 0; /* NULL "=" NULL */
738 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
739 result = -1; /* NULL "<" NOT_NULL */
740 else
741 result = 1; /* NULL ">" NOT_NULL */
742 }
743 else if (isNull) /* key is NOT_NULL and item is NULL */
744 {
745 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
746 result = 1; /* NOT_NULL ">" NULL */
747 else
748 result = -1; /* NOT_NULL "<" NULL */
749 }
750 else
751 {
752 /*
753 * The sk_func needs to be passed the index value as left arg and
754 * the sk_argument as right arg (they might be of different
755 * types). Since it is convenient for callers to think of
756 * _bt_compare as comparing the scankey to the index item, we have
757 * to flip the sign of the comparison result. (Unless it's a DESC
758 * column, in which case we *don't* flip the sign.)
759 */
760 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
761 scankey->sk_collation,
762 datum,
763 scankey->sk_argument));
764
765 if (!(scankey->sk_flags & SK_BT_DESC))
766 INVERT_COMPARE_RESULT(result);
767 }
768
769 /* if the keys are unequal, return the difference */
770 if (result != 0)
771 return result;
772
773 scankey++;
774 }
775
776 /*
777 * All non-truncated attributes (other than heap TID) were found to be
778 * equal. Treat truncated attributes as minus infinity when scankey has a
779 * key attribute value that would otherwise be compared directly.
780 *
781 * Note: it doesn't matter if ntupatts includes non-key attributes;
782 * scankey won't, so explicitly excluding non-key attributes isn't
783 * necessary.
784 */
785 if (key->keysz > ntupatts)
786 return 1;
787
788 /*
789 * Use the heap TID attribute and scantid to try to break the tie. The
790 * rules are the same as any other key attribute -- only the
791 * representation differs.
792 */
793 heapTid = BTreeTupleGetHeapTID(itup);
794 if (key->scantid == NULL)
795 {
796 /*
797 * Forward scans have a scankey that is considered greater than a
798 * truncated pivot tuple if and when the scankey has equal values for
799 * attributes up to and including the least significant untruncated
800 * attribute in tuple. Even attributes that were omitted from the
801 * scan key are considered greater than -inf truncated attributes.
802 * (See _bt_binsrch for an explanation of our backward scan behavior.)
803 *
804 * For example, if an index has the minimum two attributes (single
805 * user key attribute, plus heap TID attribute), and a page's high key
806 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
807 * will not descend to the page to the left. The search will descend
808 * right instead. The truncated attribute in pivot tuple means that
809 * all non-pivot tuples on the page to the left are strictly < 'foo',
810 * so it isn't necessary to descend left. In other words, search
811 * doesn't have to descend left because it isn't interested in a match
812 * that has a heap TID value of -inf.
813 *
814 * Note: the heap TID part of the test ensures that scankey is being
815 * compared to a pivot tuple with one or more truncated -inf key
816 * attributes. The heap TID attribute is the last key attribute in
817 * every index, of course, but other than that it isn't special.
818 */
819 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
820 key->heapkeyspace)
821 return 1;
822
823 /* All provided scankey arguments found to be equal */
824 return 0;
825 }
826
827 /*
828 * Treat truncated heap TID as minus infinity, since scankey has a key
829 * attribute value (scantid) that would otherwise be compared directly
830 */
832 if (heapTid == NULL)
833 return 1;
834
835 /*
836 * Scankey must be treated as equal to a posting list tuple if its scantid
837 * value falls within the range of the posting list. In all other cases
838 * there can only be a single heap TID value, which is compared directly
839 * with scantid.
840 */
842 result = ItemPointerCompare(key->scantid, heapTid);
843 if (result <= 0 || !BTreeTupleIsPosting(itup))
844 return result;
845 else
846 {
847 result = ItemPointerCompare(key->scantid,
849 if (result > 0)
850 return 1;
851 }
852
853 return 0;
854}
855
856/*
857 * _bt_first() -- Find the first item in a scan.
858 *
859 * We need to be clever about the direction of scan, the search
860 * conditions, and the tree ordering. We find the first item (or,
861 * if backwards scan, the last item) in the tree that satisfies the
862 * qualifications in the scan key. On success exit, data about the
863 * matching tuple(s) on the page has been loaded into so->currPos. We'll
864 * drop all locks and hold onto a pin on page's buffer, except during
865 * so->dropPin scans, when we drop both the lock and the pin.
866 * _bt_returnitem sets the next item to return to scan on success exit.
867 *
868 * If there are no matching items in the index, we return false, with no
869 * pins or locks held. so->currPos will remain invalid.
870 *
871 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
872 * are both search-type scankeys (see nbtree/README for more about this).
873 * Within this routine, we build a temporary insertion-type scankey to use
874 * in locating the scan start position.
875 */
876bool
878{
879 Relation rel = scan->indexRelation;
880 BTScanOpaque so = (BTScanOpaque) scan->opaque;
881 BTStack stack;
882 OffsetNumber offnum;
883 BTScanInsertData inskey;
884 ScanKey startKeys[INDEX_MAX_KEYS];
885 ScanKeyData notnullkey;
886 int keysz = 0;
887 StrategyNumber strat_total = InvalidStrategy;
889 lastcurrblkno;
890
892
893 /*
894 * Examine the scan keys and eliminate any redundant keys; also mark the
895 * keys that must be matched to continue the scan.
896 */
898
899 /*
900 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
901 * never be satisfied (eg, x == 1 AND x > 2).
902 */
903 if (!so->qual_ok)
904 {
905 Assert(!so->needPrimScan);
906 _bt_parallel_done(scan);
907 return false;
908 }
909
910 /*
911 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
912 * will likely release the parallel scan later on.
913 */
914 if (scan->parallel_scan != NULL &&
915 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
916 return false;
917
918 /*
919 * Initialize the scan's arrays (if any) for the current scan direction
920 * (except when they were already set to later values as part of
921 * scheduling the primitive index scan that is now underway)
922 */
923 if (so->numArrayKeys && !so->needPrimScan)
924 _bt_start_array_keys(scan, dir);
925
926 if (blkno != InvalidBlockNumber)
927 {
928 /*
929 * We anticipated calling _bt_search, but another worker bet us to it.
930 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
931 */
932 Assert(scan->parallel_scan != NULL);
933 Assert(!so->needPrimScan);
934 Assert(blkno != P_NONE);
935
936 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
937 return false;
938
939 _bt_returnitem(scan, so);
940 return true;
941 }
942
943 /*
944 * Count an indexscan for stats, now that we know that we'll call
945 * _bt_search/_bt_endpoint below
946 */
948 if (scan->instrument)
949 scan->instrument->nsearches++;
950
951 /*----------
952 * Examine the scan keys to discover where we need to start the scan.
953 * The selected scan keys (at most one per index column) are remembered by
954 * storing their addresses into the local startKeys[] array. The final
955 * startKeys[] entry's strategy is set in strat_total. (Actually, there
956 * are a couple of cases where we force a less/more restrictive strategy.)
957 *
958 * We must use the key that was marked required (in the direction opposite
959 * our own scan's) during preprocessing. Each index attribute can only
960 * have one such required key. In general, the keys that we use to find
961 * an initial position when scanning forwards are the same keys that end
962 * the scan on the leaf level when scanning backwards (and vice-versa).
963 *
964 * When the scan keys include cross-type operators, _bt_preprocess_keys
965 * may not be able to eliminate redundant keys; in such cases it will
966 * arbitrarily pick a usable key for each attribute (and scan direction),
967 * ensuring that there is no more than one key required in each direction.
968 * We stop considering further keys once we reach the first nonrequired
969 * key (which must come after all required keys), so this can't affect us.
970 *
971 * The required keys that we use as starting boundaries have to be =, >,
972 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
973 * We can use keys for multiple attributes so long as the prior attributes
974 * had only =, >= (resp. =, <=) keys. These rules are very similar to the
975 * rules that preprocessing used to determine which keys to mark required.
976 * We cannot always use every required key as a positioning key, though.
977 * Skip arrays necessitate independently applying our own rules here.
978 * Skip arrays are always generally considered = array keys, but we'll
979 * nevertheless treat them as inequalities at certain points of the scan.
980 * When that happens, it _might_ have implications for the number of
981 * required keys that we can safely use for initial positioning purposes.
982 *
983 * For example, a forward scan with a skip array on its leading attribute
984 * (with no low_compare/high_compare) will have at least two required scan
985 * keys, but we won't use any of them as boundary keys during the scan's
986 * initial call here. Our positioning key during the first call here can
987 * be thought of as representing "> -infinity". Similarly, if such a skip
988 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
989 * during the scan's initial call here; a lower-order key such as "b = 42"
990 * can't be used until the "a" array advances beyond MINVAL/low_compare.
991 *
992 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
993 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
994 * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
995 * that we treat = and >= as equivalent when scanning forwards (just as we
996 * treat = and <= as equivalent when scanning backwards). We effectively
997 * do the same thing (though with a distinct "a" element/value) each time.
998 *
999 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1000 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1001 * If the index stores nulls at the end of the index we'll be starting
1002 * from, and we have no boundary key for the column (which means the key
1003 * we deduced NOT NULL from is an inequality key that constrains the other
1004 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1005 * use as a boundary key. If we didn't do this, we might find ourselves
1006 * traversing a lot of null entries at the start of the scan.
1007 *
1008 * In this loop, row-comparison keys are treated the same as keys on their
1009 * first (leftmost) columns. We'll add all lower-order columns of the row
1010 * comparison that were marked required during preprocessing below.
1011 *
1012 * _bt_advance_array_keys needs to know exactly how we'll reposition the
1013 * scan (should it opt to schedule another primitive index scan). It is
1014 * critical that primscans only be scheduled when they'll definitely make
1015 * some useful progress. _bt_advance_array_keys does this by calling
1016 * _bt_checkkeys routines that report whether a tuple is past the end of
1017 * matches for the scan's keys (given the scan's current array elements).
1018 * If the page's final tuple is "after the end of matches" for a scan that
1019 * uses the *opposite* scan direction, then it must follow that it's also
1020 * "before the start of matches" for the actual current scan direction.
1021 * It is therefore essential that all of our initial positioning rules are
1022 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1023 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1024 * need to be kept in sync.
1025 *----------
1026 */
1027 if (so->numberOfKeys > 0)
1028 {
1029 AttrNumber curattr;
1030 ScanKey bkey;
1031 ScanKey impliesNN;
1032 ScanKey cur;
1033
1034 /*
1035 * bkey will be set to the key that preprocessing left behind as the
1036 * boundary key for this attribute, in this scan direction (if any)
1037 */
1038 cur = so->keyData;
1039 curattr = 1;
1040 bkey = NULL;
1041 /* Also remember any scankey that implies a NOT NULL constraint */
1042 impliesNN = NULL;
1043
1044 /*
1045 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1046 * pass to handle after-last-key processing. Actual exit from the
1047 * loop is at one of the "break" statements below.
1048 */
1049 for (int i = 0;; cur++, i++)
1050 {
1051 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1052 {
1053 /* Done looking for the curattr boundary key */
1054 Assert(bkey == NULL ||
1055 (bkey->sk_attno == curattr &&
1056 (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1057 Assert(impliesNN == NULL ||
1058 (impliesNN->sk_attno == curattr &&
1059 (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1060
1061 /*
1062 * If this is a scan key for a skip array whose current
1063 * element is MINVAL, choose low_compare (when scanning
1064 * backwards it'll be MAXVAL, and we'll choose high_compare).
1065 *
1066 * Note: if the array's low_compare key makes 'bkey' NULL,
1067 * then we behave as if the array's first element is -inf,
1068 * except when !array->null_elem implies a usable NOT NULL
1069 * constraint.
1070 */
1071 if (bkey != NULL &&
1072 (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1073 {
1074 int ikey = bkey - so->keyData;
1075 ScanKey skipequalitykey = bkey;
1076 BTArrayKeyInfo *array = NULL;
1077
1078 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1079 {
1080 array = &so->arrayKeys[arridx];
1081 if (array->scan_key == ikey)
1082 break;
1083 }
1084
1085 if (ScanDirectionIsForward(dir))
1086 {
1087 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1088 bkey = array->low_compare;
1089 }
1090 else
1091 {
1092 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1093 bkey = array->high_compare;
1094 }
1095
1096 Assert(bkey == NULL ||
1097 bkey->sk_attno == skipequalitykey->sk_attno);
1098
1099 if (!array->null_elem)
1100 impliesNN = skipequalitykey;
1101 else
1102 Assert(bkey == NULL && impliesNN == NULL);
1103 }
1104
1105 /*
1106 * If we didn't find a usable boundary key, see if we can
1107 * deduce a NOT NULL key
1108 */
1109 if (bkey == NULL && impliesNN != NULL &&
1110 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1113 {
1114 /* Final startKeys[] entry will be deduced NOT NULL key */
1115 bkey = &notnullkey;
1118 (impliesNN->sk_flags &
1120 curattr,
1123 InvalidOid,
1124 InvalidOid,
1125 InvalidOid,
1126 (Datum) 0);
1127 }
1128
1129 /*
1130 * If preprocessing didn't leave a usable boundary key, quit;
1131 * else save the boundary key pointer in startKeys[]
1132 */
1133 if (bkey == NULL)
1134 break;
1135 startKeys[keysz++] = bkey;
1136
1137 /*
1138 * We can only consider adding more boundary keys when the one
1139 * that we just chose to add uses either the = or >= strategy
1140 * (during backwards scans we can only do so when the key that
1141 * we just added to startKeys[] uses the = or <= strategy)
1142 */
1143 strat_total = bkey->sk_strategy;
1144 if (strat_total == BTGreaterStrategyNumber ||
1145 strat_total == BTLessStrategyNumber)
1146 break;
1147
1148 /*
1149 * If the key that we just added to startKeys[] is a skip
1150 * array = key whose current element is marked NEXT or PRIOR,
1151 * make strat_total > or < (and stop adding boundary keys).
1152 * This can only happen with opclasses that lack skip support.
1153 */
1154 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1155 {
1156 Assert(bkey->sk_flags & SK_BT_SKIP);
1157 Assert(strat_total == BTEqualStrategyNumber);
1158
1159 if (ScanDirectionIsForward(dir))
1160 {
1161 Assert(!(bkey->sk_flags & SK_BT_PRIOR));
1162 strat_total = BTGreaterStrategyNumber;
1163 }
1164 else
1165 {
1166 Assert(!(bkey->sk_flags & SK_BT_NEXT));
1167 strat_total = BTLessStrategyNumber;
1168 }
1169
1170 /*
1171 * We're done. We'll never find an exact = match for a
1172 * NEXT or PRIOR sentinel sk_argument value. There's no
1173 * sense in trying to add more keys to startKeys[].
1174 */
1175 break;
1176 }
1177
1178 /*
1179 * Done if that was the last scan key output by preprocessing.
1180 * Also done if we've now examined all keys marked required.
1181 */
1182 if (i >= so->numberOfKeys ||
1183 !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1184 break;
1185
1186 /*
1187 * Reset for next attr.
1188 */
1189 Assert(cur->sk_attno == curattr + 1);
1190 curattr = cur->sk_attno;
1191 bkey = NULL;
1192 impliesNN = NULL;
1193 }
1194
1195 /*
1196 * If we've located the starting boundary key for curattr, we have
1197 * no interest in curattr's other required key
1198 */
1199 if (bkey != NULL)
1200 continue;
1201
1202 /*
1203 * Is this key the starting boundary key for curattr?
1204 *
1205 * If not, does it imply a NOT NULL constraint? (Because
1206 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1207 * *any* inequality key works for that; we need not test.)
1208 */
1209 switch (cur->sk_strategy)
1210 {
1213 if (ScanDirectionIsBackward(dir))
1214 bkey = cur;
1215 else if (impliesNN == NULL)
1216 impliesNN = cur;
1217 break;
1219 bkey = cur;
1220 break;
1223 if (ScanDirectionIsForward(dir))
1224 bkey = cur;
1225 else if (impliesNN == NULL)
1226 impliesNN = cur;
1227 break;
1228 }
1229 }
1230 }
1231
1232 /*
1233 * If we found no usable boundary keys, we have to start from one end of
1234 * the tree. Walk down that edge to the first or last key, and scan from
1235 * there.
1236 *
1237 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1238 */
1239 if (keysz == 0)
1240 return _bt_endpoint(scan, dir);
1241
1242 /*
1243 * We want to start the scan somewhere within the index. Set up an
1244 * insertion scankey we can use to search for the boundary point we
1245 * identified above. The insertion scankey is built using the keys
1246 * identified by startKeys[]. (Remaining insertion scankey fields are
1247 * initialized after initial-positioning scan keys are finalized.)
1248 */
1249 Assert(keysz <= INDEX_MAX_KEYS);
1250 for (int i = 0; i < keysz; i++)
1251 {
1252 ScanKey bkey = startKeys[i];
1253
1254 Assert(bkey->sk_attno == i + 1);
1255
1256 if (bkey->sk_flags & SK_ROW_HEADER)
1257 {
1258 /*
1259 * Row comparison header: look to the first row member instead
1260 */
1261 ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1262 bool loosen_strat = false,
1263 tighten_strat = false;
1264
1265 /*
1266 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1267 * would've marked the qual as unsatisfiable, preventing us from
1268 * ever getting this far
1269 */
1270 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1271 Assert(subkey->sk_attno == bkey->sk_attno);
1272 Assert(!(subkey->sk_flags & SK_ISNULL));
1273
1274 /*
1275 * This is either a > or >= key (during backwards scans it is
1276 * either < or <=) that was marked required during preprocessing.
1277 * Later so->keyData[] keys can't have been marked required, so
1278 * our row compare header key must be the final startKeys[] entry.
1279 */
1281 Assert(subkey->sk_strategy == bkey->sk_strategy);
1282 Assert(subkey->sk_strategy == strat_total);
1283 Assert(i == keysz - 1);
1284
1285 /*
1286 * The member scankeys are already in insertion format (ie, they
1287 * have sk_func = 3-way-comparison function)
1288 */
1289 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1290
1291 /*
1292 * Now look to later row compare members.
1293 *
1294 * If there's an "index attribute gap" between two row compare
1295 * members, the second member won't have been marked required, and
1296 * so can't be used as a starting boundary key here. The part of
1297 * the row comparison that we do still use has to be treated as a
1298 * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1299 * with an omitted intervening index attribute "b" will use an
1300 * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1301 * the leaf level might satisfy the row compare qual.
1302 *
1303 * We're able to use a _more_ restrictive strategy when we reach a
1304 * NULL row compare member, since they're always unsatisfiable.
1305 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1306 * insertion scan key "a > 1". All tuples where "a = 1" cannot
1307 * possibly satisfy the row compare qual, so this is safe.
1308 */
1309 Assert(!(subkey->sk_flags & SK_ROW_END));
1310 for (;;)
1311 {
1312 subkey++;
1313 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1314
1315 if (subkey->sk_flags & SK_ISNULL)
1316 {
1317 /*
1318 * NULL member key, can only use earlier keys.
1319 *
1320 * We deliberately avoid checking if this key is marked
1321 * required. All earlier keys are required, and this key
1322 * is unsatisfiable either way, so we can't miss anything.
1323 */
1324 tighten_strat = true;
1325 break;
1326 }
1327
1328 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1329 {
1330 /* nonrequired member key, can only use earlier keys */
1331 loosen_strat = true;
1332 break;
1333 }
1334
1335 Assert(subkey->sk_attno == keysz + 1);
1336 Assert(subkey->sk_strategy == bkey->sk_strategy);
1337 Assert(keysz < INDEX_MAX_KEYS);
1338
1339 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
1340 keysz++;
1341
1342 if (subkey->sk_flags & SK_ROW_END)
1343 break;
1344 }
1345 Assert(!(loosen_strat && tighten_strat));
1346 if (loosen_strat)
1347 {
1348 /* Use less restrictive strategy (and fewer member keys) */
1349 switch (strat_total)
1350 {
1352 strat_total = BTLessEqualStrategyNumber;
1353 break;
1355 strat_total = BTGreaterEqualStrategyNumber;
1356 break;
1357 }
1358 }
1359 if (tighten_strat)
1360 {
1361 /* Use more restrictive strategy (and fewer member keys) */
1362 switch (strat_total)
1363 {
1365 strat_total = BTLessStrategyNumber;
1366 break;
1368 strat_total = BTGreaterStrategyNumber;
1369 break;
1370 }
1371 }
1372
1373 /* Done (row compare header key is always last startKeys[] key) */
1374 break;
1375 }
1376
1377 /*
1378 * Ordinary comparison key/search-style key.
1379 *
1380 * Transform the search-style scan key to an insertion scan key by
1381 * replacing the sk_func with the appropriate btree 3-way-comparison
1382 * function.
1383 *
1384 * If scankey operator is not a cross-type comparison, we can use the
1385 * cached comparison function; otherwise gotta look it up in the
1386 * catalogs. (That can't lead to infinite recursion, since no
1387 * indexscan initiated by syscache lookup will use cross-data-type
1388 * operators.)
1389 *
1390 * We support the convention that sk_subtype == InvalidOid means the
1391 * opclass input type; this hack simplifies life for ScanKeyInit().
1392 */
1393 if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1394 bkey->sk_subtype == InvalidOid)
1395 {
1396 FmgrInfo *procinfo;
1397
1398 procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1399 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1400 bkey->sk_flags,
1401 bkey->sk_attno,
1403 bkey->sk_subtype,
1404 bkey->sk_collation,
1405 procinfo,
1406 bkey->sk_argument);
1407 }
1408 else
1409 {
1410 RegProcedure cmp_proc;
1411
1412 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1413 rel->rd_opcintype[i],
1414 bkey->sk_subtype, BTORDER_PROC);
1415 if (!RegProcedureIsValid(cmp_proc))
1416 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1417 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1418 bkey->sk_attno, RelationGetRelationName(rel));
1419 ScanKeyEntryInitialize(inskey.scankeys + i,
1420 bkey->sk_flags,
1421 bkey->sk_attno,
1423 bkey->sk_subtype,
1424 bkey->sk_collation,
1425 cmp_proc,
1426 bkey->sk_argument);
1427 }
1428 }
1429
1430 /*----------
1431 * Examine the selected initial-positioning strategy to determine exactly
1432 * where we need to start the scan, and set flag variables to control the
1433 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1434 * page _bt_search returns).
1435 *----------
1436 */
1437 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1438 inskey.anynullkeys = false; /* unused */
1439 inskey.scantid = NULL;
1440 inskey.keysz = keysz;
1441 switch (strat_total)
1442 {
1444
1445 inskey.nextkey = false;
1446 inskey.backward = true;
1447 break;
1448
1450
1451 inskey.nextkey = true;
1452 inskey.backward = true;
1453 break;
1454
1456
1457 /*
1458 * If a backward scan was specified, need to start with last equal
1459 * item not first one.
1460 */
1461 if (ScanDirectionIsBackward(dir))
1462 {
1463 /*
1464 * This is the same as the <= strategy
1465 */
1466 inskey.nextkey = true;
1467 inskey.backward = true;
1468 }
1469 else
1470 {
1471 /*
1472 * This is the same as the >= strategy
1473 */
1474 inskey.nextkey = false;
1475 inskey.backward = false;
1476 }
1477 break;
1478
1480
1481 /*
1482 * Find first item >= scankey
1483 */
1484 inskey.nextkey = false;
1485 inskey.backward = false;
1486 break;
1487
1489
1490 /*
1491 * Find first item > scankey
1492 */
1493 inskey.nextkey = true;
1494 inskey.backward = false;
1495 break;
1496
1497 default:
1498 /* can't get here, but keep compiler quiet */
1499 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1500 return false;
1501 }
1502
1503 /*
1504 * Use the manufactured insertion scan key to descend the tree and
1505 * position ourselves on the target leaf page.
1506 */
1507 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1508 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1509
1510 /* don't need to keep the stack around... */
1511 _bt_freestack(stack);
1512
1513 if (!BufferIsValid(so->currPos.buf))
1514 {
1515 Assert(!so->needPrimScan);
1516
1517 /*
1518 * We only get here if the index is completely empty. Lock relation
1519 * because nothing finer to lock exists. Without a buffer lock, it's
1520 * possible for another transaction to insert data between
1521 * _bt_search() and PredicateLockRelation(). We have to try again
1522 * after taking the relation-level predicate lock, to close a narrow
1523 * window where we wouldn't scan concurrently inserted tuples, but the
1524 * writer wouldn't see our predicate lock.
1525 */
1527 {
1529 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1530 _bt_freestack(stack);
1531 }
1532
1533 if (!BufferIsValid(so->currPos.buf))
1534 {
1535 _bt_parallel_done(scan);
1536 return false;
1537 }
1538 }
1539
1540 /* position to the precise item on the page */
1541 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1542
1543 /*
1544 * Now load data from the first page of the scan (usually the page
1545 * currently in so->currPos.buf).
1546 *
1547 * If inskey.nextkey = false and inskey.backward = false, offnum is
1548 * positioned at the first non-pivot tuple >= inskey.scankeys.
1549 *
1550 * If inskey.nextkey = false and inskey.backward = true, offnum is
1551 * positioned at the last non-pivot tuple < inskey.scankeys.
1552 *
1553 * If inskey.nextkey = true and inskey.backward = false, offnum is
1554 * positioned at the first non-pivot tuple > inskey.scankeys.
1555 *
1556 * If inskey.nextkey = true and inskey.backward = true, offnum is
1557 * positioned at the last non-pivot tuple <= inskey.scankeys.
1558 *
1559 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1560 * for the page. For example, when inskey is both < the leaf page's high
1561 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1562 */
1563 if (!_bt_readfirstpage(scan, offnum, dir))
1564 return false;
1565
1566 _bt_returnitem(scan, so);
1567 return true;
1568}
1569
1570/*
1571 * _bt_next() -- Get the next item in a scan.
1572 *
1573 * On entry, so->currPos describes the current page, which may be pinned
1574 * but is not locked, and so->currPos.itemIndex identifies which item was
1575 * previously returned.
1576 *
1577 * On success exit, so->currPos is updated as needed, and _bt_returnitem
1578 * sets the next item to return to the scan. so->currPos remains valid.
1579 *
1580 * On failure exit (no more tuples), we invalidate so->currPos. It'll
1581 * still be possible for the scan to return tuples by changing direction,
1582 * though we'll need to call _bt_first anew in that other direction.
1583 */
1584bool
1586{
1587 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1588
1590
1591 /*
1592 * Advance to next tuple on current page; or if there's no more, try to
1593 * step to the next page with data.
1594 */
1595 if (ScanDirectionIsForward(dir))
1596 {
1597 if (++so->currPos.itemIndex > so->currPos.lastItem)
1598 {
1599 if (!_bt_steppage(scan, dir))
1600 return false;
1601 }
1602 }
1603 else
1604 {
1605 if (--so->currPos.itemIndex < so->currPos.firstItem)
1606 {
1607 if (!_bt_steppage(scan, dir))
1608 return false;
1609 }
1610 }
1611
1612 _bt_returnitem(scan, so);
1613 return true;
1614}
1615
1616/*
1617 * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
1618 * index scan by setting the relevant fields in caller's index scan descriptor
1619 */
1620static inline void
1622{
1623 BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1624
1625 /* Most recent _bt_readpage must have succeeded */
1629
1630 /* Return next item, per amgettuple contract */
1631 scan->xs_heaptid = currItem->heapTid;
1632 if (so->currTuples)
1633 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1634}
1635
1636/*
1637 * _bt_steppage() -- Step to next page containing valid data for scan
1638 *
1639 * Wrapper on _bt_readnextpage that performs final steps for the current page.
1640 *
1641 * On entry, so->currPos must be valid. Its buffer will be pinned, though
1642 * never locked. (Actually, when so->dropPin there won't even be a pin held,
1643 * though so->currPos.currPage must still be set to a valid block number.)
1644 */
1645static bool
1647{
1648 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1649 BlockNumber blkno,
1650 lastcurrblkno;
1651
1653
1654 /* Before leaving current page, deal with any killed items */
1655 if (so->numKilled > 0)
1656 _bt_killitems(scan);
1657
1658 /*
1659 * Before we modify currPos, make a copy of the page data if there was a
1660 * mark position that needs it.
1661 */
1662 if (so->markItemIndex >= 0)
1663 {
1664 /* bump pin on current buffer for assignment to mark buffer */
1665 if (BTScanPosIsPinned(so->currPos))
1667 memcpy(&so->markPos, &so->currPos,
1668 offsetof(BTScanPosData, items[1]) +
1669 so->currPos.lastItem * sizeof(BTScanPosItem));
1670 if (so->markTuples)
1671 memcpy(so->markTuples, so->currTuples,
1674 so->markItemIndex = -1;
1675
1676 /*
1677 * If we're just about to start the next primitive index scan
1678 * (possible with a scan that has arrays keys, and needs to skip to
1679 * continue in the current scan direction), moreLeft/moreRight only
1680 * indicate the end of the current primitive index scan. They must
1681 * never be taken to indicate that the top-level index scan has ended
1682 * (that would be wrong).
1683 *
1684 * We could handle this case by treating the current array keys as
1685 * markPos state. But depending on the current array state like this
1686 * would add complexity. Instead, we just unset markPos's copy of
1687 * moreRight or moreLeft (whichever might be affected), while making
1688 * btrestrpos reset the scan's arrays to their initial scan positions.
1689 * In effect, btrestrpos leaves advancing the arrays up to the first
1690 * _bt_readpage call (that takes place after it has restored markPos).
1691 */
1692 if (so->needPrimScan)
1693 {
1695 so->markPos.moreRight = true;
1696 else
1697 so->markPos.moreLeft = true;
1698 }
1699
1700 /* mark/restore not supported by parallel scans */
1701 Assert(!scan->parallel_scan);
1702 }
1703
1705
1706 /* Walk to the next page with data */
1707 if (ScanDirectionIsForward(dir))
1708 blkno = so->currPos.nextPage;
1709 else
1710 blkno = so->currPos.prevPage;
1711 lastcurrblkno = so->currPos.currPage;
1712
1713 /*
1714 * Cancel primitive index scans that were scheduled when the call to
1715 * _bt_readpage for currPos happened to use the opposite direction to the
1716 * one that we're stepping in now. (It's okay to leave the scan's array
1717 * keys as-is, since the next _bt_readpage will advance them.)
1718 */
1719 if (so->currPos.dir != dir)
1720 so->needPrimScan = false;
1721
1722 return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
1723}
1724
1725/*
1726 * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
1727 *
1728 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1729 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1730 * When we're passed an offnum past the end of the page, we might still manage
1731 * to stop the scan on this page by calling _bt_checkkeys against the high
1732 * key. See _bt_readpage for full details.
1733 *
1734 * On entry, so->currPos must be pinned and locked (so offnum stays valid).
1735 * Parallel scan callers must have seized the scan before calling here.
1736 *
1737 * On exit, we'll have updated so->currPos and retained locks and pins
1738 * according to the same rules as those laid out for _bt_readnextpage exit.
1739 * Like _bt_readnextpage, our return value indicates if there are any matching
1740 * records in the given direction.
1741 *
1742 * We always release the scan for a parallel scan caller, regardless of
1743 * success or failure; we'll call _bt_parallel_release as soon as possible.
1744 */
1745static bool
1747{
1748 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1749
1750 so->numKilled = 0; /* just paranoia */
1751 so->markItemIndex = -1; /* ditto */
1752
1753 /* Initialize so->currPos for the first page (page in so->currPos.buf) */
1754 if (so->needPrimScan)
1755 {
1756 Assert(so->numArrayKeys);
1757
1758 so->currPos.moreLeft = true;
1759 so->currPos.moreRight = true;
1760 so->needPrimScan = false;
1761 }
1762 else if (ScanDirectionIsForward(dir))
1763 {
1764 so->currPos.moreLeft = false;
1765 so->currPos.moreRight = true;
1766 }
1767 else
1768 {
1769 so->currPos.moreLeft = true;
1770 so->currPos.moreRight = false;
1771 }
1772
1773 /*
1774 * Attempt to load matching tuples from the first page.
1775 *
1776 * Note that _bt_readpage will finish initializing the so->currPos fields.
1777 * _bt_readpage also releases parallel scan (even when it returns false).
1778 */
1779 if (_bt_readpage(scan, dir, offnum, true))
1780 {
1781 Relation rel = scan->indexRelation;
1782
1783 /*
1784 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1785 * so->currPos.buf in preparation for btgettuple returning tuples.
1786 */
1789 return true;
1790 }
1791
1792 /* There's no actually-matching data on the page in so->currPos.buf */
1794
1795 /* Call _bt_readnextpage using its _bt_steppage wrapper function */
1796 if (!_bt_steppage(scan, dir))
1797 return false;
1798
1799 /* _bt_readpage for a later page (now in so->currPos) succeeded */
1800 return true;
1801}
1802
1803/*
1804 * _bt_readnextpage() -- Read next page containing valid data for _bt_next
1805 *
1806 * Caller's blkno is the next interesting page's link, taken from either the
1807 * previously-saved right link or left link. lastcurrblkno is the page that
1808 * was current at the point where the blkno link was saved, which we use to
1809 * reason about concurrent page splits/page deletions during backwards scans.
1810 * In the common case where seized=false, blkno is either so->currPos.nextPage
1811 * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
1812 *
1813 * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
1814 * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
1815 * won't need to be read again in almost all cases). Parallel scan callers
1816 * that seized the scan before calling here should pass seized=true; such a
1817 * caller's blkno and lastcurrblkno arguments come from the seized scan.
1818 * seized=false callers just pass us the blkno/lastcurrblkno taken from their
1819 * so->currPos, which (along with so->currPos itself) can be used to end the
1820 * scan. A seized=false caller's blkno can never be assumed to be the page
1821 * that must be read next during a parallel scan, though. We must figure that
1822 * part out for ourselves by seizing the scan (the correct page to read might
1823 * already be beyond the seized=false caller's blkno during a parallel scan,
1824 * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
1825 * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
1826 *
1827 * On success exit, so->currPos is updated to contain data from the next
1828 * interesting page, and we return true. We hold a pin on the buffer on
1829 * success exit (except during so->dropPin index scans, when we drop the pin
1830 * eagerly to avoid blocking VACUUM).
1831 *
1832 * If there are no more matching records in the given direction, we invalidate
1833 * so->currPos (while ensuring it retains no locks or pins), and return false.
1834 *
1835 * We always release the scan for a parallel scan caller, regardless of
1836 * success or failure; we'll call _bt_parallel_release as soon as possible.
1837 */
1838static bool
1840 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
1841{
1842 Relation rel = scan->indexRelation;
1843 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1844
1845 Assert(so->currPos.currPage == lastcurrblkno || seized);
1846 Assert(!(blkno == P_NONE && seized));
1848
1849 /*
1850 * Remember that the scan already read lastcurrblkno, a page to the left
1851 * of blkno (or remember reading a page to the right, for backwards scans)
1852 */
1853 if (ScanDirectionIsForward(dir))
1854 so->currPos.moreLeft = true;
1855 else
1856 so->currPos.moreRight = true;
1857
1858 for (;;)
1859 {
1860 Page page;
1861 BTPageOpaque opaque;
1862
1863 if (blkno == P_NONE ||
1865 !so->currPos.moreRight : !so->currPos.moreLeft))
1866 {
1867 /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
1868 Assert(so->currPos.currPage == lastcurrblkno && !seized);
1870 _bt_parallel_done(scan); /* iff !so->needPrimScan */
1871 return false;
1872 }
1873
1874 Assert(!so->needPrimScan);
1875
1876 /* parallel scan must never actually visit so->currPos blkno */
1877 if (!seized && scan->parallel_scan != NULL &&
1878 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
1879 {
1880 /* whole scan is now done (or another primitive scan required) */
1882 return false;
1883 }
1884
1885 if (ScanDirectionIsForward(dir))
1886 {
1887 /* read blkno, but check for interrupts first */
1889 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1890 }
1891 else
1892 {
1893 /* read blkno, avoiding race (also checks for interrupts) */
1894 so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
1895 lastcurrblkno);
1896 if (so->currPos.buf == InvalidBuffer)
1897 {
1898 /* must have been a concurrent deletion of leftmost page */
1900 _bt_parallel_done(scan);
1901 return false;
1902 }
1903 }
1904
1905 page = BufferGetPage(so->currPos.buf);
1906 opaque = BTPageGetOpaque(page);
1907 lastcurrblkno = blkno;
1908 if (likely(!P_IGNORE(opaque)))
1909 {
1910 /* see if there are any matches on this page */
1911 if (ScanDirectionIsForward(dir))
1912 {
1913 /* note that this will clear moreRight if we can stop */
1914 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
1915 break;
1916 blkno = so->currPos.nextPage;
1917 }
1918 else
1919 {
1920 /* note that this will clear moreLeft if we can stop */
1921 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
1922 break;
1923 blkno = so->currPos.prevPage;
1924 }
1925 }
1926 else
1927 {
1928 /* _bt_readpage not called, so do all this for ourselves */
1929 if (ScanDirectionIsForward(dir))
1930 blkno = opaque->btpo_next;
1931 else
1932 blkno = opaque->btpo_prev;
1933 if (scan->parallel_scan != NULL)
1934 _bt_parallel_release(scan, blkno, lastcurrblkno);
1935 }
1936
1937 /* no matching tuples on this page */
1938 _bt_relbuf(rel, so->currPos.buf);
1939 seized = false; /* released by _bt_readpage (or by us) */
1940 }
1941
1942 /*
1943 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1944 * so->currPos.buf in preparation for btgettuple returning tuples.
1945 */
1946 Assert(so->currPos.currPage == blkno);
1949
1950 return true;
1951}
1952
1953/*
1954 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
1955 * recovering from concurrent page splits/page deletions when necessary
1956 *
1957 * Called during backwards scans, to deal with their unique concurrency rules.
1958 *
1959 * blkno points to the block number of the page that we expect to move the
1960 * scan to. We'll successfully move the scan there when we find that its
1961 * right sibling link still points to lastcurrblkno (the page we just read).
1962 * Otherwise, we have to figure out which page is the correct one for the scan
1963 * to now read the hard way, reasoning about concurrent splits and deletions.
1964 * See nbtree/README.
1965 *
1966 * On return, we have both a pin and a read lock on the returned page, whose
1967 * block number will be set in *blkno. Returns InvalidBuffer if there is no
1968 * page to the left (no lock or pin is held in that case).
1969 *
1970 * It is possible for the returned leaf page to be half-dead; caller must
1971 * check that condition and step left again when required.
1972 */
1973static Buffer
1975 BlockNumber lastcurrblkno)
1976{
1977 BlockNumber origblkno = *blkno; /* detects circular links */
1978
1979 for (;;)
1980 {
1981 Buffer buf;
1982 Page page;
1983 BTPageOpaque opaque;
1984 int tries;
1985
1986 /* check for interrupts while we're not holding any buffer lock */
1988 buf = _bt_getbuf(rel, *blkno, BT_READ);
1989 page = BufferGetPage(buf);
1990 opaque = BTPageGetOpaque(page);
1991
1992 /*
1993 * If this isn't the page we want, walk right till we find what we
1994 * want --- but go no more than four hops (an arbitrary limit). If we
1995 * don't find the correct page by then, the most likely bet is that
1996 * lastcurrblkno got deleted and isn't in the sibling chain at all
1997 * anymore, not that its left sibling got split more than four times.
1998 *
1999 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2000 * because half-dead pages are still in the sibling chain.
2001 */
2002 tries = 0;
2003 for (;;)
2004 {
2005 if (likely(!P_ISDELETED(opaque) &&
2006 opaque->btpo_next == lastcurrblkno))
2007 {
2008 /* Found desired page, return it */
2009 return buf;
2010 }
2011 if (P_RIGHTMOST(opaque) || ++tries > 4)
2012 break;
2013 /* step right */
2014 *blkno = opaque->btpo_next;
2015 buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2016 page = BufferGetPage(buf);
2017 opaque = BTPageGetOpaque(page);
2018 }
2019
2020 /*
2021 * Return to the original page (usually the page most recently read by
2022 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2023 * what's up with its prev sibling link
2024 */
2025 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2026 page = BufferGetPage(buf);
2027 opaque = BTPageGetOpaque(page);
2028 if (P_ISDELETED(opaque))
2029 {
2030 /*
2031 * It was deleted. Move right to first nondeleted page (there
2032 * must be one); that is the page that has acquired the deleted
2033 * one's keyspace, so stepping left from it will take us where we
2034 * want to be.
2035 */
2036 for (;;)
2037 {
2038 if (P_RIGHTMOST(opaque))
2039 elog(ERROR, "fell off the end of index \"%s\"",
2041 lastcurrblkno = opaque->btpo_next;
2042 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2043 page = BufferGetPage(buf);
2044 opaque = BTPageGetOpaque(page);
2045 if (!P_ISDELETED(opaque))
2046 break;
2047 }
2048 }
2049 else
2050 {
2051 /*
2052 * Original lastcurrblkno wasn't deleted; the explanation had
2053 * better be that the page to the left got split or deleted.
2054 * Without this check, we risk going into an infinite loop.
2055 */
2056 if (opaque->btpo_prev == origblkno)
2057 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2058 lastcurrblkno, RelationGetRelationName(rel));
2059 /* Okay to try again, since left sibling link changed */
2060 }
2061
2062 /*
2063 * Original lastcurrblkno from caller was concurrently deleted (could
2064 * also have been a great many concurrent left sibling page splits).
2065 * Found a non-deleted page that should now act as our lastcurrblkno.
2066 */
2067 if (P_LEFTMOST(opaque))
2068 {
2069 /* New lastcurrblkno has no left sibling (concurrently deleted) */
2070 _bt_relbuf(rel, buf);
2071 break;
2072 }
2073
2074 /* Start from scratch with new lastcurrblkno's blkno/prev link */
2075 *blkno = origblkno = opaque->btpo_prev;
2076 _bt_relbuf(rel, buf);
2077 }
2078
2079 return InvalidBuffer;
2080}
2081
2082/*
2083 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2084 *
2085 * If the index is empty, we will return InvalidBuffer; any other failure
2086 * condition causes ereport(). We will not return a dead page.
2087 *
2088 * The returned buffer is pinned and read-locked.
2089 */
2090Buffer
2091_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2092{
2093 Buffer buf;
2094 Page page;
2095 BTPageOpaque opaque;
2096 OffsetNumber offnum;
2097 BlockNumber blkno;
2098 IndexTuple itup;
2099
2100 /*
2101 * If we are looking for a leaf page, okay to descend from fast root;
2102 * otherwise better descend from true root. (There is no point in being
2103 * smarter about intermediate levels.)
2104 */
2105 if (level == 0)
2106 buf = _bt_getroot(rel, NULL, BT_READ);
2107 else
2108 buf = _bt_gettrueroot(rel);
2109
2110 if (!BufferIsValid(buf))
2111 return InvalidBuffer;
2112
2113 page = BufferGetPage(buf);
2114 opaque = BTPageGetOpaque(page);
2115
2116 for (;;)
2117 {
2118 /*
2119 * If we landed on a deleted page, step right to find a live page
2120 * (there must be one). Also, if we want the rightmost page, step
2121 * right if needed to get to it (this could happen if the page split
2122 * since we obtained a pointer to it).
2123 */
2124 while (P_IGNORE(opaque) ||
2125 (rightmost && !P_RIGHTMOST(opaque)))
2126 {
2127 blkno = opaque->btpo_next;
2128 if (blkno == P_NONE)
2129 elog(ERROR, "fell off the end of index \"%s\"",
2131 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2132 page = BufferGetPage(buf);
2133 opaque = BTPageGetOpaque(page);
2134 }
2135
2136 /* Done? */
2137 if (opaque->btpo_level == level)
2138 break;
2139 if (opaque->btpo_level < level)
2140 ereport(ERROR,
2141 (errcode(ERRCODE_INDEX_CORRUPTED),
2142 errmsg_internal("btree level %u not found in index \"%s\"",
2143 level, RelationGetRelationName(rel))));
2144
2145 /* Descend to leftmost or rightmost child page */
2146 if (rightmost)
2147 offnum = PageGetMaxOffsetNumber(page);
2148 else
2149 offnum = P_FIRSTDATAKEY(opaque);
2150
2151 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2152 blkno = BTreeTupleGetDownLink(itup);
2153
2154 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2155 page = BufferGetPage(buf);
2156 opaque = BTPageGetOpaque(page);
2157 }
2158
2159 return buf;
2160}
2161
2162/*
2163 * _bt_endpoint() -- Find the first or last page in the index, and scan
2164 * from there to the first key satisfying all the quals.
2165 *
2166 * This is used by _bt_first() to set up a scan when we've determined
2167 * that the scan must start at the beginning or end of the index (for
2168 * a forward or backward scan respectively).
2169 *
2170 * Parallel scan callers must have seized the scan before calling here.
2171 * Exit conditions are the same as for _bt_first().
2172 */
2173static bool
2175{
2176 Relation rel = scan->indexRelation;
2177 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2178 Page page;
2179 BTPageOpaque opaque;
2181
2183 Assert(!so->needPrimScan);
2184
2185 /*
2186 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2187 * version of _bt_search().
2188 */
2190
2191 if (!BufferIsValid(so->currPos.buf))
2192 {
2193 /*
2194 * Empty index. Lock the whole relation, as nothing finer to lock
2195 * exists.
2196 */
2198 _bt_parallel_done(scan);
2199 return false;
2200 }
2201
2202 page = BufferGetPage(so->currPos.buf);
2203 opaque = BTPageGetOpaque(page);
2204 Assert(P_ISLEAF(opaque));
2205
2206 if (ScanDirectionIsForward(dir))
2207 {
2208 /* There could be dead pages to the left, so not this: */
2209 /* Assert(P_LEFTMOST(opaque)); */
2210
2211 start = P_FIRSTDATAKEY(opaque);
2212 }
2213 else if (ScanDirectionIsBackward(dir))
2214 {
2215 Assert(P_RIGHTMOST(opaque));
2216
2218 }
2219 else
2220 {
2221 elog(ERROR, "invalid scan direction: %d", (int) dir);
2222 start = 0; /* keep compiler quiet */
2223 }
2224
2225 /*
2226 * Now load data from the first page of the scan.
2227 */
2228 if (!_bt_readfirstpage(scan, start, dir))
2229 return false;
2230
2231 _bt_returnitem(scan, so);
2232 return true;
2233}
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5398
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4223
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4499
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:436
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:387
static void * PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
PageData * Page
Definition: bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
#define RegProcedureIsValid(p)
Definition: c.h:790
#define Min(x, y)
Definition: c.h:1016
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1118
#define likely(x)
Definition: c.h:417
regproc RegProcedure
Definition: c.h:669
int32_t int32
Definition: c.h:548
#define unlikely(x)
Definition: c.h:418
uint32_t uint32
Definition: c.h:552
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errcode(int sqlerrcode)
Definition: elog.c:863
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
#define palloc_object(type)
Definition: fe_memutils.h:74
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
Assert(PointerIsAligned(start, uint64))
return str start
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:917
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition: itemptr.c:51
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
IndexTupleData * IndexTuple
Definition: itup.h:53
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:887
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2256
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1004
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:581
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:740
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:846
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1071
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1040
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:345
void _bt_preprocess_keys(IndexScanDesc scan)
bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
Definition: nbtreadpage.c:134
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtreadpage.c:537
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:885
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:1050
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:1023
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
BTStackData * BTStack
Definition: nbtree.h:750
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define SK_BT_SKIP
Definition: nbtree.h:1106
#define P_HIKEY
Definition: nbtree.h:368
#define BTORDER_PROC
Definition: nbtree.h:717
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define SK_BT_PRIOR
Definition: nbtree.h:1112
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define SK_BT_NEXT
Definition: nbtree.h:1111
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_NONE
Definition: nbtree.h:213
#define SK_BT_REQBKWD
Definition: nbtree.h:1105
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1117
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
#define SK_BT_MAXVAL
Definition: nbtree.h:1110
#define BT_READ
Definition: nbtree.h:730
#define SK_BT_REQFWD
Definition: nbtree.h:1104
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define SK_BT_DESC
Definition: nbtree.h:1116
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define BT_WRITE
Definition: nbtree.h:731
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
#define SK_BT_MINVAL
Definition: nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1097
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2091
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:236
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:597
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:877
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:1839
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:338
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2174
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1646
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:1746
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition: nbtsearch.c:1974
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:97
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:469
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1585
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:1621
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:683
static void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
Definition: nbtsearch.c:54
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:151
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:205
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:964
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define INDEX_MAX_KEYS
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:705
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:37
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2574
short access
Definition: preproc-type.c:36
#define RelationGetDescr(relation)
Definition: rel.h:541
#define RelationGetRelationName(relation)
Definition: rel.h:549
#define RelationNeedsWAL(relation)
Definition: rel.h:638
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:534
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
ScanDirection
Definition: sdir.h:25
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
#define SK_ISNULL
Definition: skey.h:115
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
ScanKey high_compare
Definition: nbtree.h:1050
ScanKey low_compare
Definition: nbtree.h:1049
bool null_elem
Definition: nbtree.h:1047
OffsetNumber stricthigh
Definition: nbtree.h:836
bool bounds_valid
Definition: nbtree.h:834
OffsetNumber low
Definition: nbtree.h:835
BTScanInsert itup_key
Definition: nbtree.h:824
BlockNumber btpo_next
Definition: nbtree.h:66
BlockNumber btpo_prev
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:67
bool needPrimScan
Definition: nbtree.h:1063
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
char * markTuples
Definition: nbtree.h:1081
BTScanPosData currPos
Definition: nbtree.h:1093
char * currTuples
Definition: nbtree.h:1080
BTScanPosData markPos
Definition: nbtree.h:1094
ScanKey keyData
Definition: nbtree.h:1058
bool moreRight
Definition: nbtree.h:986
Buffer buf
Definition: nbtree.h:964
BlockNumber currPage
Definition: nbtree.h:967
int firstItem
Definition: nbtree.h:995
int nextTupleOffset
Definition: nbtree.h:979
BlockNumber prevPage
Definition: nbtree.h:968
BlockNumber nextPage
Definition: nbtree.h:969
bool moreLeft
Definition: nbtree.h:985
int lastItem
Definition: nbtree.h:996
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
int itemIndex
Definition: nbtree.h:997
ScanDirection dir
Definition: nbtree.h:973
XLogRecPtr lsn
Definition: nbtree.h:970
ItemPointerData heapTid
Definition: nbtree.h:957
LocationIndex tupleOffset
Definition: nbtree.h:959
BlockNumber bts_blkno
Definition: nbtree.h:745
struct BTStackData * bts_parent
Definition: nbtree.h:747
OffsetNumber bts_offset
Definition: nbtree.h:746
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:193
struct IndexScanInstrumentation * instrument
Definition: relscan.h:161
IndexTuple xs_itup
Definition: relscan.h:169
Relation indexRelation
Definition: relscan.h:139
ItemPointerData xs_heaptid
Definition: relscan.h:174
struct SnapshotData * xs_snapshot
Definition: relscan.h:140
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_subtype
Definition: skey.h:69
Oid sk_collation
Definition: skey.h:70
StrategyNumber sk_strategy
Definition: skey.h:68
AttrNumber sk_attno
Definition: skey.h:67
static ItemArray items
Definition: test_tidstore.c:48
#define IsolationIsSerializable()
Definition: xact.h:53