PostgreSQL Source Code git master
nbtsearch.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtsearch.c
4 * Search code for postgres btrees.
5 *
6 *
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtsearch.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "access/nbtree.h"
19#include "access/relscan.h"
20#include "access/xact.h"
21#include "miscadmin.h"
22#include "pgstat.h"
23#include "storage/predicate.h"
24#include "utils/lsyscache.h"
25#include "utils/rel.h"
26
27
30 Buffer buf, bool forupdate, BTStack stack,
31 int access);
34 OffsetNumber offnum);
35static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
36 OffsetNumber offnum, bool firstPage);
37static void _bt_saveitem(BTScanOpaque so, int itemIndex,
38 OffsetNumber offnum, IndexTuple itup);
39static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
40 OffsetNumber offnum, ItemPointer heapTid,
41 IndexTuple itup);
42static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
43 OffsetNumber offnum,
44 ItemPointer heapTid, int tupleOffset);
45static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
46static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
47static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
48 ScanDirection dir);
49static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
50 BlockNumber lastcurrblkno, ScanDirection dir,
51 bool seized);
53 BlockNumber lastcurrblkno);
54static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
55
56
57/*
58 * _bt_drop_lock_and_maybe_pin()
59 *
60 * Unlock the buffer; and if it is safe to release the pin, do that, too.
61 * This will prevent vacuum from stalling in a blocked state trying to read a
62 * page when a cursor is sitting on it.
63 *
64 * See nbtree/README section on making concurrent TID recycling safe.
65 */
66static void
68{
70
71 if (IsMVCCSnapshot(scan->xs_snapshot) &&
73 !scan->xs_want_itup)
74 {
75 ReleaseBuffer(sp->buf);
76 sp->buf = InvalidBuffer;
77 }
78}
79
80/*
81 * _bt_search() -- Search the tree for a particular scankey,
82 * or more precisely for the first leaf page it could be on.
83 *
84 * The passed scankey is an insertion-type scankey (see nbtree/README),
85 * but it can omit the rightmost column(s) of the index.
86 *
87 * Return value is a stack of parent-page pointers (i.e. there is no entry for
88 * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
89 * which is locked and pinned. No locks are held on the parent pages,
90 * however!
91 *
92 * The returned buffer is locked according to access parameter. Additionally,
93 * access = BT_WRITE will allow an empty root page to be created and returned.
94 * When access = BT_READ, an empty index will result in *bufP being set to
95 * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
96 * during the search will be finished.
97 *
98 * heaprel must be provided by callers that pass access = BT_WRITE, since we
99 * might need to allocate a new root page for caller -- see _bt_allocbuf.
100 */
103 int access)
104{
105 BTStack stack_in = NULL;
106 int page_access = BT_READ;
107
108 /* heaprel must be set whenever _bt_allocbuf is reachable */
110 Assert(access == BT_READ || heaprel != NULL);
111
112 /* Get the root page to start with */
113 *bufP = _bt_getroot(rel, heaprel, access);
114
115 /* If index is empty and access = BT_READ, no root page is created. */
116 if (!BufferIsValid(*bufP))
117 return (BTStack) NULL;
118
119 /* Loop iterates once per level descended in the tree */
120 for (;;)
121 {
122 Page page;
123 BTPageOpaque opaque;
124 OffsetNumber offnum;
125 ItemId itemid;
126 IndexTuple itup;
127 BlockNumber child;
128 BTStack new_stack;
129
130 /*
131 * Race -- the page we just grabbed may have split since we read its
132 * downlink in its parent page (or the metapage). If it has, we may
133 * need to move right to its new sibling. Do that.
134 *
135 * In write-mode, allow _bt_moveright to finish any incomplete splits
136 * along the way. Strictly speaking, we'd only need to finish an
137 * incomplete split on the leaf page we're about to insert to, not on
138 * any of the upper levels (internal pages with incomplete splits are
139 * also taken care of in _bt_getstackbuf). But this is a good
140 * opportunity to finish splits of internal pages too.
141 */
142 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
143 stack_in, page_access);
144
145 /* if this is a leaf page, we're done */
146 page = BufferGetPage(*bufP);
147 opaque = BTPageGetOpaque(page);
148 if (P_ISLEAF(opaque))
149 break;
150
151 /*
152 * Find the appropriate pivot tuple on this page. Its downlink points
153 * to the child page that we're about to descend to.
154 */
155 offnum = _bt_binsrch(rel, key, *bufP);
156 itemid = PageGetItemId(page, offnum);
157 itup = (IndexTuple) PageGetItem(page, itemid);
158 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
159 child = BTreeTupleGetDownLink(itup);
160
161 /*
162 * We need to save the location of the pivot tuple we chose in a new
163 * stack entry for this page/level. If caller ends up splitting a
164 * page one level down, it usually ends up inserting a new pivot
165 * tuple/downlink immediately after the location recorded here.
166 */
167 new_stack = (BTStack) palloc(sizeof(BTStackData));
168 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169 new_stack->bts_offset = offnum;
170 new_stack->bts_parent = stack_in;
171
172 /*
173 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
174 * we're on the level 1 and asked to lock leaf page in write mode,
175 * then lock next page in write mode, because it must be a leaf.
176 */
177 if (opaque->btpo_level == 1 && access == BT_WRITE)
178 page_access = BT_WRITE;
179
180 /* drop the read lock on the page, then acquire one on its child */
181 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
182
183 /* okay, all set to move down a level */
184 stack_in = new_stack;
185 }
186
187 /*
188 * If we're asked to lock leaf in write mode, but didn't manage to, then
189 * relock. This should only happen when the root page is a leaf page (and
190 * the only page in the index other than the metapage).
191 */
192 if (access == BT_WRITE && page_access == BT_READ)
193 {
194 /* trade in our read lock for a write lock */
195 _bt_unlockbuf(rel, *bufP);
196 _bt_lockbuf(rel, *bufP, BT_WRITE);
197
198 /*
199 * Race -- the leaf page may have split after we dropped the read lock
200 * but before we acquired a write lock. If it has, we may need to
201 * move right to its new sibling. Do that.
202 */
203 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
204 }
205
206 return stack_in;
207}
208
209/*
210 * _bt_moveright() -- move right in the btree if necessary.
211 *
212 * When we follow a pointer to reach a page, it is possible that
213 * the page has changed in the meanwhile. If this happens, we're
214 * guaranteed that the page has "split right" -- that is, that any
215 * data that appeared on the page originally is either on the page
216 * or strictly to the right of it.
217 *
218 * This routine decides whether or not we need to move right in the
219 * tree by examining the high key entry on the page. If that entry is
220 * strictly less than the scankey, or <= the scankey in the
221 * key.nextkey=true case, then we followed the wrong link and we need
222 * to move right.
223 *
224 * The passed insertion-type scankey can omit the rightmost column(s) of the
225 * index. (see nbtree/README)
226 *
227 * When key.nextkey is false (the usual case), we are looking for the first
228 * item >= key. When key.nextkey is true, we are looking for the first item
229 * strictly greater than key.
230 *
231 * If forupdate is true, we will attempt to finish any incomplete splits
232 * that we encounter. This is required when locking a target page for an
233 * insertion, because we don't allow inserting on a page before the split is
234 * completed. 'heaprel' and 'stack' are only used if forupdate is true.
235 *
236 * On entry, we have the buffer pinned and a lock of the type specified by
237 * 'access'. If we move right, we release the buffer and lock and acquire
238 * the same on the right sibling. Return value is the buffer we stop at.
239 */
240static Buffer
242 Relation heaprel,
244 Buffer buf,
245 bool forupdate,
246 BTStack stack,
247 int access)
248{
249 Page page;
250 BTPageOpaque opaque;
251 int32 cmpval;
252
253 Assert(!forupdate || heaprel != NULL);
254
255 /*
256 * When nextkey = false (normal case): if the scan key that brought us to
257 * this page is > the high key stored on the page, then the page has split
258 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
259 * have some duplicates to the right as well as the left, but that's
260 * something that's only ever dealt with on the leaf level, after
261 * _bt_search has found an initial leaf page.)
262 *
263 * When nextkey = true: move right if the scan key is >= page's high key.
264 * (Note that key.scantid cannot be set in this case.)
265 *
266 * The page could even have split more than once, so scan as far as
267 * needed.
268 *
269 * We also have to move right if we followed a link that brought us to a
270 * dead page.
271 */
272 cmpval = key->nextkey ? 0 : 1;
273
274 for (;;)
275 {
276 page = BufferGetPage(buf);
277 opaque = BTPageGetOpaque(page);
278
279 if (P_RIGHTMOST(opaque))
280 break;
281
282 /*
283 * Finish any incomplete splits we encounter along the way.
284 */
285 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
286 {
288
289 /* upgrade our lock if necessary */
290 if (access == BT_READ)
291 {
292 _bt_unlockbuf(rel, buf);
293 _bt_lockbuf(rel, buf, BT_WRITE);
294 }
295
296 if (P_INCOMPLETE_SPLIT(opaque))
297 _bt_finish_split(rel, heaprel, buf, stack);
298 else
299 _bt_relbuf(rel, buf);
300
301 /* re-acquire the lock in the right mode, and re-check */
302 buf = _bt_getbuf(rel, blkno, access);
303 continue;
304 }
305
306 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
307 {
308 /* step right one page */
309 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
310 continue;
311 }
312 else
313 break;
314 }
315
316 if (P_IGNORE(opaque))
317 elog(ERROR, "fell off the end of index \"%s\"",
319
320 return buf;
321}
322
323/*
324 * _bt_binsrch() -- Do a binary search for a key on a particular page.
325 *
326 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
327 * of the last key < given scankey, or last key <= given scankey if nextkey
328 * is true. (Since _bt_compare treats the first data key of such a page as
329 * minus infinity, there will be at least one key < scankey, so the result
330 * always points at one of the keys on the page.)
331 *
332 * On a leaf page, _bt_binsrch() returns the final result of the initial
333 * positioning process that started with _bt_first's call to _bt_search.
334 * We're returning a non-pivot tuple offset, so things are a little different.
335 * It is possible that we'll return an offset that's either past the last
336 * non-pivot slot, or (in the case of a backward scan) before the first slot.
337 *
338 * This procedure is not responsible for walking right, it just examines
339 * the given page. _bt_binsrch() has no lock or refcount side effects
340 * on the buffer.
341 */
342static OffsetNumber
345 Buffer buf)
346{
347 Page page;
348 BTPageOpaque opaque;
349 OffsetNumber low,
350 high;
351 int32 result,
352 cmpval;
353
354 page = BufferGetPage(buf);
355 opaque = BTPageGetOpaque(page);
356
357 /* Requesting nextkey semantics while using scantid seems nonsensical */
358 Assert(!key->nextkey || key->scantid == NULL);
359 /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
360 Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
361
362 low = P_FIRSTDATAKEY(opaque);
363 high = PageGetMaxOffsetNumber(page);
364
365 /*
366 * If there are no keys on the page, return the first available slot. Note
367 * this covers two cases: the page is really empty (no keys), or it
368 * contains only a high key. The latter case is possible after vacuuming.
369 * This can never happen on an internal page, however, since they are
370 * never empty (an internal page must have at least one child).
371 */
372 if (unlikely(high < low))
373 return low;
374
375 /*
376 * Binary search to find the first key on the page >= scan key, or first
377 * key > scankey when nextkey is true.
378 *
379 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
380 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
381 *
382 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
383 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
384 *
385 * We can fall out when high == low.
386 */
387 high++; /* establish the loop invariant for high */
388
389 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
390
391 while (high > low)
392 {
393 OffsetNumber mid = low + ((high - low) / 2);
394
395 /* We have low <= mid < high, so mid points at a real slot */
396
397 result = _bt_compare(rel, key, page, mid);
398
399 if (result >= cmpval)
400 low = mid + 1;
401 else
402 high = mid;
403 }
404
405 /*
406 * At this point we have high == low.
407 *
408 * On a leaf page we always return the first non-pivot tuple >= scan key
409 * (resp. > scan key) for forward scan callers. For backward scans, it's
410 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
411 */
412 if (P_ISLEAF(opaque))
413 {
414 /*
415 * In the backward scan case we're supposed to locate the last
416 * matching tuple on the leaf level -- not the first matching tuple
417 * (the last tuple will be the first one returned by the scan).
418 *
419 * At this point we've located the first non-pivot tuple immediately
420 * after the last matching tuple (which might just be maxoff + 1).
421 * Compensate by stepping back.
422 */
423 if (key->backward)
424 return OffsetNumberPrev(low);
425
426 return low;
427 }
428
429 /*
430 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
431 * There must be one if _bt_compare() is playing by the rules.
432 *
433 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
434 * a truncated -inf heap TID is usually enough to prevent it altogether.
435 * Even omitted scan key entries are treated as > truncated attributes.
436 *
437 * However, during backward scans _bt_compare() interprets omitted scan
438 * key attributes as == corresponding truncated -inf attributes instead.
439 * This works just like < would work here. Under this scheme, < strategy
440 * backward scans will always directly descend to the correct leaf page.
441 * In particular, they will never incur an "extra" leaf page access with a
442 * scan key that happens to contain the same prefix of values as some
443 * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
444 * it uses a leaf page high key to "re-find" a page undergoing deletion.
445 */
446 Assert(low > P_FIRSTDATAKEY(opaque));
447
448 return OffsetNumberPrev(low);
449}
450
451/*
452 *
453 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
454 *
455 * Like _bt_binsrch(), but with support for caching the binary search
456 * bounds. Only used during insertion, and only on the leaf page that it
457 * looks like caller will insert tuple on. Exclusive-locked and pinned
458 * leaf page is contained within insertstate.
459 *
460 * Caches the bounds fields in insertstate so that a subsequent call can
461 * reuse the low and strict high bounds of original binary search. Callers
462 * that use these fields directly must be prepared for the case where low
463 * and/or stricthigh are not on the same page (one or both exceed maxoff
464 * for the page). The case where there are no items on the page (high <
465 * low) makes bounds invalid.
466 *
467 * Caller is responsible for invalidating bounds when it modifies the page
468 * before calling here a second time, and for dealing with posting list
469 * tuple matches (callers can use insertstate's postingoff field to
470 * determine which existing heap TID will need to be replaced by a posting
471 * list split).
472 */
475{
476 BTScanInsert key = insertstate->itup_key;
477 Page page;
478 BTPageOpaque opaque;
479 OffsetNumber low,
480 high,
481 stricthigh;
482 int32 result,
483 cmpval;
484
485 page = BufferGetPage(insertstate->buf);
486 opaque = BTPageGetOpaque(page);
487
488 Assert(P_ISLEAF(opaque));
489 Assert(!key->nextkey);
490 Assert(insertstate->postingoff == 0);
491
492 if (!insertstate->bounds_valid)
493 {
494 /* Start new binary search */
495 low = P_FIRSTDATAKEY(opaque);
496 high = PageGetMaxOffsetNumber(page);
497 }
498 else
499 {
500 /* Restore result of previous binary search against same page */
501 low = insertstate->low;
502 high = insertstate->stricthigh;
503 }
504
505 /* If there are no keys on the page, return the first available slot */
506 if (unlikely(high < low))
507 {
508 /* Caller can't reuse bounds */
509 insertstate->low = InvalidOffsetNumber;
510 insertstate->stricthigh = InvalidOffsetNumber;
511 insertstate->bounds_valid = false;
512 return low;
513 }
514
515 /*
516 * Binary search to find the first key on the page >= scan key. (nextkey
517 * is always false when inserting).
518 *
519 * The loop invariant is: all slots before 'low' are < scan key, all slots
520 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
521 * maintained to save additional search effort for caller.
522 *
523 * We can fall out when high == low.
524 */
525 if (!insertstate->bounds_valid)
526 high++; /* establish the loop invariant for high */
527 stricthigh = high; /* high initially strictly higher */
528
529 cmpval = 1; /* !nextkey comparison value */
530
531 while (high > low)
532 {
533 OffsetNumber mid = low + ((high - low) / 2);
534
535 /* We have low <= mid < high, so mid points at a real slot */
536
537 result = _bt_compare(rel, key, page, mid);
538
539 if (result >= cmpval)
540 low = mid + 1;
541 else
542 {
543 high = mid;
544 if (result != 0)
545 stricthigh = high;
546 }
547
548 /*
549 * If tuple at offset located by binary search is a posting list whose
550 * TID range overlaps with caller's scantid, perform posting list
551 * binary search to set postingoff for caller. Caller must split the
552 * posting list when postingoff is set. This should happen
553 * infrequently.
554 */
555 if (unlikely(result == 0 && key->scantid != NULL))
556 {
557 /*
558 * postingoff should never be set more than once per leaf page
559 * binary search. That would mean that there are duplicate table
560 * TIDs in the index, which is never okay. Check for that here.
561 */
562 if (insertstate->postingoff != 0)
564 (errcode(ERRCODE_INDEX_CORRUPTED),
565 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
568 low, stricthigh,
569 BufferGetBlockNumber(insertstate->buf),
571
572 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
573 }
574 }
575
576 /*
577 * On a leaf page, a binary search always returns the first key >= scan
578 * key (at least in !nextkey case), which could be the last slot + 1. This
579 * is also the lower bound of cached search.
580 *
581 * stricthigh may also be the last slot + 1, which prevents caller from
582 * using bounds directly, but is still useful to us if we're called a
583 * second time with cached bounds (cached low will be < stricthigh when
584 * that happens).
585 */
586 insertstate->low = low;
587 insertstate->stricthigh = stricthigh;
588 insertstate->bounds_valid = true;
589
590 return low;
591}
592
593/*----------
594 * _bt_binsrch_posting() -- posting list binary search.
595 *
596 * Helper routine for _bt_binsrch_insert().
597 *
598 * Returns offset into posting list where caller's scantid belongs.
599 *----------
600 */
601static int
603{
604 IndexTuple itup;
605 ItemId itemid;
606 int low,
607 high,
608 mid,
609 res;
610
611 /*
612 * If this isn't a posting tuple, then the index must be corrupt (if it is
613 * an ordinary non-pivot tuple then there must be an existing tuple with a
614 * heap TID that equals inserter's new heap TID/scantid). Defensively
615 * check that tuple is a posting list tuple whose posting list range
616 * includes caller's scantid.
617 *
618 * (This is also needed because contrib/amcheck's rootdescend option needs
619 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
620 */
621 itemid = PageGetItemId(page, offnum);
622 itup = (IndexTuple) PageGetItem(page, itemid);
623 if (!BTreeTupleIsPosting(itup))
624 return 0;
625
626 Assert(key->heapkeyspace && key->allequalimage);
627
628 /*
629 * In the event that posting list tuple has LP_DEAD bit set, indicate this
630 * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
631 * second call to _bt_binsrch_insert() can take place when its caller has
632 * removed the dead item.
633 */
634 if (ItemIdIsDead(itemid))
635 return -1;
636
637 /* "high" is past end of posting list for loop invariant */
638 low = 0;
639 high = BTreeTupleGetNPosting(itup);
640 Assert(high >= 2);
641
642 while (high > low)
643 {
644 mid = low + ((high - low) / 2);
645 res = ItemPointerCompare(key->scantid,
646 BTreeTupleGetPostingN(itup, mid));
647
648 if (res > 0)
649 low = mid + 1;
650 else if (res < 0)
651 high = mid;
652 else
653 return mid;
654 }
655
656 /* Exact match not found */
657 return low;
658}
659
660/*----------
661 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
662 *
663 * page/offnum: location of btree item to be compared to.
664 *
665 * This routine returns:
666 * <0 if scankey < tuple at offnum;
667 * 0 if scankey == tuple at offnum;
668 * >0 if scankey > tuple at offnum.
669 *
670 * NULLs in the keys are treated as sortable values. Therefore
671 * "equality" does not necessarily mean that the item should be returned
672 * to the caller as a matching key. Similarly, an insertion scankey
673 * with its scantid set is treated as equal to a posting tuple whose TID
674 * range overlaps with their scantid. There generally won't be a
675 * matching TID in the posting tuple, which caller must handle
676 * themselves (e.g., by splitting the posting list tuple).
677 *
678 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
679 * "minus infinity": this routine will always claim it is less than the
680 * scankey. The actual key value stored is explicitly truncated to 0
681 * attributes (explicitly minus infinity) with version 3+ indexes, but
682 * that isn't relied upon. This allows us to implement the Lehman and
683 * Yao convention that the first down-link pointer is before the first
684 * key. See backend/access/nbtree/README for details.
685 *----------
686 */
687int32
690 Page page,
691 OffsetNumber offnum)
692{
693 TupleDesc itupdesc = RelationGetDescr(rel);
694 BTPageOpaque opaque = BTPageGetOpaque(page);
695 IndexTuple itup;
696 ItemPointer heapTid;
697 ScanKey scankey;
698 int ncmpkey;
699 int ntupatts;
700 int32 result;
701
702 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
704 Assert(key->heapkeyspace || key->scantid == NULL);
705
706 /*
707 * Force result ">" if target item is first data item on an internal page
708 * --- see NOTE above.
709 */
710 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
711 return 1;
712
713 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
714 ntupatts = BTreeTupleGetNAtts(itup, rel);
715
716 /*
717 * The scan key is set up with the attribute number associated with each
718 * term in the key. It is important that, if the index is multi-key, the
719 * scan contain the first k key attributes, and that they be in order. If
720 * you think about how multi-key ordering works, you'll understand why
721 * this is.
722 *
723 * We don't test for violation of this condition here, however. The
724 * initial setup for the index scan had better have gotten it right (see
725 * _bt_first).
726 */
727
728 ncmpkey = Min(ntupatts, key->keysz);
729 Assert(key->heapkeyspace || ncmpkey == key->keysz);
730 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
731 scankey = key->scankeys;
732 for (int i = 1; i <= ncmpkey; i++)
733 {
734 Datum datum;
735 bool isNull;
736
737 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
738
739 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
740 {
741 if (isNull)
742 result = 0; /* NULL "=" NULL */
743 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
744 result = -1; /* NULL "<" NOT_NULL */
745 else
746 result = 1; /* NULL ">" NOT_NULL */
747 }
748 else if (isNull) /* key is NOT_NULL and item is NULL */
749 {
750 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
751 result = 1; /* NOT_NULL ">" NULL */
752 else
753 result = -1; /* NOT_NULL "<" NULL */
754 }
755 else
756 {
757 /*
758 * The sk_func needs to be passed the index value as left arg and
759 * the sk_argument as right arg (they might be of different
760 * types). Since it is convenient for callers to think of
761 * _bt_compare as comparing the scankey to the index item, we have
762 * to flip the sign of the comparison result. (Unless it's a DESC
763 * column, in which case we *don't* flip the sign.)
764 */
765 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
766 scankey->sk_collation,
767 datum,
768 scankey->sk_argument));
769
770 if (!(scankey->sk_flags & SK_BT_DESC))
771 INVERT_COMPARE_RESULT(result);
772 }
773
774 /* if the keys are unequal, return the difference */
775 if (result != 0)
776 return result;
777
778 scankey++;
779 }
780
781 /*
782 * All non-truncated attributes (other than heap TID) were found to be
783 * equal. Treat truncated attributes as minus infinity when scankey has a
784 * key attribute value that would otherwise be compared directly.
785 *
786 * Note: it doesn't matter if ntupatts includes non-key attributes;
787 * scankey won't, so explicitly excluding non-key attributes isn't
788 * necessary.
789 */
790 if (key->keysz > ntupatts)
791 return 1;
792
793 /*
794 * Use the heap TID attribute and scantid to try to break the tie. The
795 * rules are the same as any other key attribute -- only the
796 * representation differs.
797 */
798 heapTid = BTreeTupleGetHeapTID(itup);
799 if (key->scantid == NULL)
800 {
801 /*
802 * Forward scans have a scankey that is considered greater than a
803 * truncated pivot tuple if and when the scankey has equal values for
804 * attributes up to and including the least significant untruncated
805 * attribute in tuple. Even attributes that were omitted from the
806 * scan key are considered greater than -inf truncated attributes.
807 * (See _bt_binsrch for an explanation of our backward scan behavior.)
808 *
809 * For example, if an index has the minimum two attributes (single
810 * user key attribute, plus heap TID attribute), and a page's high key
811 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
812 * will not descend to the page to the left. The search will descend
813 * right instead. The truncated attribute in pivot tuple means that
814 * all non-pivot tuples on the page to the left are strictly < 'foo',
815 * so it isn't necessary to descend left. In other words, search
816 * doesn't have to descend left because it isn't interested in a match
817 * that has a heap TID value of -inf.
818 *
819 * Note: the heap TID part of the test ensures that scankey is being
820 * compared to a pivot tuple with one or more truncated -inf key
821 * attributes. The heap TID attribute is the last key attribute in
822 * every index, of course, but other than that it isn't special.
823 */
824 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
825 key->heapkeyspace)
826 return 1;
827
828 /* All provided scankey arguments found to be equal */
829 return 0;
830 }
831
832 /*
833 * Treat truncated heap TID as minus infinity, since scankey has a key
834 * attribute value (scantid) that would otherwise be compared directly
835 */
837 if (heapTid == NULL)
838 return 1;
839
840 /*
841 * Scankey must be treated as equal to a posting list tuple if its scantid
842 * value falls within the range of the posting list. In all other cases
843 * there can only be a single heap TID value, which is compared directly
844 * with scantid.
845 */
847 result = ItemPointerCompare(key->scantid, heapTid);
848 if (result <= 0 || !BTreeTupleIsPosting(itup))
849 return result;
850 else
851 {
852 result = ItemPointerCompare(key->scantid,
854 if (result > 0)
855 return 1;
856 }
857
858 return 0;
859}
860
861/*
862 * _bt_first() -- Find the first item in a scan.
863 *
864 * We need to be clever about the direction of scan, the search
865 * conditions, and the tree ordering. We find the first item (or,
866 * if backwards scan, the last item) in the tree that satisfies the
867 * qualifications in the scan key. On success exit, data about the
868 * matching tuple(s) on the page has been loaded into so->currPos. We'll
869 * drop all locks and hold onto a pin on page's buffer, except when
870 * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
871 * _bt_returnitem sets the next item to return to scan on success exit.
872 *
873 * If there are no matching items in the index, we return false, with no
874 * pins or locks held. so->currPos will remain invalid.
875 *
876 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
877 * are both search-type scankeys (see nbtree/README for more about this).
878 * Within this routine, we build a temporary insertion-type scankey to use
879 * in locating the scan start position.
880 */
881bool
883{
884 Relation rel = scan->indexRelation;
885 BTScanOpaque so = (BTScanOpaque) scan->opaque;
886 BTStack stack;
887 OffsetNumber offnum;
888 BTScanInsertData inskey;
889 ScanKey startKeys[INDEX_MAX_KEYS];
890 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891 int keysz = 0;
892 StrategyNumber strat_total;
894 lastcurrblkno;
895
897
898 /*
899 * Examine the scan keys and eliminate any redundant keys; also mark the
900 * keys that must be matched to continue the scan.
901 */
903
904 /*
905 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 * never be satisfied (eg, x == 1 AND x > 2).
907 */
908 if (!so->qual_ok)
909 {
910 Assert(!so->needPrimScan);
911 _bt_parallel_done(scan);
912 return false;
913 }
914
915 /*
916 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 * will likely release the parallel scan later on.
918 */
919 if (scan->parallel_scan != NULL &&
920 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 return false;
922
923 /*
924 * Initialize the scan's arrays (if any) for the current scan direction
925 * (except when they were already set to later values as part of
926 * scheduling the primitive index scan that is now underway)
927 */
928 if (so->numArrayKeys && !so->needPrimScan)
929 _bt_start_array_keys(scan, dir);
930
931 if (blkno != InvalidBlockNumber)
932 {
933 /*
934 * We anticipated calling _bt_search, but another worker bet us to it.
935 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 */
937 Assert(scan->parallel_scan != NULL);
938 Assert(!so->needPrimScan);
939 Assert(blkno != P_NONE);
940
941 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 return false;
943
944 _bt_returnitem(scan, so);
945 return true;
946 }
947
948 /*
949 * Count an indexscan for stats, now that we know that we'll call
950 * _bt_search/_bt_endpoint below
951 */
953
954 /*----------
955 * Examine the scan keys to discover where we need to start the scan.
956 *
957 * We want to identify the keys that can be used as starting boundaries;
958 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
959 * a backwards scan. We can use keys for multiple attributes so long as
960 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
961 * a > or < boundary or find an attribute with no boundary (which can be
962 * thought of as the same as "> -infinity"), we can't use keys for any
963 * attributes to its right, because it would break our simplistic notion
964 * of what initial positioning strategy to use.
965 *
966 * When the scan keys include cross-type operators, _bt_preprocess_keys
967 * may not be able to eliminate redundant keys; in such cases we will
968 * arbitrarily pick a usable one for each attribute. This is correct
969 * but possibly not optimal behavior. (For example, with keys like
970 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
971 * x=5 would be more efficient.) Since the situation only arises given
972 * a poorly-worded query plus an incomplete opfamily, live with it.
973 *
974 * When both equality and inequality keys appear for a single attribute
975 * (again, only possible when cross-type operators appear), we *must*
976 * select one of the equality keys for the starting point, because
977 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
978 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
979 * start at x=4, we will fail and stop before reaching x=10. If multiple
980 * equality quals survive preprocessing, however, it doesn't matter which
981 * one we use --- by definition, they are either redundant or
982 * contradictory.
983 *
984 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
985 * If the index stores nulls at the end of the index we'll be starting
986 * from, and we have no boundary key for the column (which means the key
987 * we deduced NOT NULL from is an inequality key that constrains the other
988 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
989 * use as a boundary key. If we didn't do this, we might find ourselves
990 * traversing a lot of null entries at the start of the scan.
991 *
992 * In this loop, row-comparison keys are treated the same as keys on their
993 * first (leftmost) columns. We'll add on lower-order columns of the row
994 * comparison below, if possible.
995 *
996 * The selected scan keys (at most one per index column) are remembered by
997 * storing their addresses into the local startKeys[] array.
998 *
999 * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1000 * the next primitive index scan (for scans with array keys) based in part
1001 * on an understanding of how it'll enable us to reposition the scan.
1002 * They're directly aware of how we'll sometimes cons up an explicit
1003 * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1004 * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1005 * scans to skip large groups of NULLs through repeated deductions about
1006 * key strictness (for a required inequality key) and whether NULLs in the
1007 * key's index column are stored last or first (relative to non-NULLs).
1008 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1009 * need to be kept in sync.
1010 *----------
1011 */
1012 strat_total = BTEqualStrategyNumber;
1013 if (so->numberOfKeys > 0)
1014 {
1015 AttrNumber curattr;
1016 ScanKey chosen;
1017 ScanKey impliesNN;
1018 ScanKey cur;
1019
1020 /*
1021 * chosen is the so-far-chosen key for the current attribute, if any.
1022 * We don't cast the decision in stone until we reach keys for the
1023 * next attribute.
1024 */
1025 cur = so->keyData;
1026 curattr = 1;
1027 chosen = NULL;
1028 /* Also remember any scankey that implies a NOT NULL constraint */
1029 impliesNN = NULL;
1030
1031 /*
1032 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1033 * pass to handle after-last-key processing. Actual exit from the
1034 * loop is at one of the "break" statements below.
1035 */
1036 for (int i = 0;; cur++, i++)
1037 {
1038 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1039 {
1040 /*
1041 * Done looking at keys for curattr. If we didn't find a
1042 * usable boundary key, see if we can deduce a NOT NULL key.
1043 */
1044 if (chosen == NULL && impliesNN != NULL &&
1045 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1048 {
1049 /* Yes, so build the key in notnullkeys[keysz] */
1050 chosen = &notnullkeys[keysz];
1053 (impliesNN->sk_flags &
1055 curattr,
1056 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1059 InvalidOid,
1060 InvalidOid,
1061 InvalidOid,
1062 (Datum) 0);
1063 }
1064
1065 /*
1066 * If we still didn't find a usable boundary key, quit; else
1067 * save the boundary key pointer in startKeys.
1068 */
1069 if (chosen == NULL)
1070 break;
1071 startKeys[keysz++] = chosen;
1072
1073 /*
1074 * We can only consider adding more boundary keys when the one
1075 * that we just chose to add uses either the = or >= strategy
1076 * (during backwards scans we can only do so when the key that
1077 * we just added to startKeys[] uses the = or <= strategy)
1078 */
1079 strat_total = chosen->sk_strategy;
1080 if (strat_total == BTGreaterStrategyNumber ||
1081 strat_total == BTLessStrategyNumber)
1082 break;
1083
1084 /*
1085 * Done if that was the last attribute, or if next key is not
1086 * in sequence (implying no boundary key is available for the
1087 * next attribute).
1088 */
1089 if (i >= so->numberOfKeys ||
1090 cur->sk_attno != curattr + 1)
1091 break;
1092
1093 /*
1094 * Reset for next attr.
1095 */
1096 curattr = cur->sk_attno;
1097 chosen = NULL;
1098 impliesNN = NULL;
1099 }
1100
1101 /*
1102 * Can we use this key as a starting boundary for this attr?
1103 *
1104 * If not, does it imply a NOT NULL constraint? (Because
1105 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1106 * *any* inequality key works for that; we need not test.)
1107 */
1108 switch (cur->sk_strategy)
1109 {
1112 if (chosen == NULL)
1113 {
1114 if (ScanDirectionIsBackward(dir))
1115 chosen = cur;
1116 else
1117 impliesNN = cur;
1118 }
1119 break;
1121 /* override any non-equality choice */
1122 chosen = cur;
1123 break;
1126 if (chosen == NULL)
1127 {
1128 if (ScanDirectionIsForward(dir))
1129 chosen = cur;
1130 else
1131 impliesNN = cur;
1132 }
1133 break;
1134 }
1135 }
1136 }
1137
1138 /*
1139 * If we found no usable boundary keys, we have to start from one end of
1140 * the tree. Walk down that edge to the first or last key, and scan from
1141 * there.
1142 *
1143 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1144 */
1145 if (keysz == 0)
1146 return _bt_endpoint(scan, dir);
1147
1148 /*
1149 * We want to start the scan somewhere within the index. Set up an
1150 * insertion scankey we can use to search for the boundary point we
1151 * identified above. The insertion scankey is built using the keys
1152 * identified by startKeys[]. (Remaining insertion scankey fields are
1153 * initialized after initial-positioning scan keys are finalized.)
1154 */
1155 Assert(keysz <= INDEX_MAX_KEYS);
1156 for (int i = 0; i < keysz; i++)
1157 {
1158 ScanKey cur = startKeys[i];
1159
1160 Assert(cur->sk_attno == i + 1);
1161
1162 if (cur->sk_flags & SK_ROW_HEADER)
1163 {
1164 /*
1165 * Row comparison header: look to the first row member instead
1166 */
1167 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1168
1169 /*
1170 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1171 * would've marked the qual as unsatisfiable, preventing us from
1172 * ever getting this far
1173 */
1174 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1175 Assert(subkey->sk_attno == cur->sk_attno);
1176 Assert(!(subkey->sk_flags & SK_ISNULL));
1177
1178 /*
1179 * The member scankeys are already in insertion format (ie, they
1180 * have sk_func = 3-way-comparison function)
1181 */
1182 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1183
1184 /*
1185 * If the row comparison is the last positioning key we accepted,
1186 * try to add additional keys from the lower-order row members.
1187 * (If we accepted independent conditions on additional index
1188 * columns, we use those instead --- doesn't seem worth trying to
1189 * determine which is more restrictive.) Note that this is OK
1190 * even if the row comparison is of ">" or "<" type, because the
1191 * condition applied to all but the last row member is effectively
1192 * ">=" or "<=", and so the extra keys don't break the positioning
1193 * scheme. But, by the same token, if we aren't able to use all
1194 * the row members, then the part of the row comparison that we
1195 * did use has to be treated as just a ">=" or "<=" condition, and
1196 * so we'd better adjust strat_total accordingly.
1197 */
1198 if (i == keysz - 1)
1199 {
1200 bool used_all_subkeys = false;
1201
1202 Assert(!(subkey->sk_flags & SK_ROW_END));
1203 for (;;)
1204 {
1205 subkey++;
1206 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1207 if (subkey->sk_attno != keysz + 1)
1208 break; /* out-of-sequence, can't use it */
1209 if (subkey->sk_strategy != cur->sk_strategy)
1210 break; /* wrong direction, can't use it */
1211 if (subkey->sk_flags & SK_ISNULL)
1212 break; /* can't use null keys */
1213 Assert(keysz < INDEX_MAX_KEYS);
1214 memcpy(inskey.scankeys + keysz, subkey,
1215 sizeof(ScanKeyData));
1216 keysz++;
1217 if (subkey->sk_flags & SK_ROW_END)
1218 {
1219 used_all_subkeys = true;
1220 break;
1221 }
1222 }
1223 if (!used_all_subkeys)
1224 {
1225 switch (strat_total)
1226 {
1228 strat_total = BTLessEqualStrategyNumber;
1229 break;
1231 strat_total = BTGreaterEqualStrategyNumber;
1232 break;
1233 }
1234 }
1235 break; /* done with outer loop */
1236 }
1237 }
1238 else
1239 {
1240 /*
1241 * Ordinary comparison key. Transform the search-style scan key
1242 * to an insertion scan key by replacing the sk_func with the
1243 * appropriate btree comparison function.
1244 *
1245 * If scankey operator is not a cross-type comparison, we can use
1246 * the cached comparison function; otherwise gotta look it up in
1247 * the catalogs. (That can't lead to infinite recursion, since no
1248 * indexscan initiated by syscache lookup will use cross-data-type
1249 * operators.)
1250 *
1251 * We support the convention that sk_subtype == InvalidOid means
1252 * the opclass input type; this is a hack to simplify life for
1253 * ScanKeyInit().
1254 */
1255 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1256 cur->sk_subtype == InvalidOid)
1257 {
1258 FmgrInfo *procinfo;
1259
1260 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1261 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1262 cur->sk_flags,
1263 cur->sk_attno,
1265 cur->sk_subtype,
1266 cur->sk_collation,
1267 procinfo,
1268 cur->sk_argument);
1269 }
1270 else
1271 {
1272 RegProcedure cmp_proc;
1273
1274 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1275 rel->rd_opcintype[i],
1276 cur->sk_subtype,
1277 BTORDER_PROC);
1278 if (!RegProcedureIsValid(cmp_proc))
1279 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1280 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1281 cur->sk_attno, RelationGetRelationName(rel));
1282 ScanKeyEntryInitialize(inskey.scankeys + i,
1283 cur->sk_flags,
1284 cur->sk_attno,
1286 cur->sk_subtype,
1287 cur->sk_collation,
1288 cmp_proc,
1289 cur->sk_argument);
1290 }
1291 }
1292 }
1293
1294 /*----------
1295 * Examine the selected initial-positioning strategy to determine exactly
1296 * where we need to start the scan, and set flag variables to control the
1297 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1298 * page _bt_search returns).
1299 *----------
1300 */
1301 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1302 inskey.anynullkeys = false; /* unused */
1303 inskey.scantid = NULL;
1304 inskey.keysz = keysz;
1305 switch (strat_total)
1306 {
1308
1309 inskey.nextkey = false;
1310 inskey.backward = true;
1311 break;
1312
1314
1315 inskey.nextkey = true;
1316 inskey.backward = true;
1317 break;
1318
1320
1321 /*
1322 * If a backward scan was specified, need to start with last equal
1323 * item not first one.
1324 */
1325 if (ScanDirectionIsBackward(dir))
1326 {
1327 /*
1328 * This is the same as the <= strategy
1329 */
1330 inskey.nextkey = true;
1331 inskey.backward = true;
1332 }
1333 else
1334 {
1335 /*
1336 * This is the same as the >= strategy
1337 */
1338 inskey.nextkey = false;
1339 inskey.backward = false;
1340 }
1341 break;
1342
1344
1345 /*
1346 * Find first item >= scankey
1347 */
1348 inskey.nextkey = false;
1349 inskey.backward = false;
1350 break;
1351
1353
1354 /*
1355 * Find first item > scankey
1356 */
1357 inskey.nextkey = true;
1358 inskey.backward = false;
1359 break;
1360
1361 default:
1362 /* can't get here, but keep compiler quiet */
1363 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1364 return false;
1365 }
1366
1367 /*
1368 * Use the manufactured insertion scan key to descend the tree and
1369 * position ourselves on the target leaf page.
1370 */
1371 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1372 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1373
1374 /* don't need to keep the stack around... */
1375 _bt_freestack(stack);
1376
1377 if (!BufferIsValid(so->currPos.buf))
1378 {
1379 /*
1380 * We only get here if the index is completely empty. Lock relation
1381 * because nothing finer to lock exists. Without a buffer lock, it's
1382 * possible for another transaction to insert data between
1383 * _bt_search() and PredicateLockRelation(). We have to try again
1384 * after taking the relation-level predicate lock, to close a narrow
1385 * window where we wouldn't scan concurrently inserted tuples, but the
1386 * writer wouldn't see our predicate lock.
1387 */
1389 {
1391 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1392 _bt_freestack(stack);
1393 }
1394
1395 if (!BufferIsValid(so->currPos.buf))
1396 {
1397 Assert(!so->needPrimScan);
1398 _bt_parallel_done(scan);
1399 return false;
1400 }
1401 }
1402
1403 /* position to the precise item on the page */
1404 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1405
1406 /*
1407 * Now load data from the first page of the scan (usually the page
1408 * currently in so->currPos.buf).
1409 *
1410 * If inskey.nextkey = false and inskey.backward = false, offnum is
1411 * positioned at the first non-pivot tuple >= inskey.scankeys.
1412 *
1413 * If inskey.nextkey = false and inskey.backward = true, offnum is
1414 * positioned at the last non-pivot tuple < inskey.scankeys.
1415 *
1416 * If inskey.nextkey = true and inskey.backward = false, offnum is
1417 * positioned at the first non-pivot tuple > inskey.scankeys.
1418 *
1419 * If inskey.nextkey = true and inskey.backward = true, offnum is
1420 * positioned at the last non-pivot tuple <= inskey.scankeys.
1421 *
1422 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1423 * for the page. For example, when inskey is both < the leaf page's high
1424 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1425 */
1426 if (!_bt_readfirstpage(scan, offnum, dir))
1427 return false;
1428
1429 _bt_returnitem(scan, so);
1430 return true;
1431}
1432
1433/*
1434 * _bt_next() -- Get the next item in a scan.
1435 *
1436 * On entry, so->currPos describes the current page, which may be pinned
1437 * but is not locked, and so->currPos.itemIndex identifies which item was
1438 * previously returned.
1439 *
1440 * On success exit, so->currPos is updated as needed, and _bt_returnitem
1441 * sets the next item to return to the scan. so->currPos remains valid.
1442 *
1443 * On failure exit (no more tuples), we invalidate so->currPos. It'll
1444 * still be possible for the scan to return tuples by changing direction,
1445 * though we'll need to call _bt_first anew in that other direction.
1446 */
1447bool
1449{
1450 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1451
1453
1454 /*
1455 * Advance to next tuple on current page; or if there's no more, try to
1456 * step to the next page with data.
1457 */
1458 if (ScanDirectionIsForward(dir))
1459 {
1460 if (++so->currPos.itemIndex > so->currPos.lastItem)
1461 {
1462 if (!_bt_steppage(scan, dir))
1463 return false;
1464 }
1465 }
1466 else
1467 {
1468 if (--so->currPos.itemIndex < so->currPos.firstItem)
1469 {
1470 if (!_bt_steppage(scan, dir))
1471 return false;
1472 }
1473 }
1474
1475 _bt_returnitem(scan, so);
1476 return true;
1477}
1478
1479/*
1480 * _bt_readpage() -- Load data from current index page into so->currPos
1481 *
1482 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1483 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1484 * they are updated as appropriate. All other fields of so->currPos are
1485 * initialized from scratch here.
1486 *
1487 * We scan the current page starting at offnum and moving in the indicated
1488 * direction. All items matching the scan keys are loaded into currPos.items.
1489 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1490 * that there can be no more matching tuples in the current scan direction
1491 * (could just be for the current primitive index scan when scan has arrays).
1492 *
1493 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1494 * prior to calling this function; this function will invoke
1495 * _bt_parallel_release before returning.
1496 *
1497 * Returns true if any matching items found on the page, false if none.
1498 */
1499static bool
1501 bool firstPage)
1502{
1503 Relation rel = scan->indexRelation;
1504 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1505 Page page;
1506 BTPageOpaque opaque;
1507 OffsetNumber minoff;
1508 OffsetNumber maxoff;
1509 BTReadPageState pstate;
1510 bool arrayKeys;
1511 int itemIndex,
1512 indnatts;
1513
1514 /* save the page/buffer block number, along with its sibling links */
1515 page = BufferGetPage(so->currPos.buf);
1516 opaque = BTPageGetOpaque(page);
1518 so->currPos.prevPage = opaque->btpo_prev;
1519 so->currPos.nextPage = opaque->btpo_next;
1520
1521 Assert(!P_IGNORE(opaque));
1523 Assert(!so->needPrimScan);
1524
1525 if (scan->parallel_scan)
1526 {
1527 /* allow next/prev page to be read by other worker without delay */
1528 if (ScanDirectionIsForward(dir))
1530 so->currPos.currPage);
1531 else
1533 so->currPos.currPage);
1534 }
1535
1536 /* initialize remaining currPos fields related to current page */
1538 so->currPos.dir = dir;
1539 so->currPos.nextTupleOffset = 0;
1540 /* either moreLeft or moreRight should be set now (may be unset later) */
1542 so->currPos.moreLeft);
1543
1545
1546 /* initialize local variables */
1547 indnatts = IndexRelationGetNumberOfAttributes(rel);
1548 arrayKeys = so->numArrayKeys != 0;
1549 minoff = P_FIRSTDATAKEY(opaque);
1550 maxoff = PageGetMaxOffsetNumber(page);
1551
1552 /* initialize page-level state that we'll pass to _bt_checkkeys */
1553 pstate.minoff = minoff;
1554 pstate.maxoff = maxoff;
1555 pstate.finaltup = NULL;
1556 pstate.page = page;
1557 pstate.offnum = InvalidOffsetNumber;
1558 pstate.skip = InvalidOffsetNumber;
1559 pstate.continuescan = true; /* default assumption */
1560 pstate.prechecked = false;
1561 pstate.firstmatch = false;
1562 pstate.rechecks = 0;
1563 pstate.targetdistance = 0;
1564
1565 /*
1566 * Prechecking the value of the continuescan flag for the last item on the
1567 * page (for backwards scan it will be the first item on a page). If we
1568 * observe it to be true, then it should be true for all other items. This
1569 * allows us to do significant optimizations in the _bt_checkkeys()
1570 * function for all the items on the page.
1571 *
1572 * With the forward scan, we do this check for the last item on the page
1573 * instead of the high key. It's relatively likely that the most
1574 * significant column in the high key will be different from the
1575 * corresponding value from the last item on the page. So checking with
1576 * the last item on the page would give a more precise answer.
1577 *
1578 * We skip this for the first page read by each (primitive) scan, to avoid
1579 * slowing down point queries. They typically don't stand to gain much
1580 * when the optimization can be applied, and are more likely to notice the
1581 * overhead of the precheck.
1582 *
1583 * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1584 * just set a low-order required array's key to the best available match
1585 * for a truncated -inf attribute value from the prior page's high key
1586 * (array element 0 is always the best available match in this scenario).
1587 * It's quite likely that matches for array element 0 begin on this page,
1588 * but the start of matches won't necessarily align with page boundaries.
1589 * When the start of matches is somewhere in the middle of this page, it
1590 * would be wrong to treat page's final non-pivot tuple as representative.
1591 * Doing so might lead us to treat some of the page's earlier tuples as
1592 * being part of a group of tuples thought to satisfy the required keys.
1593 *
1594 * Note: Conversely, in the case where the scan's arrays just advanced
1595 * using the prior page's HIKEY _without_ advancement setting scanBehind,
1596 * the start of matches must be aligned with page boundaries, which makes
1597 * it safe to attempt the optimization here now. It's also safe when the
1598 * prior page's HIKEY simply didn't need to advance any required array. In
1599 * both cases we can safely assume that the _first_ tuple from this page
1600 * must be >= the current set of array keys/equality constraints. And so
1601 * if the final tuple is == those same keys (and also satisfies any
1602 * required < or <= strategy scan keys) during the precheck, we can safely
1603 * assume that this must also be true of all earlier tuples from the page.
1604 */
1605 if (!firstPage && !so->scanBehind && minoff < maxoff)
1606 {
1607 ItemId iid;
1608 IndexTuple itup;
1609
1610 iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1611 itup = (IndexTuple) PageGetItem(page, iid);
1612
1613 /* Call with arrayKeys=false to avoid undesirable side-effects */
1614 _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1615 pstate.prechecked = pstate.continuescan;
1616 pstate.continuescan = true; /* reset */
1617 }
1618
1619 if (ScanDirectionIsForward(dir))
1620 {
1621 /* SK_SEARCHARRAY forward scans must provide high key up front */
1622 if (arrayKeys && !P_RIGHTMOST(opaque))
1623 {
1624 ItemId iid = PageGetItemId(page, P_HIKEY);
1625
1626 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1627
1628 if (unlikely(so->oppositeDirCheck))
1629 {
1630 Assert(so->scanBehind);
1631
1632 /*
1633 * Last _bt_readpage call scheduled a recheck of finaltup for
1634 * required scan keys up to and including a > or >= scan key.
1635 *
1636 * _bt_checkkeys won't consider the scanBehind flag unless the
1637 * scan is stopped by a scan key required in the current scan
1638 * direction. We need this recheck so that we'll notice when
1639 * all tuples on this page are still before the _bt_first-wise
1640 * start of matches for the current set of array keys.
1641 */
1642 if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
1643 {
1644 /* Schedule another primitive index scan after all */
1645 so->currPos.moreRight = false;
1646 so->needPrimScan = true;
1647 return false;
1648 }
1649
1650 /* Deliberately don't unset scanBehind flag just yet */
1651 }
1652 }
1653
1654 /* load items[] in ascending order */
1655 itemIndex = 0;
1656
1657 offnum = Max(offnum, minoff);
1658
1659 while (offnum <= maxoff)
1660 {
1661 ItemId iid = PageGetItemId(page, offnum);
1662 IndexTuple itup;
1663 bool passes_quals;
1664
1665 /*
1666 * If the scan specifies not to return killed tuples, then we
1667 * treat a killed tuple as not passing the qual
1668 */
1669 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1670 {
1671 offnum = OffsetNumberNext(offnum);
1672 continue;
1673 }
1674
1675 itup = (IndexTuple) PageGetItem(page, iid);
1676 Assert(!BTreeTupleIsPivot(itup));
1677
1678 pstate.offnum = offnum;
1679 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1680 itup, indnatts);
1681
1682 /*
1683 * Check if we need to skip ahead to a later tuple (only possible
1684 * when the scan uses array keys)
1685 */
1686 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1687 {
1688 Assert(!passes_quals && pstate.continuescan);
1689 Assert(offnum < pstate.skip);
1690
1691 offnum = pstate.skip;
1692 pstate.skip = InvalidOffsetNumber;
1693 continue;
1694 }
1695
1696 if (passes_quals)
1697 {
1698 /* tuple passes all scan key conditions */
1699 pstate.firstmatch = true;
1700 if (!BTreeTupleIsPosting(itup))
1701 {
1702 /* Remember it */
1703 _bt_saveitem(so, itemIndex, offnum, itup);
1704 itemIndex++;
1705 }
1706 else
1707 {
1708 int tupleOffset;
1709
1710 /*
1711 * Set up state to return posting list, and remember first
1712 * TID
1713 */
1714 tupleOffset =
1715 _bt_setuppostingitems(so, itemIndex, offnum,
1716 BTreeTupleGetPostingN(itup, 0),
1717 itup);
1718 itemIndex++;
1719 /* Remember additional TIDs */
1720 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1721 {
1722 _bt_savepostingitem(so, itemIndex, offnum,
1723 BTreeTupleGetPostingN(itup, i),
1724 tupleOffset);
1725 itemIndex++;
1726 }
1727 }
1728 }
1729 /* When !continuescan, there can't be any more matches, so stop */
1730 if (!pstate.continuescan)
1731 break;
1732
1733 offnum = OffsetNumberNext(offnum);
1734 }
1735
1736 /*
1737 * We don't need to visit page to the right when the high key
1738 * indicates that no more matches will be found there.
1739 *
1740 * Checking the high key like this works out more often than you might
1741 * think. Leaf page splits pick a split point between the two most
1742 * dissimilar tuples (this is weighed against the need to evenly share
1743 * free space). Leaf pages with high key attribute values that can
1744 * only appear on non-pivot tuples on the right sibling page are
1745 * common.
1746 */
1747 if (pstate.continuescan && !P_RIGHTMOST(opaque))
1748 {
1749 ItemId iid = PageGetItemId(page, P_HIKEY);
1750 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1751 int truncatt;
1752
1753 truncatt = BTreeTupleGetNAtts(itup, rel);
1754 pstate.prechecked = false; /* precheck didn't cover HIKEY */
1755 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1756 }
1757
1758 if (!pstate.continuescan)
1759 so->currPos.moreRight = false;
1760
1761 Assert(itemIndex <= MaxTIDsPerBTreePage);
1762 so->currPos.firstItem = 0;
1763 so->currPos.lastItem = itemIndex - 1;
1764 so->currPos.itemIndex = 0;
1765 }
1766 else
1767 {
1768 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1769 if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1770 {
1771 ItemId iid = PageGetItemId(page, minoff);
1772
1773 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1774 }
1775
1776 /* load items[] in descending order */
1777 itemIndex = MaxTIDsPerBTreePage;
1778
1779 offnum = Min(offnum, maxoff);
1780
1781 while (offnum >= minoff)
1782 {
1783 ItemId iid = PageGetItemId(page, offnum);
1784 IndexTuple itup;
1785 bool tuple_alive;
1786 bool passes_quals;
1787
1788 /*
1789 * If the scan specifies not to return killed tuples, then we
1790 * treat a killed tuple as not passing the qual. Most of the
1791 * time, it's a win to not bother examining the tuple's index
1792 * keys, but just skip to the next tuple (previous, actually,
1793 * since we're scanning backwards). However, if this is the first
1794 * tuple on the page, we do check the index keys, to prevent
1795 * uselessly advancing to the page to the left. This is similar
1796 * to the high key optimization used by forward scans.
1797 */
1798 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1799 {
1800 if (offnum > minoff)
1801 {
1802 offnum = OffsetNumberPrev(offnum);
1803 continue;
1804 }
1805
1806 tuple_alive = false;
1807 }
1808 else
1809 tuple_alive = true;
1810
1811 itup = (IndexTuple) PageGetItem(page, iid);
1812 Assert(!BTreeTupleIsPivot(itup));
1813
1814 pstate.offnum = offnum;
1815 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1816 itup, indnatts);
1817
1818 /*
1819 * Check if we need to skip ahead to a later tuple (only possible
1820 * when the scan uses array keys)
1821 */
1822 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1823 {
1824 Assert(!passes_quals && pstate.continuescan);
1825 Assert(offnum > pstate.skip);
1826
1827 offnum = pstate.skip;
1828 pstate.skip = InvalidOffsetNumber;
1829 continue;
1830 }
1831
1832 if (passes_quals && tuple_alive)
1833 {
1834 /* tuple passes all scan key conditions */
1835 pstate.firstmatch = true;
1836 if (!BTreeTupleIsPosting(itup))
1837 {
1838 /* Remember it */
1839 itemIndex--;
1840 _bt_saveitem(so, itemIndex, offnum, itup);
1841 }
1842 else
1843 {
1844 int tupleOffset;
1845
1846 /*
1847 * Set up state to return posting list, and remember first
1848 * TID.
1849 *
1850 * Note that we deliberately save/return items from
1851 * posting lists in ascending heap TID order for backwards
1852 * scans. This allows _bt_killitems() to make a
1853 * consistent assumption about the order of items
1854 * associated with the same posting list tuple.
1855 */
1856 itemIndex--;
1857 tupleOffset =
1858 _bt_setuppostingitems(so, itemIndex, offnum,
1859 BTreeTupleGetPostingN(itup, 0),
1860 itup);
1861 /* Remember additional TIDs */
1862 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1863 {
1864 itemIndex--;
1865 _bt_savepostingitem(so, itemIndex, offnum,
1866 BTreeTupleGetPostingN(itup, i),
1867 tupleOffset);
1868 }
1869 }
1870 }
1871 /* When !continuescan, there can't be any more matches, so stop */
1872 if (!pstate.continuescan)
1873 break;
1874
1875 offnum = OffsetNumberPrev(offnum);
1876 }
1877
1878 /*
1879 * We don't need to visit page to the left when no more matches will
1880 * be found there
1881 */
1882 if (!pstate.continuescan)
1883 so->currPos.moreLeft = false;
1884
1885 Assert(itemIndex >= 0);
1886 so->currPos.firstItem = itemIndex;
1889 }
1890
1891 return (so->currPos.firstItem <= so->currPos.lastItem);
1892}
1893
1894/* Save an index item into so->currPos.items[itemIndex] */
1895static void
1896_bt_saveitem(BTScanOpaque so, int itemIndex,
1897 OffsetNumber offnum, IndexTuple itup)
1898{
1899 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1900
1902
1903 currItem->heapTid = itup->t_tid;
1904 currItem->indexOffset = offnum;
1905 if (so->currTuples)
1906 {
1907 Size itupsz = IndexTupleSize(itup);
1908
1909 currItem->tupleOffset = so->currPos.nextTupleOffset;
1910 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1911 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1912 }
1913}
1914
1915/*
1916 * Setup state to save TIDs/items from a single posting list tuple.
1917 *
1918 * Saves an index item into so->currPos.items[itemIndex] for TID that is
1919 * returned to scan first. Second or subsequent TIDs for posting list should
1920 * be saved by calling _bt_savepostingitem().
1921 *
1922 * Returns an offset into tuple storage space that main tuple is stored at if
1923 * needed.
1924 */
1925static int
1927 ItemPointer heapTid, IndexTuple itup)
1928{
1929 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1930
1932
1933 currItem->heapTid = *heapTid;
1934 currItem->indexOffset = offnum;
1935 if (so->currTuples)
1936 {
1937 /* Save base IndexTuple (truncate posting list) */
1938 IndexTuple base;
1939 Size itupsz = BTreeTupleGetPostingOffset(itup);
1940
1941 itupsz = MAXALIGN(itupsz);
1942 currItem->tupleOffset = so->currPos.nextTupleOffset;
1943 base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1944 memcpy(base, itup, itupsz);
1945 /* Defensively reduce work area index tuple header size */
1946 base->t_info &= ~INDEX_SIZE_MASK;
1947 base->t_info |= itupsz;
1948 so->currPos.nextTupleOffset += itupsz;
1949
1950 return currItem->tupleOffset;
1951 }
1952
1953 return 0;
1954}
1955
1956/*
1957 * Save an index item into so->currPos.items[itemIndex] for current posting
1958 * tuple.
1959 *
1960 * Assumes that _bt_setuppostingitems() has already been called for current
1961 * posting list tuple. Caller passes its return value as tupleOffset.
1962 */
1963static inline void
1965 ItemPointer heapTid, int tupleOffset)
1966{
1967 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1968
1969 currItem->heapTid = *heapTid;
1970 currItem->indexOffset = offnum;
1971
1972 /*
1973 * Have index-only scans return the same base IndexTuple for every TID
1974 * that originates from the same posting list
1975 */
1976 if (so->currTuples)
1977 currItem->tupleOffset = tupleOffset;
1978}
1979
1980/*
1981 * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
1982 * index scan by setting the relevant fields in caller's index scan descriptor
1983 */
1984static inline void
1986{
1987 BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1988
1989 /* Most recent _bt_readpage must have succeeded */
1993
1994 /* Return next item, per amgettuple contract */
1995 scan->xs_heaptid = currItem->heapTid;
1996 if (so->currTuples)
1997 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1998}
1999
2000/*
2001 * _bt_steppage() -- Step to next page containing valid data for scan
2002 *
2003 * Wrapper on _bt_readnextpage that performs final steps for the current page.
2004 *
2005 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
2006 * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
2007 * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
2008 * a valid block, in any case.
2009 */
2010static bool
2012{
2013 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2014 BlockNumber blkno,
2015 lastcurrblkno;
2016
2018
2019 /* Before leaving current page, deal with any killed items */
2020 if (so->numKilled > 0)
2021 _bt_killitems(scan);
2022
2023 /*
2024 * Before we modify currPos, make a copy of the page data if there was a
2025 * mark position that needs it.
2026 */
2027 if (so->markItemIndex >= 0)
2028 {
2029 /* bump pin on current buffer for assignment to mark buffer */
2030 if (BTScanPosIsPinned(so->currPos))
2032 memcpy(&so->markPos, &so->currPos,
2033 offsetof(BTScanPosData, items[1]) +
2034 so->currPos.lastItem * sizeof(BTScanPosItem));
2035 if (so->markTuples)
2036 memcpy(so->markTuples, so->currTuples,
2039 so->markItemIndex = -1;
2040
2041 /*
2042 * If we're just about to start the next primitive index scan
2043 * (possible with a scan that has arrays keys, and needs to skip to
2044 * continue in the current scan direction), moreLeft/moreRight only
2045 * indicate the end of the current primitive index scan. They must
2046 * never be taken to indicate that the top-level index scan has ended
2047 * (that would be wrong).
2048 *
2049 * We could handle this case by treating the current array keys as
2050 * markPos state. But depending on the current array state like this
2051 * would add complexity. Instead, we just unset markPos's copy of
2052 * moreRight or moreLeft (whichever might be affected), while making
2053 * btrestrpos reset the scan's arrays to their initial scan positions.
2054 * In effect, btrestrpos leaves advancing the arrays up to the first
2055 * _bt_readpage call (that takes place after it has restored markPos).
2056 */
2057 if (so->needPrimScan)
2058 {
2060 so->markPos.moreRight = true;
2061 else
2062 so->markPos.moreLeft = true;
2063 }
2064
2065 /* mark/restore not supported by parallel scans */
2066 Assert(!scan->parallel_scan);
2067 }
2068
2070
2071 /* Walk to the next page with data */
2072 if (ScanDirectionIsForward(dir))
2073 blkno = so->currPos.nextPage;
2074 else
2075 blkno = so->currPos.prevPage;
2076 lastcurrblkno = so->currPos.currPage;
2077
2078 /*
2079 * Cancel primitive index scans that were scheduled when the call to
2080 * _bt_readpage for currPos happened to use the opposite direction to the
2081 * one that we're stepping in now. (It's okay to leave the scan's array
2082 * keys as-is, since the next _bt_readpage will advance them.)
2083 */
2084 if (so->currPos.dir != dir)
2085 so->needPrimScan = false;
2086
2087 return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
2088}
2089
2090/*
2091 * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
2092 *
2093 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
2094 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
2095 * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
2096 * its scan key was marked required), so _bt_first _must_ pass us an offnum
2097 * exactly at the beginning of where equal tuples are to be found. When we're
2098 * passed an offnum past the end of the page, we might still manage to stop
2099 * the scan on this page by calling _bt_checkkeys against the high key. See
2100 * _bt_readpage for full details.
2101 *
2102 * On entry, so->currPos must be pinned and locked (so offnum stays valid).
2103 * Parallel scan callers must have seized the scan before calling here.
2104 *
2105 * On exit, we'll have updated so->currPos and retained locks and pins
2106 * according to the same rules as those laid out for _bt_readnextpage exit.
2107 * Like _bt_readnextpage, our return value indicates if there are any matching
2108 * records in the given direction.
2109 *
2110 * We always release the scan for a parallel scan caller, regardless of
2111 * success or failure; we'll call _bt_parallel_release as soon as possible.
2112 */
2113static bool
2115{
2116 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2117
2118 so->numKilled = 0; /* just paranoia */
2119 so->markItemIndex = -1; /* ditto */
2120
2121 /* Initialize so->currPos for the first page (page in so->currPos.buf) */
2122 if (so->needPrimScan)
2123 {
2124 Assert(so->numArrayKeys);
2125
2126 so->currPos.moreLeft = true;
2127 so->currPos.moreRight = true;
2128 so->needPrimScan = false;
2129 }
2130 else if (ScanDirectionIsForward(dir))
2131 {
2132 so->currPos.moreLeft = false;
2133 so->currPos.moreRight = true;
2134 }
2135 else
2136 {
2137 so->currPos.moreLeft = true;
2138 so->currPos.moreRight = false;
2139 }
2140
2141 /*
2142 * Attempt to load matching tuples from the first page.
2143 *
2144 * Note that _bt_readpage will finish initializing the so->currPos fields.
2145 * _bt_readpage also releases parallel scan (even when it returns false).
2146 */
2147 if (_bt_readpage(scan, dir, offnum, true))
2148 {
2149 /*
2150 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2151 * so->currPos.buf in preparation for btgettuple returning tuples.
2152 */
2155 return true;
2156 }
2157
2158 /* There's no actually-matching data on the page in so->currPos.buf */
2160
2161 /* Call _bt_readnextpage using its _bt_steppage wrapper function */
2162 if (!_bt_steppage(scan, dir))
2163 return false;
2164
2165 /* _bt_readpage for a later page (now in so->currPos) succeeded */
2166 return true;
2167}
2168
2169/*
2170 * _bt_readnextpage() -- Read next page containing valid data for _bt_next
2171 *
2172 * Caller's blkno is the next interesting page's link, taken from either the
2173 * previously-saved right link or left link. lastcurrblkno is the page that
2174 * was current at the point where the blkno link was saved, which we use to
2175 * reason about concurrent page splits/page deletions during backwards scans.
2176 *
2177 * On entry, caller shouldn't hold any locks or pins on any page (we work
2178 * directly off of blkno and lastcurrblkno instead). Parallel scan callers
2179 * that seized the scan before calling here should pass seized=true; such a
2180 * caller's blkno and lastcurrblkno arguments come from the seized scan.
2181 * seized=false callers just pass us the blkno/lastcurrblkno taken from their
2182 * so->currPos, which (along with so->currPos itself) can be used to end the
2183 * scan. A seized=false caller's blkno can never be assumed to be the page
2184 * that must be read next during a parallel scan, though. We must figure that
2185 * part out for ourselves by seizing the scan (the correct page to read might
2186 * already be beyond the seized=false caller's blkno during a parallel scan).
2187 *
2188 * On success exit, so->currPos is updated to contain data from the next
2189 * interesting page, and we return true. We hold a pin on the buffer on
2190 * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
2191 * to eagerly drop the pin (to avoid blocking VACUUM).
2192 *
2193 * If there are no more matching records in the given direction, we drop all
2194 * locks and pins, invalidate so->currPos, and return false.
2195 *
2196 * We always release the scan for a parallel scan caller, regardless of
2197 * success or failure; we'll call _bt_parallel_release as soon as possible.
2198 */
2199static bool
2201 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
2202{
2203 Relation rel = scan->indexRelation;
2204 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2205
2206 Assert(so->currPos.currPage == lastcurrblkno || seized);
2208
2209 /*
2210 * Remember that the scan already read lastcurrblkno, a page to the left
2211 * of blkno (or remember reading a page to the right, for backwards scans)
2212 */
2213 if (ScanDirectionIsForward(dir))
2214 so->currPos.moreLeft = true;
2215 else
2216 so->currPos.moreRight = true;
2217
2218 for (;;)
2219 {
2220 Page page;
2221 BTPageOpaque opaque;
2222
2223 if (blkno == P_NONE ||
2225 !so->currPos.moreRight : !so->currPos.moreLeft))
2226 {
2227 /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
2228 Assert(so->currPos.currPage == lastcurrblkno && !seized);
2230 _bt_parallel_done(scan); /* iff !so->needPrimScan */
2231 return false;
2232 }
2233
2234 Assert(!so->needPrimScan);
2235
2236 /* parallel scan must never actually visit so->currPos blkno */
2237 if (!seized && scan->parallel_scan != NULL &&
2238 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
2239 {
2240 /* whole scan is now done (or another primitive scan required) */
2242 return false;
2243 }
2244
2245 if (ScanDirectionIsForward(dir))
2246 {
2247 /* read blkno, but check for interrupts first */
2249 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2250 }
2251 else
2252 {
2253 /* read blkno, avoiding race (also checks for interrupts) */
2254 so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
2255 lastcurrblkno);
2256 if (so->currPos.buf == InvalidBuffer)
2257 {
2258 /* must have been a concurrent deletion of leftmost page */
2260 _bt_parallel_done(scan);
2261 return false;
2262 }
2263 }
2264
2265 page = BufferGetPage(so->currPos.buf);
2266 opaque = BTPageGetOpaque(page);
2267 lastcurrblkno = blkno;
2268 if (likely(!P_IGNORE(opaque)))
2269 {
2270 /* see if there are any matches on this page */
2271 if (ScanDirectionIsForward(dir))
2272 {
2273 /* note that this will clear moreRight if we can stop */
2274 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2275 break;
2276 blkno = so->currPos.nextPage;
2277 }
2278 else
2279 {
2280 /* note that this will clear moreLeft if we can stop */
2281 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2282 break;
2283 blkno = so->currPos.prevPage;
2284 }
2285 }
2286 else
2287 {
2288 /* _bt_readpage not called, so do all this for ourselves */
2289 if (ScanDirectionIsForward(dir))
2290 blkno = opaque->btpo_next;
2291 else
2292 blkno = opaque->btpo_prev;
2293 if (scan->parallel_scan != NULL)
2294 _bt_parallel_release(scan, blkno, lastcurrblkno);
2295 }
2296
2297 /* no matching tuples on this page */
2298 _bt_relbuf(rel, so->currPos.buf);
2299 seized = false; /* released by _bt_readpage (or by us) */
2300 }
2301
2302 /*
2303 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2304 * so->currPos.buf in preparation for btgettuple returning tuples.
2305 */
2306 Assert(so->currPos.currPage == blkno);
2309
2310 return true;
2311}
2312
2313/*
2314 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
2315 * recovering from concurrent page splits/page deletions when necessary
2316 *
2317 * Called during backwards scans, to deal with their unique concurrency rules.
2318 *
2319 * blkno points to the block number of the page that we expect to move the
2320 * scan to. We'll successfully move the scan there when we find that its
2321 * right sibling link still points to lastcurrblkno (the page we just read).
2322 * Otherwise, we have to figure out which page is the correct one for the scan
2323 * to now read the hard way, reasoning about concurrent splits and deletions.
2324 * See nbtree/README.
2325 *
2326 * On return, we have both a pin and a read lock on the returned page, whose
2327 * block number will be set in *blkno. Returns InvalidBuffer if there is no
2328 * page to the left (no lock or pin is held in that case).
2329 *
2330 * It is possible for the returned leaf page to be half-dead; caller must
2331 * check that condition and step left again when required.
2332 */
2333static Buffer
2335 BlockNumber lastcurrblkno)
2336{
2337 BlockNumber origblkno = *blkno; /* detects circular links */
2338
2339 for (;;)
2340 {
2341 Buffer buf;
2342 Page page;
2343 BTPageOpaque opaque;
2344 int tries;
2345
2346 /* check for interrupts while we're not holding any buffer lock */
2348 buf = _bt_getbuf(rel, *blkno, BT_READ);
2349 page = BufferGetPage(buf);
2350 opaque = BTPageGetOpaque(page);
2351
2352 /*
2353 * If this isn't the page we want, walk right till we find what we
2354 * want --- but go no more than four hops (an arbitrary limit). If we
2355 * don't find the correct page by then, the most likely bet is that
2356 * lastcurrblkno got deleted and isn't in the sibling chain at all
2357 * anymore, not that its left sibling got split more than four times.
2358 *
2359 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2360 * because half-dead pages are still in the sibling chain.
2361 */
2362 tries = 0;
2363 for (;;)
2364 {
2365 if (likely(!P_ISDELETED(opaque) &&
2366 opaque->btpo_next == lastcurrblkno))
2367 {
2368 /* Found desired page, return it */
2369 return buf;
2370 }
2371 if (P_RIGHTMOST(opaque) || ++tries > 4)
2372 break;
2373 /* step right */
2374 *blkno = opaque->btpo_next;
2375 buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2376 page = BufferGetPage(buf);
2377 opaque = BTPageGetOpaque(page);
2378 }
2379
2380 /*
2381 * Return to the original page (usually the page most recently read by
2382 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2383 * what's up with its prev sibling link
2384 */
2385 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2386 page = BufferGetPage(buf);
2387 opaque = BTPageGetOpaque(page);
2388 if (P_ISDELETED(opaque))
2389 {
2390 /*
2391 * It was deleted. Move right to first nondeleted page (there
2392 * must be one); that is the page that has acquired the deleted
2393 * one's keyspace, so stepping left from it will take us where we
2394 * want to be.
2395 */
2396 for (;;)
2397 {
2398 if (P_RIGHTMOST(opaque))
2399 elog(ERROR, "fell off the end of index \"%s\"",
2401 lastcurrblkno = opaque->btpo_next;
2402 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2403 page = BufferGetPage(buf);
2404 opaque = BTPageGetOpaque(page);
2405 if (!P_ISDELETED(opaque))
2406 break;
2407 }
2408 }
2409 else
2410 {
2411 /*
2412 * Original lastcurrblkno wasn't deleted; the explanation had
2413 * better be that the page to the left got split or deleted.
2414 * Without this check, we risk going into an infinite loop.
2415 */
2416 if (opaque->btpo_prev == origblkno)
2417 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2418 lastcurrblkno, RelationGetRelationName(rel));
2419 /* Okay to try again, since left sibling link changed */
2420 }
2421
2422 /*
2423 * Original lastcurrblkno from caller was concurrently deleted (could
2424 * also have been a great many concurrent left sibling page splits).
2425 * Found a non-deleted page that should now act as our lastcurrblkno.
2426 */
2427 if (P_LEFTMOST(opaque))
2428 {
2429 /* New lastcurrblkno has no left sibling (concurrently deleted) */
2430 _bt_relbuf(rel, buf);
2431 break;
2432 }
2433
2434 /* Start from scratch with new lastcurrblkno's blkno/prev link */
2435 *blkno = origblkno = opaque->btpo_prev;
2436 _bt_relbuf(rel, buf);
2437 }
2438
2439 return InvalidBuffer;
2440}
2441
2442/*
2443 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2444 *
2445 * If the index is empty, we will return InvalidBuffer; any other failure
2446 * condition causes ereport(). We will not return a dead page.
2447 *
2448 * The returned buffer is pinned and read-locked.
2449 */
2450Buffer
2451_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2452{
2453 Buffer buf;
2454 Page page;
2455 BTPageOpaque opaque;
2456 OffsetNumber offnum;
2457 BlockNumber blkno;
2458 IndexTuple itup;
2459
2460 /*
2461 * If we are looking for a leaf page, okay to descend from fast root;
2462 * otherwise better descend from true root. (There is no point in being
2463 * smarter about intermediate levels.)
2464 */
2465 if (level == 0)
2466 buf = _bt_getroot(rel, NULL, BT_READ);
2467 else
2468 buf = _bt_gettrueroot(rel);
2469
2470 if (!BufferIsValid(buf))
2471 return InvalidBuffer;
2472
2473 page = BufferGetPage(buf);
2474 opaque = BTPageGetOpaque(page);
2475
2476 for (;;)
2477 {
2478 /*
2479 * If we landed on a deleted page, step right to find a live page
2480 * (there must be one). Also, if we want the rightmost page, step
2481 * right if needed to get to it (this could happen if the page split
2482 * since we obtained a pointer to it).
2483 */
2484 while (P_IGNORE(opaque) ||
2485 (rightmost && !P_RIGHTMOST(opaque)))
2486 {
2487 blkno = opaque->btpo_next;
2488 if (blkno == P_NONE)
2489 elog(ERROR, "fell off the end of index \"%s\"",
2491 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2492 page = BufferGetPage(buf);
2493 opaque = BTPageGetOpaque(page);
2494 }
2495
2496 /* Done? */
2497 if (opaque->btpo_level == level)
2498 break;
2499 if (opaque->btpo_level < level)
2500 ereport(ERROR,
2501 (errcode(ERRCODE_INDEX_CORRUPTED),
2502 errmsg_internal("btree level %u not found in index \"%s\"",
2503 level, RelationGetRelationName(rel))));
2504
2505 /* Descend to leftmost or rightmost child page */
2506 if (rightmost)
2507 offnum = PageGetMaxOffsetNumber(page);
2508 else
2509 offnum = P_FIRSTDATAKEY(opaque);
2510
2511 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2512 blkno = BTreeTupleGetDownLink(itup);
2513
2514 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2515 page = BufferGetPage(buf);
2516 opaque = BTPageGetOpaque(page);
2517 }
2518
2519 return buf;
2520}
2521
2522/*
2523 * _bt_endpoint() -- Find the first or last page in the index, and scan
2524 * from there to the first key satisfying all the quals.
2525 *
2526 * This is used by _bt_first() to set up a scan when we've determined
2527 * that the scan must start at the beginning or end of the index (for
2528 * a forward or backward scan respectively).
2529 *
2530 * Parallel scan callers must have seized the scan before calling here.
2531 * Exit conditions are the same as for _bt_first().
2532 */
2533static bool
2535{
2536 Relation rel = scan->indexRelation;
2537 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2538 Page page;
2539 BTPageOpaque opaque;
2541
2543 Assert(!so->needPrimScan);
2544
2545 /*
2546 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2547 * version of _bt_search().
2548 */
2550
2551 if (!BufferIsValid(so->currPos.buf))
2552 {
2553 /*
2554 * Empty index. Lock the whole relation, as nothing finer to lock
2555 * exists.
2556 */
2558 _bt_parallel_done(scan);
2559 return false;
2560 }
2561
2562 page = BufferGetPage(so->currPos.buf);
2563 opaque = BTPageGetOpaque(page);
2564 Assert(P_ISLEAF(opaque));
2565
2566 if (ScanDirectionIsForward(dir))
2567 {
2568 /* There could be dead pages to the left, so not this: */
2569 /* Assert(P_LEFTMOST(opaque)); */
2570
2571 start = P_FIRSTDATAKEY(opaque);
2572 }
2573 else if (ScanDirectionIsBackward(dir))
2574 {
2575 Assert(P_RIGHTMOST(opaque));
2576
2578 }
2579 else
2580 {
2581 elog(ERROR, "invalid scan direction: %d", (int) dir);
2582 start = 0; /* keep compiler quiet */
2583 }
2584
2585 /*
2586 * Now load data from the first page of the scan.
2587 */
2588 if (!_bt_readfirstpage(scan, start, dir))
2589 return false;
2590
2591 _bt_returnitem(scan, so);
2592 return true;
2593}
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4898
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3724
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4866
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3985
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:396
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:347
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
PageData * Page
Definition: bufpage.h:82
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
#define RegProcedureIsValid(p)
Definition: c.h:734
#define Min(x, y)
Definition: c.h:961
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1063
#define likely(x)
Definition: c.h:332
#define MAXALIGN(LEN)
Definition: c.h:768
#define Max(x, y)
Definition: c.h:955
#define Assert(condition)
Definition: c.h:815
regproc RegProcedure
Definition: c.h:607
int32_t int32
Definition: c.h:484
#define unlikely(x)
Definition: c.h:333
uint32_t uint32
Definition: c.h:488
size_t Size
Definition: c.h:562
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode(int sqlerrcode)
Definition: elog.c:853
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
return str start
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
IndexTupleData * IndexTuple
Definition: itup.h:53
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:796
void * palloc(Size size)
Definition: mcxt.c:1317
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
void _bt_preprocess_keys(IndexScanDesc scan)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:605
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:774
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:747
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:998
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
BTStackData * BTStack
Definition: nbtree.h:744
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define P_HIKEY
Definition: nbtree.h:367
#define BTORDER_PROC
Definition: nbtree.h:712
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1015
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
#define P_NONE
Definition: nbtree.h:212
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1123
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
#define BT_READ
Definition: nbtree.h:724
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define SK_BT_DESC
Definition: nbtree.h:1122
#define P_IGNORE(opaque)
Definition: nbtree.h:225
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1021
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1009
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BT_WRITE
Definition: nbtree.h:725
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1078
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2451
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:241
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:602
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1896
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:1926
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2200
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:67
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2534
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2011
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2114
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition: nbtsearch.c:2334
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:1964
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:474
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1448
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
Definition: nbtsearch.c:1500
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:1985
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688
bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtutils.c:1627
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:172
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:2333
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:3079
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:413
bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
Definition: nbtutils.c:1782
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define INDEX_MAX_KEYS
static char * buf
Definition: pg_test_fsync.c:72
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:683
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define InvalidOid
Definition: postgres_ext.h:37
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2589
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2566
short access
Definition: preproc-type.c:36
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:517
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
ScanDirection
Definition: sdir.h:25
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
#define SK_ISNULL
Definition: skey.h:115
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
OffsetNumber stricthigh
Definition: nbtree.h:830
bool bounds_valid
Definition: nbtree.h:828
OffsetNumber low
Definition: nbtree.h:829
BTScanInsert itup_key
Definition: nbtree.h:818
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint32 btpo_level
Definition: nbtree.h:66
bool needPrimScan
Definition: nbtree.h:1045
char * markTuples
Definition: nbtree.h:1062
BTScanPosData currPos
Definition: nbtree.h:1074
char * currTuples
Definition: nbtree.h:1061
bool oppositeDirCheck
Definition: nbtree.h:1047
BTScanPosData markPos
Definition: nbtree.h:1075
ScanKey keyData
Definition: nbtree.h:1041
bool moreRight
Definition: nbtree.h:980
Buffer buf
Definition: nbtree.h:958
BlockNumber currPage
Definition: nbtree.h:961
int firstItem
Definition: nbtree.h:989
int nextTupleOffset
Definition: nbtree.h:973
BlockNumber prevPage
Definition: nbtree.h:962
BlockNumber nextPage
Definition: nbtree.h:963
bool moreLeft
Definition: nbtree.h:979
int lastItem
Definition: nbtree.h:990
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:993
int itemIndex
Definition: nbtree.h:991
ScanDirection dir
Definition: nbtree.h:967
XLogRecPtr lsn
Definition: nbtree.h:964
ItemPointerData heapTid
Definition: nbtree.h:951
LocationIndex tupleOffset
Definition: nbtree.h:953
OffsetNumber indexOffset
Definition: nbtree.h:952
BlockNumber bts_blkno
Definition: nbtree.h:739
struct BTStackData * bts_parent
Definition: nbtree.h:741
OffsetNumber bts_offset
Definition: nbtree.h:740
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:183
bool ignore_killed_tuples
Definition: relscan.h:146
IndexTuple xs_itup
Definition: relscan.h:159
Relation indexRelation
Definition: relscan.h:135
ItemPointerData xs_heaptid
Definition: relscan.h:164
struct SnapshotData * xs_snapshot
Definition: relscan.h:136
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
StrategyNumber sk_strategy
Definition: skey.h:68
AttrNumber sk_attno
Definition: skey.h:67
static ItemArray items
Definition: test_tidstore.c:48
#define IsolationIsSerializable()
Definition: xact.h:52