/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
							Buffer buf, bool forupdate, BTStack stack,
							int access);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int	_bt_binsrch_posting(BTScanInsert key, Page page,
								OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
						 OffsetNumber offnum, bool firstpage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
						 OffsetNumber offnum, IndexTuple itup);
static int	_bt_setuppostingitems(BTScanOpaque so, int itemIndex,
								  OffsetNumber offnum, const ItemPointerData *heapTid,
								  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
									   OffsetNumber offnum,
									   ItemPointer heapTid, int tupleOffset);
static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
							  ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
							 BlockNumber lastcurrblkno, ScanDirection dir,
							 bool seized);
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
										 BlockNumber lastcurrblkno);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);

/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
 * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
 */
static inline void
_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
{
	if (!so->dropPin)
	{
		/* Just drop the lock (not the pin) */
		_bt_unlockbuf(rel, so->currPos.buf);
		return;
	}

	/*
	 * Drop both the lock and the pin.
	 *
	 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
	 * when concurrent heap TID recycling by VACUUM might have taken place.
	 */
	Assert(RelationNeedsWAL(rel));
	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
	_bt_relbuf(rel, so->currPos.buf);
	so->currPos.buf = InvalidBuffer;
}
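
/*
 * Caller-side sketch (hypothetical helper, not part of the original file):
 * the expected calling pattern around _bt_drop_lock_and_maybe_pin.  On entry
 * the scan position's buffer is read-locked and pinned; afterwards, a
 * dropPin scan holds neither the lock nor the pin.
 */
#ifdef NOT_USED
static void
example_finish_readpage(Relation rel, BTScanOpaque so)
{
	/* assumes so->currPos.buf is read-locked and pinned on entry */
	Assert(BufferIsValid(so->currPos.buf));

	/* drop the lock (and, for so->dropPin scans, the pin as well) */
	_bt_drop_lock_and_maybe_pin(rel, so);
}
#endif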

/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * The returned buffer is locked according to the access parameter.
 * Additionally, access = BT_WRITE will allow an empty root page to be
 * created and returned.  When access = BT_READ, an empty index will result
 * in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode, any
 * incomplete splits encountered during the search will be finished.
 *
 * heaprel must be provided by callers that pass access = BT_WRITE, since we
 * might need to allocate a new root page for caller -- see _bt_allocbuf.
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
		   int access)
{
	BTStack		stack_in = NULL;
	int			page_access = BT_READ;

	/* heaprel must be set whenever _bt_allocbuf is reachable */
	Assert(access == BT_READ || access == BT_WRITE);
	Assert(access == BT_READ || heaprel != NULL);

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, heaprel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber child;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * downlink in its parent page (or the metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (internal pages with incomplete splits are
		 * also taken care of in _bt_getstackbuf).  But this is a good
		 * opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
							  stack_in, page_access);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = BTPageGetOpaque(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate pivot tuple on this page.  Its downlink points
		 * to the child page that we're about to descend to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
		child = BTreeTupleGetDownLink(itup);

		/*
		 * We need to save the location of the pivot tuple we chose in a new
		 * stack entry for this page/level.  If caller ends up splitting a
		 * page one level down, it usually ends up inserting a new pivot
		 * tuple/downlink immediately after the location recorded here.
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
		new_stack->bts_offset = offnum;
		new_stack->bts_parent = stack_in;

		/*
		 * Page level 1 is the lowest non-leaf level, directly above the
		 * leaves.  So, if we're on level 1 and were asked to lock the leaf
		 * page in write mode, lock the next page in write mode, because it
		 * must be a leaf.
		 */
		if (opaque->btpo_level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* drop the read lock on the page, then acquire one on its child */
		*bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/*
	 * If we're asked to lock leaf in write mode, but didn't manage to, then
	 * relock.  This should only happen when the root page is a leaf page (and
	 * the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		_bt_unlockbuf(rel, *bufP);
		_bt_lockbuf(rel, *bufP, BT_WRITE);

		/*
		 * Race -- the leaf page may have split after we dropped the read lock
		 * but before we acquired a write lock.  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
	}

	return stack_in;
}
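
/*
 * Usage sketch (hypothetical helper, assuming a ready-made insertion-type
 * scankey): descend to the leaf page that key could be on, then release the
 * leaf buffer and the parent-pointer stack.
 */
#ifdef NOT_USED
static void
example_descend_to_leaf(Relation rel, BTScanInsert key)
{
	BTStack		stack;
	Buffer		buf;

	/* BT_READ never creates a root; an empty index yields InvalidBuffer */
	stack = _bt_search(rel, NULL, key, &buf, BT_READ);

	if (BufferIsValid(buf))
	{
		/* ... examine the read-locked and pinned leaf page here ... */
		_bt_relbuf(rel, buf);	/* release lock and pin */
	}

	_bt_freestack(stack);		/* no parent-page pointers needed anymore */
}
#endif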

/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split is
 * completed.  'heaprel' and 'stack' are only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 */
static Buffer
_bt_moveright(Relation rel,
			  Relation heaprel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	Assert(!forupdate || heaprel != NULL);

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, heaprel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
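
/*
 * The cmpval trick above, in isolation (sketch only): _bt_compare returns
 * <0/0/>0, so "result >= cmpval" reads as "scankey > high key" when
 * cmpval = 1 (nextkey = false), and as "scankey >= high key" when
 * cmpval = 0 (nextkey = true).
 */
#ifdef NOT_USED
static bool
example_must_move_right(int32 compare_result, bool nextkey)
{
	int32		cmpval = nextkey ? 0 : 1;

	return compare_result >= cmpval;
}
#endif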

/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)
 *
 * On a leaf page, _bt_binsrch() returns the final result of the initial
 * positioning process that started with _bt_first's call to _bt_search.
 * We're returning a non-pivot tuple offset, so things are a little different.
 * It is possible that we'll return an offset that's either past the last
 * non-pivot slot, or (in the case of a backward scan) before the first slot.
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
			BTScanInsert key,
			Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	int32		result,
				cmpval;

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);

	/* Requesting nextkey semantics while using scantid seems nonsensical */
	Assert(!key->nextkey || key->scantid == NULL);
	/* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
	Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

	low = P_FIRSTDATAKEY(opaque);
	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot.
	 * Note this covers two cases: the page is really empty (no keys), or it
	 * contains only a high key.  The latter case is possible after vacuuming.
	 * This can never happen on an internal page, however, since they are
	 * never empty (an internal page must have at least one child).
	 */
	if (unlikely(high < low))
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key, or first
	 * key > scankey when nextkey is true.
	 *
	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
	 *
	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
	 *
	 * We can fall out when high == low.
	 */
	high++;						/* establish the loop invariant for high */

	cmpval = key->nextkey ? 0 : 1;	/* select comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}

	/*
	 * At this point we have high == low.
	 *
	 * On a leaf page we always return the first non-pivot tuple >= scan key
	 * (resp. > scan key) for forward scan callers.  For backward scans, it's
	 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
	 */
	if (P_ISLEAF(opaque))
	{
		/*
		 * In the backward scan case we're supposed to locate the last
		 * matching tuple on the leaf level -- not the first matching tuple
		 * (the last tuple will be the first one returned by the scan).
		 *
		 * At this point we've located the first non-pivot tuple immediately
		 * after the last matching tuple (which might just be maxoff + 1).
		 * Compensate by stepping back.
		 */
		if (key->backward)
			return OffsetNumberPrev(low);

		return low;
	}

	/*
	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
	 * There must be one if _bt_compare() is playing by the rules.
	 *
	 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
	 * a truncated -inf heap TID is usually enough to prevent it altogether.
	 * Even omitted scan key entries are treated as > truncated attributes.
	 *
	 * However, during backward scans _bt_compare() interprets omitted scan
	 * key attributes as == corresponding truncated -inf attributes instead.
	 * This works just like < would work here.  Under this scheme, < strategy
	 * backward scans will always directly descend to the correct leaf page.
	 * In particular, they will never incur an "extra" leaf page access with a
	 * scan key that happens to contain the same prefix of values as some
	 * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
	 * it uses a leaf page high key to "re-find" a page undergoing deletion.
	 */
	Assert(low > P_FIRSTDATAKEY(opaque));

	return OffsetNumberPrev(low);
}
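
/*
 * The same loop shape on a plain int array (sketch only, hypothetical
 * helper): with nextkey = false the loop returns the index of the first
 * element >= target; with nextkey = true, the first element > target.
 * "high" starts one past the end, so the result may equal nelems, just as
 * _bt_binsrch can return an offset past the last slot on the page.
 */
#ifdef NOT_USED
static int
example_binsrch(const int *arr, int nelems, int target, bool nextkey)
{
	int			low = 0,
				high = nelems;	/* one past the end: loop invariant */
	int			cmpval = nextkey ? 0 : 1;

	while (high > low)
	{
		int			mid = low + ((high - low) / 2);
		int			result = (target > arr[mid]) - (target < arr[mid]);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}

	return low;
}
#endif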

/*
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high,
				stricthigh;
	int32		result,
				cmpval;

	page = BufferGetPage(insertstate->buf);
	opaque = BTPageGetOpaque(page);

	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);
	Assert(insertstate->postingoff == 0);

	if (!insertstate->bounds_valid)
	{
		/* Start new binary search */
		low = P_FIRSTDATAKEY(opaque);
		high = PageGetMaxOffsetNumber(page);
	}
	else
	{
		/* Restore result of previous binary search against same page */
		low = insertstate->low;
		high = insertstate->stricthigh;
	}

	/* If there are no keys on the page, return the first available slot */
	if (unlikely(high < low))
	{
		/* Caller can't reuse bounds */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return low;
	}

	/*
	 * Binary search to find the first key on the page >= scan key.  (nextkey
	 * is always false when inserting).
	 *
	 * The loop invariant is: all slots before 'low' are < scan key, all slots
	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
	 * maintained to save additional search effort for caller.
	 *
	 * We can fall out when high == low.
	 */
	if (!insertstate->bounds_valid)
		high++;					/* establish the loop invariant for high */
	stricthigh = high;			/* high initially strictly higher */

	cmpval = 1;					/* !nextkey comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
		{
			high = mid;
			if (result != 0)
				stricthigh = high;
		}

		/*
		 * If tuple at offset located by binary search is a posting list whose
		 * TID range overlaps with caller's scantid, perform posting list
		 * binary search to set postingoff for caller.  Caller must split the
		 * posting list when postingoff is set.  This should happen
		 * infrequently.
		 */
		if (unlikely(result == 0 && key->scantid != NULL))
		{
			/*
			 * postingoff should never be set more than once per leaf page
			 * binary search.  That would mean that there are duplicate table
			 * TIDs in the index, which is never okay.  Check for that here.
			 */
			if (insertstate->postingoff != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
										 ItemPointerGetBlockNumber(key->scantid),
										 ItemPointerGetOffsetNumber(key->scantid),
										 low, stricthigh,
										 BufferGetBlockNumber(insertstate->buf),
										 RelationGetRelationName(rel))));

			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
		}
	}

	/*
	 * On a leaf page, a binary search always returns the first key >= scan
	 * key (at least in !nextkey case), which could be the last slot + 1.
	 * This is also the lower bound of cached search.
	 *
	 * stricthigh may also be the last slot + 1, which prevents caller from
	 * using bounds directly, but is still useful to us if we're called a
	 * second time with cached bounds (cached low will be < stricthigh when
	 * that happens).
	 */
	insertstate->low = low;
	insertstate->stricthigh = stricthigh;
	insertstate->bounds_valid = true;

	return low;
}
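
/*
 * Cached-bounds sketch (hypothetical helper): a repeat call against the
 * same, unmodified leaf page restarts the binary search from the remembered
 * [low, stricthigh) interval instead of from the whole page.
 */
#ifdef NOT_USED
static OffsetNumber
example_search_twice(Relation rel, BTInsertState insertstate)
{
	/* first search starts from scratch, and caches its bounds */
	insertstate->bounds_valid = false;
	(void) _bt_binsrch_insert(rel, insertstate);

	/*
	 * Provided the page wasn't modified in between, a second search may
	 * reuse the cached bounds (bounds_valid was left set to true)
	 */
	return _bt_binsrch_insert(rel, insertstate);
}
#endif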

/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns offset into posting list where caller's scantid belongs.
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
	IndexTuple	itup;
	ItemId		itemid;
	int			low,
				high,
				mid,
				res;

	/*
	 * If this isn't a posting tuple, then the index must be corrupt (if it is
	 * an ordinary non-pivot tuple then there must be an existing tuple with a
	 * heap TID that equals inserter's new heap TID/scantid).  Defensively
	 * check that tuple is a posting list tuple whose posting list range
	 * includes caller's scantid.
	 *
	 * (This is also needed because contrib/amcheck's rootdescend option needs
	 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
	 */
	itemid = PageGetItemId(page, offnum);
	itup = (IndexTuple) PageGetItem(page, itemid);
	if (!BTreeTupleIsPosting(itup))
		return 0;

	Assert(key->heapkeyspace && key->allequalimage);

	/*
	 * In the event that posting list tuple has LP_DEAD bit set, indicate this
	 * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
	 * second call to _bt_binsrch_insert() can take place when its caller has
	 * removed the dead item.
	 */
	if (ItemIdIsDead(itemid))
		return -1;

	/* "high" is past end of posting list for loop invariant */
	low = 0;
	high = BTreeTupleGetNPosting(itup);
	Assert(high >= 2);

	while (high > low)
	{
		mid = low + ((high - low) / 2);
		res = ItemPointerCompare(key->scantid,
								 BTreeTupleGetPostingN(itup, mid));

		if (res > 0)
			low = mid + 1;
		else if (res < 0)
			high = mid;
		else
			return mid;
	}

	/* Exact match not found */
	return low;
}
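
/*
 * Return-value handling sketch (hypothetical caller): -1 is the sentinel
 * for an LP_DEAD posting list tuple; a nonzero postingoff tells the caller
 * that scantid lands inside an existing posting list, which must be split
 * before the insert can proceed.
 */
#ifdef NOT_USED
static void
example_handle_postingoff(int postingoff)
{
	if (postingoff == -1)
	{
		/* dead posting list tuple: caller removes it, then searches again */
	}
	else if (postingoff != 0)
	{
		/* split the posting list tuple at postingoff before inserting */
	}
	else
	{
		/* no posting list split required */
	}
}
#endif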

/*----------
 *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with its scantid.  There generally won't be a
 * matching TID in the posting tuple, which the caller must handle
 * itself (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			BTScanInsert key,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	IndexTuple	itup;
	ItemPointer heapTid;
	ScanKey		scankey;
	int			ncmpkey;
	int			ntupatts;
	int32		result;

	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(key->heapkeyspace || key->scantid == NULL);

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	ntupatts = BTreeTupleGetNAtts(itup, rel);

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.  If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	ncmpkey = Min(ntupatts, key->keysz);
	Assert(key->heapkeyspace || ncmpkey == key->keysz);
	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
	scankey = key->scankeys;
	for (int i = 1; i <= ncmpkey; i++)
	{
		Datum		datum;
		bool		isNull;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				INVERT_COMPARE_RESULT(result);
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/*
	 * All non-truncated attributes (other than heap TID) were found to be
	 * equal.  Treat truncated attributes as minus infinity when scankey has a
	 * key attribute value that would otherwise be compared directly.
	 *
	 * Note: it doesn't matter if ntupatts includes non-key attributes;
	 * scankey won't, so explicitly excluding non-key attributes isn't
	 * necessary.
	 */
	if (key->keysz > ntupatts)
		return 1;

	/*
	 * Use the heap TID attribute and scantid to try to break the tie.  The
	 * rules are the same as any other key attribute -- only the
	 * representation differs.
	 */
	heapTid = BTreeTupleGetHeapTID(itup);
	if (key->scantid == NULL)
	{
		/*
		 * Forward scans have a scankey that is considered greater than a
		 * truncated pivot tuple if and when the scankey has equal values for
		 * attributes up to and including the least significant untruncated
		 * attribute in tuple.  Even attributes that were omitted from the
		 * scan key are considered greater than -inf truncated attributes.
		 * (See _bt_binsrch for an explanation of our backward scan behavior.)
		 *
		 * For example, if an index has the minimum two attributes (single
		 * user key attribute, plus heap TID attribute), and a page's high key
		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
		 * will not descend to the page to the left.  The search will descend
		 * right instead.  The truncated attribute in pivot tuple means that
		 * all non-pivot tuples on the page to the left are strictly < 'foo',
		 * so it isn't necessary to descend left.  In other words, search
		 * doesn't have to descend left because it isn't interested in a match
		 * that has a heap TID value of -inf.
		 *
		 * Note: the heap TID part of the test ensures that scankey is being
		 * compared to a pivot tuple with one or more truncated -inf key
		 * attributes.  The heap TID attribute is the last key attribute in
		 * every index, of course, but other than that it isn't special.
		 */
		if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
			key->heapkeyspace)
			return 1;

		/* All provided scankey arguments found to be equal */
		return 0;
	}

	/*
	 * Treat truncated heap TID as minus infinity, since scankey has a key
	 * attribute value (scantid) that would otherwise be compared directly
	 */
	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
	if (heapTid == NULL)
		return 1;

	/*
	 * Scankey must be treated as equal to a posting list tuple if its scantid
	 * value falls within the range of the posting list.  In all other cases
	 * there can only be a single heap TID value, which is compared directly
	 * with scantid.
	 */
	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
	result = ItemPointerCompare(key->scantid, heapTid);
	if (result <= 0 || !BTreeTupleIsPosting(itup))
		return result;
	else
	{
		result = ItemPointerCompare(key->scantid,
									BTreeTupleGetMaxHeapTID(itup));
		if (result > 0)
			return 1;
	}

	return 0;
}
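
/*
 * The sign-flip convention from the comparison loop, in isolation (sketch
 * only, hypothetical helper): btree support functions compute
 * cmp(index_value, scankey_arg), so reporting the result as "scankey vs.
 * index tuple" means negating it -- except for DESC columns, where the
 * stored order is itself reversed.
 */
#ifdef NOT_USED
static int32
example_orient_result(int32 raw_result, bool desc_column)
{
	if (!desc_column)
		INVERT_COMPARE_RESULT(raw_result);
	return raw_result;
}
#endif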

/*
 *	_bt_first() -- Find the first item in a scan.
 *
 * We need to be clever about the direction of scan, the search
 * conditions, and the tree ordering.  We find the first item (or,
 * if backwards scan, the last item) in the tree that satisfies the
 * qualifications in the scan key.  On success exit, data about the
 * matching tuple(s) on the page has been loaded into so->currPos.  We'll
 * drop all locks and hold onto a pin on page's buffer, except during
 * so->dropPin scans, when we drop both the lock and the pin.
 * _bt_returnitem sets the next item to return to scan on success exit.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.  so->currPos will remain invalid.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTStack		stack;
	OffsetNumber offnum;
	BTScanInsertData inskey;
	ScanKey		startKeys[INDEX_MAX_KEYS];
	ScanKeyData notnullkey;
	int			keysz = 0;
	StrategyNumber strat_total = InvalidStrategy;
	BlockNumber blkno = InvalidBlockNumber,
				lastcurrblkno;

	Assert(!BTScanPosIsValid(so->currPos));

	/*
	 * Examine the scan keys and eliminate any redundant keys; also mark the
	 * keys that must be matched to continue the scan.
	 */
	_bt_preprocess_keys(scan);

	/*
	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
	 * never be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
	{
		Assert(!so->needPrimScan);
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
	 * will likely release the parallel scan later on.
	 */
	if (scan->parallel_scan != NULL &&
		!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
		return false;

	/*
	 * Initialize the scan's arrays (if any) for the current scan direction
	 * (except when they were already set to later values as part of
	 * scheduling the primitive index scan that is now underway)
	 */
	if (so->numArrayKeys && !so->needPrimScan)
		_bt_start_array_keys(scan, dir);

	if (blkno != InvalidBlockNumber)
	{
		/*
		 * We anticipated calling _bt_search, but another worker beat us to
		 * it.  _bt_readnextpage releases the scan for us (not
		 * _bt_readfirstpage).
		 */
		Assert(scan->parallel_scan != NULL);
		Assert(!so->needPrimScan);
		Assert(blkno != P_NONE);

		if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
			return false;

		_bt_returnitem(scan, so);
		return true;
	}

	/*
	 * Count an indexscan for stats, now that we know that we'll call
	 * _bt_search/_bt_endpoint below
	 */
	pgstat_count_index_scan(rel);
	if (scan->instrument)
		scan->instrument->nsearches++;

	/*----------
	 * Examine the scan keys to discover where we need to start the scan.
	 * The selected scan keys (at most one per index column) are remembered by
	 * storing their addresses into the local startKeys[] array.  The final
	 * startKeys[] entry's strategy is set in strat_total.  (Actually, there
	 * are a couple of cases where we force a less/more restrictive strategy.)
	 *
	 * We must use the key that was marked required (in the direction opposite
	 * our own scan's) during preprocessing.  Each index attribute can only
	 * have one such required key.  In general, the keys that we use to find
	 * an initial position when scanning forwards are the same keys that end
	 * the scan on the leaf level when scanning backwards (and vice-versa).
	 *
	 * When the scan keys include cross-type operators, _bt_preprocess_keys
	 * may not be able to eliminate redundant keys; in such cases it will
	 * arbitrarily pick a usable key for each attribute (and scan direction),
	 * ensuring that there is no more than one key required in each direction.
	 * We stop considering further keys once we reach the first nonrequired
	 * key (which must come after all required keys), so this can't affect us.
	 *
	 * The required keys that we use as starting boundaries have to be =, >,
	 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
	 * We can use keys for multiple attributes so long as the prior attributes
	 * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
	 * rules that preprocessing used to determine which keys to mark required.
	 * We cannot always use every required key as a positioning key, though.
	 * Skip arrays necessitate independently applying our own rules here.
	 * Skip arrays are always generally considered = array keys, but we'll
	 * nevertheless treat them as inequalities at certain points of the scan.
	 * When that happens, it _might_ have implications for the number of
	 * required keys that we can safely use for initial positioning purposes.
	 *
	 * For example, a forward scan with a skip array on its leading attribute
	 * (with no low_compare/high_compare) will have at least two required scan
	 * keys, but we won't use any of them as boundary keys during the scan's
	 * initial call here.  Our positioning key during the first call here can
	 * be thought of as representing "> -infinity".  Similarly, if such a skip
	 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
	 * during the scan's initial call here; a lower-order key such as "b = 42"
	 * can't be used until the "a" array advances beyond MINVAL/low_compare.
	 *
	 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
	 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
	 * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
	 * that we treat = and >= as equivalent when scanning forwards (just as we
	 * treat = and <= as equivalent when scanning backwards).  We effectively
	 * do the same thing (though with a distinct "a" element/value) each time.
	 *
	 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
	 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
	 * If the index stores nulls at the end of the index we'll be starting
	 * from, and we have no boundary key for the column (which means the key
	 * we deduced NOT NULL from is an inequality key that constrains the other
	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
	 * use as a boundary key.  If we didn't do this, we might find ourselves
	 * traversing a lot of null entries at the start of the scan.
	 *
	 * In this loop, row-comparison keys are treated the same as keys on their
	 * first (leftmost) columns.  We'll add all lower-order columns of the row
	 * comparison that were marked required during preprocessing below.
	 *
	 * _bt_advance_array_keys needs to know exactly how we'll reposition the
	 * scan (should it opt to schedule another primitive index scan).  It is
	 * critical that primscans only be scheduled when they'll definitely make
	 * some useful progress.  _bt_advance_array_keys does this by calling
	 * _bt_checkkeys routines that report whether a tuple is past the end of
	 * matches for the scan's keys (given the scan's current array elements).
	 * If the page's final tuple is "after the end of matches" for a scan that
	 * uses the *opposite* scan direction, then it must follow that it's also
	 * "before the start of matches" for the actual current scan direction.
	 * It is therefore essential that all of our initial positioning rules are
	 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
	 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
	 * need to be kept in sync.
	 *----------
	 */
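	/*
	 * Worked example (illustration): for a forward scan with "a >= 1 AND
	 * a < 10 AND b = 42", the loop below selects "a >= 1" and "b = 42" as
	 * startKeys[] (an = key may follow a >= key), leaving strat_total as
	 * "=".  "a < 10" only ever terminates the scan; it never positions it.
	 */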
	if (so->numberOfKeys > 0)
	{
		AttrNumber	curattr;
		ScanKey		bkey;
		ScanKey		impliesNN;
		ScanKey		cur;

		/*
		 * bkey will be set to the key that preprocessing left behind as the
		 * boundary key for this attribute, in this scan direction (if any)
		 */
		cur = so->keyData;
		curattr = 1;
		bkey = NULL;
		/* Also remember any scankey that implies a NOT NULL constraint */
		impliesNN = NULL;

		/*
		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
		 * pass to handle after-last-key processing.  Actual exit from the
		 * loop is at one of the "break" statements below.
		 */
		for (int i = 0;; cur++, i++)
		{
			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
			{
				/* Done looking for the curattr boundary key */
				Assert(bkey == NULL ||
					   (bkey->sk_attno == curattr &&
						(bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
				Assert(impliesNN == NULL ||
					   (impliesNN->sk_attno == curattr &&
						(impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));

				/*
				 * If this is a scan key for a skip array whose current
				 * element is MINVAL, choose low_compare (when scanning
				 * backwards it'll be MAXVAL, and we'll choose high_compare).
				 *
				 * Note: if the array's low_compare key makes 'bkey' NULL,
				 * then we behave as if the array's first element is -inf,
				 * except when !array->null_elem implies a usable NOT NULL
				 * constraint.
				 */
				if (bkey != NULL &&
					(bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
				{
					int			ikey = bkey - so->keyData;
					ScanKey		skipequalitykey = bkey;
					BTArrayKeyInfo *array = NULL;

					for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
					{
						array = &so->arrayKeys[arridx];
						if (array->scan_key == ikey)
							break;
					}

					if (ScanDirectionIsForward(dir))
					{
						Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
						bkey = array->low_compare;
					}
					else
					{
						Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
						bkey = array->high_compare;
					}

					Assert(bkey == NULL ||
						   bkey->sk_attno == skipequalitykey->sk_attno);

					if (!array->null_elem)
						impliesNN = skipequalitykey;
					else
						Assert(bkey == NULL && impliesNN == NULL);
				}

				/*
				 * If we didn't find a usable boundary key, see if we can
				 * deduce a NOT NULL key
				 */
				if (bkey == NULL && impliesNN != NULL &&
					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
					 ScanDirectionIsForward(dir) :
					 ScanDirectionIsBackward(dir)))
				{
					/* Final startKeys[] entry will be deduced NOT NULL key */
					bkey = &notnullkey;
					ScanKeyEntryInitialize(bkey,
										   (SK_SEARCHNOTNULL | SK_ISNULL |
											(impliesNN->sk_flags &
											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
										   curattr,
										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
											BTGreaterStrategyNumber :
											BTLessStrategyNumber),
										   InvalidOid,
										   InvalidOid,
										   InvalidOid,
										   (Datum) 0);
				}

				/*
				 * If preprocessing didn't leave a usable boundary key, quit;
				 * else save the boundary key pointer in startKeys[]
				 */
				if (bkey == NULL)
					break;
				startKeys[keysz++] = bkey;

				/*
				 * We can only consider adding more boundary keys when the one
				 * that we just chose to add uses either the = or >= strategy
				 * (during backwards scans we can only do so when the key that
				 * we just added to startKeys[] uses the = or <= strategy)
				 */
				strat_total = bkey->sk_strategy;
				if (strat_total == BTGreaterStrategyNumber ||
					strat_total == BTLessStrategyNumber)
					break;

				/*
				 * If the key that we just added to startKeys[] is a skip
				 * array = key whose current element is marked NEXT or PRIOR,
				 * make strat_total > or < (and stop adding boundary keys).
				 * This can only happen with opclasses that lack skip support.
				 */
				if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
				{
					Assert(bkey->sk_flags & SK_BT_SKIP);
					Assert(strat_total == BTEqualStrategyNumber);

					if (ScanDirectionIsForward(dir))
					{
						Assert(!(bkey->sk_flags & SK_BT_PRIOR));
						strat_total = BTGreaterStrategyNumber;
					}
					else
					{
						Assert(!(bkey->sk_flags & SK_BT_NEXT));
						strat_total = BTLessStrategyNumber;
					}

					/*
					 * We're done.  We'll never find an exact = match for a
					 * NEXT or PRIOR sentinel sk_argument value.  There's no
					 * sense in trying to add more keys to startKeys[].
					 */
					break;
				}

				/*
				 * Done if that was the last scan key output by preprocessing.
				 * Also done if we've now examined all keys marked required.
				 */
				if (i >= so->numberOfKeys ||
					!(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
					break;

				/*
				 * Reset for next attr.
				 */
				Assert(cur->sk_attno == curattr + 1);
				curattr = cur->sk_attno;
				bkey = NULL;
				impliesNN = NULL;
			}

			/*
			 * If we've located the starting boundary key for curattr, we have
			 * no interest in curattr's other required key
			 */
			if (bkey != NULL)
				continue;

			/*
			 * Is this key the starting boundary key for curattr?
			 *
			 * If not, does it imply a NOT NULL constraint?  (Because
			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
			 * *any* inequality key works for that; we need not test.)
			 */
			switch (cur->sk_strategy)
			{
				case BTLessStrategyNumber:
				case BTLessEqualStrategyNumber:
					if (ScanDirectionIsBackward(dir))
						bkey = cur;
					else if (impliesNN == NULL)
						impliesNN = cur;
					break;
				case BTEqualStrategyNumber:
					bkey = cur;
					break;
				case BTGreaterEqualStrategyNumber:
				case BTGreaterStrategyNumber:
					if (ScanDirectionIsForward(dir))
						bkey = cur;
					else if (impliesNN == NULL)
						impliesNN = cur;
					break;
			}
		}
	}

	/*
	 * If we found no usable boundary keys, we have to start from one end of
	 * the tree.  Walk down that edge to the first or last key, and scan from
	 * there.
	 *
	 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
	 */
	if (keysz == 0)
		return _bt_endpoint(scan, dir);

	/*
	 * We want to start the scan somewhere within the index.  Set up an
	 * insertion scankey we can use to search for the boundary point we
	 * identified above.  The insertion scankey is built using the keys
	 * identified by startKeys[].  (Remaining insertion scankey fields are
	 * initialized after initial-positioning scan keys are finalized.)
	 */
	Assert(keysz <= INDEX_MAX_KEYS);
	for (int i = 0; i < keysz; i++)
	{
		ScanKey		bkey = startKeys[i];

		Assert(bkey->sk_attno == i + 1);

		if (bkey->sk_flags & SK_ROW_HEADER)
		{
			/*
			 * Row comparison header: look to the first row member instead
			 */
			ScanKey		subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
			bool		loosen_strat = false,
						tighten_strat = false;

			/*
			 * Cannot be a NULL in the first row member: _bt_preprocess_keys
			 * would've marked the qual as unsatisfiable, preventing us from
			 * ever getting this far
			 */
			Assert(subkey->sk_flags & SK_ROW_MEMBER);
			Assert(subkey->sk_attno == bkey->sk_attno);
			Assert(!(subkey->sk_flags & SK_ISNULL));

			/*
			 * This is either a > or >= key (during backwards scans it is
			 * either < or <=) that was marked required during preprocessing.
			 * Later so->keyData[] keys can't have been marked required, so
			 * our row compare header key must be the final startKeys[] entry.
			 */
			Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
			Assert(subkey->sk_strategy == bkey->sk_strategy);
			Assert(subkey->sk_strategy == strat_total);
			Assert(i == keysz - 1);

			/*
			 * The member scankeys are already in insertion format (ie, they
			 * have sk_func = 3-way-comparison function)
			 */
			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

			/*
			 * Now look to later row compare members.
			 *
			 * If there's an "index attribute gap" between two row compare
			 * members, the second member won't have been marked required, and
			 * so can't be used as a starting boundary key here.  The part of
			 * the row comparison that we do still use has to be treated as a
			 * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
			 * with an omitted intervening index attribute "b" will use an
			 * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
			 * the leaf level might satisfy the row compare qual.
			 *
			 * We're able to use a _more_ restrictive strategy when we reach a
			 * NULL row compare member, since they're always unsatisfiable.
			 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
			 * insertion scan key "a > 1".  All tuples where "a = 1" cannot
			 * possibly satisfy the row compare qual, so this is safe.
			 */
			Assert(!(subkey->sk_flags & SK_ROW_END));
			for (;;)
			{
				subkey++;
				Assert(subkey->sk_flags & SK_ROW_MEMBER);

				if (subkey->sk_flags & SK_ISNULL)
				{
					/*
					 * NULL member key, can only use earlier keys.
					 *
					 * We deliberately avoid checking if this key is marked
					 * required.  All earlier keys are required, and this key
					 * is unsatisfiable either way, so we can't miss anything.
					 */
					tighten_strat = true;
					break;
				}

				if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
				{
					/* nonrequired member key, can only use earlier keys */
					loosen_strat = true;
					break;
				}

				Assert(subkey->sk_attno == keysz + 1);
				Assert(subkey->sk_strategy == bkey->sk_strategy);
				Assert(keysz < INDEX_MAX_KEYS);

				memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
				keysz++;

				if (subkey->sk_flags & SK_ROW_END)
					break;
			}
			Assert(!(loosen_strat && tighten_strat));
			if (loosen_strat)
			{
				/* Use less restrictive strategy (and fewer member keys) */
				switch (strat_total)
				{
					case BTLessStrategyNumber:
						strat_total = BTLessEqualStrategyNumber;
						break;
					case BTGreaterStrategyNumber:
						strat_total = BTGreaterEqualStrategyNumber;
						break;
				}
			}
			if (tighten_strat)
			{
				/* Use more restrictive strategy (and fewer member keys) */
				switch (strat_total)
				{
					case BTLessEqualStrategyNumber:
						strat_total = BTLessStrategyNumber;
						break;
					case BTGreaterEqualStrategyNumber:
						strat_total = BTGreaterStrategyNumber;
						break;
				}
			}

			/* Done (row compare header key is always last startKeys[] key) */
			break;
		}

		/*
		 * Ordinary comparison key/search-style key.
		 *
		 * Transform the search-style scan key to an insertion scan key by
		 * replacing the sk_func with the appropriate btree 3-way-comparison
		 * function.
		 *
		 * If scankey operator is not a cross-type comparison, we can use the
		 * cached comparison function; otherwise gotta look it up in the
		 * catalogs.  (That can't lead to infinite recursion, since no
		 * indexscan initiated by syscache lookup will use cross-data-type
		 * operators.)
		 *
		 * We support the convention that sk_subtype == InvalidOid means the
		 * opclass input type; this hack simplifies life for ScanKeyInit().
		 */
		if (bkey->sk_subtype == rel->rd_opcintype[i] ||
			bkey->sk_subtype == InvalidOid)
		{
			FmgrInfo   *procinfo;

			procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
			ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
										   bkey->sk_flags,
										   bkey->sk_attno,
										   InvalidStrategy,
										   bkey->sk_subtype,
										   bkey->sk_collation,
										   procinfo,
										   bkey->sk_argument);
		}
		else
		{
			RegProcedure cmp_proc;

			cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
										 rel->rd_opcintype[i],
										 bkey->sk_subtype, BTORDER_PROC);
			if (!RegProcedureIsValid(cmp_proc))
				elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
					 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
					 bkey->sk_attno, RelationGetRelationName(rel));
			ScanKeyEntryInitialize(inskey.scankeys + i,
								   bkey->sk_flags,
								   bkey->sk_attno,
								   InvalidStrategy,
								   bkey->sk_subtype,
								   bkey->sk_collation,
								   cmp_proc,
								   bkey->sk_argument);
		}
	}

	/*----------
	 * Examine the selected initial-positioning strategy to determine exactly
	 * where we need to start the scan, and set flag variables to control the
	 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
	 * page _bt_search returns).
	 *----------
	 */
	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
	inskey.anynullkeys = false; /* unused */
	inskey.scantid = NULL;
	inskey.keysz = keysz;
	switch (strat_total)
	{
		case BTLessStrategyNumber:

			inskey.nextkey = false;
			inskey.backward = true;
			break;

		case BTLessEqualStrategyNumber:

			inskey.nextkey = true;
			inskey.backward = true;
			break;

		case BTEqualStrategyNumber:

			/*
			 * If a backward scan was specified, need to start with last equal
			 * item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/*
				 * This is the same as the <= strategy
				 */
				inskey.nextkey = true;
				inskey.backward = true;
			}
			else
			{
				/*
				 * This is the same as the >= strategy
				 */
				inskey.nextkey = false;
				inskey.backward = false;
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * Find first item >= scankey
			 */
			inskey.nextkey = false;
			inskey.backward = false;
			break;

		case BTGreaterStrategyNumber:

			/*
			 * Find first item > scankey
			 */
			inskey.nextkey = true;
			inskey.backward = false;
			break;

		default:
			/* can't get here, but keep compiler quiet */
			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
			return false;
	}
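
	/*
	 * For example (illustration): a forward scan positioned by "a > 5" has
	 * strat_total = ">", so we get inskey.nextkey = true and
	 * inskey.backward = false, and the descent below lands on the first
	 * leaf tuple > 5.
	 */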

	/*
	 * Use the manufactured insertion scan key to descend the tree and
	 * position ourselves on the target leaf page.
	 */
	Assert(ScanDirectionIsBackward(dir) == inskey.backward);
	stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(so->currPos.buf))
	{
		Assert(!so->needPrimScan);

		/*
		 * We only get here if the index is completely empty.  Lock relation
		 * because nothing finer to lock exists.  Without a buffer lock, it's
		 * possible for another transaction to insert data between
		 * _bt_search() and PredicateLockRelation().  We have to try again
		 * after taking the relation-level predicate lock, to close a narrow
		 * window where we wouldn't scan concurrently inserted tuples, but the
		 * writer wouldn't see our predicate lock.
		 */
		if (IsolationIsSerializable())
		{
			PredicateLockRelation(rel, scan->xs_snapshot);
			stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
			_bt_freestack(stack);
		}

		if (!BufferIsValid(so->currPos.buf))
		{
			_bt_parallel_done(scan);
			return false;
		}
	}

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);

	/*
	 * Now load data from the first page of the scan (usually the page
	 * currently in so->currPos.buf).
	 *
	 * If inskey.nextkey = false and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple >= inskey.scankeys.
	 *
	 * If inskey.nextkey = false and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple < inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple > inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple <= inskey.scankeys.
	 *
	 * It's possible that _bt_binsrch returned an offnum that is out of bounds
	 * for the page.  For example, when inskey is both < the leaf page's high
	 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
	 */
	if (!_bt_readfirstpage(scan, offnum, dir))
		return false;

	_bt_returnitem(scan, so);
	return true;
}

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 * On entry, so->currPos describes the current page, which may be pinned
 * but is not locked, and so->currPos.itemIndex identifies which item was
 * previously returned.
 *
 * On success exit, so->currPos is updated as needed, and _bt_returnitem
 * sets the next item to return to the scan.  so->currPos remains valid.
 *
 * On failure exit (no more tuples), we invalidate so->currPos.  It'll
 * still be possible for the scan to return tuples by changing direction,
 * though we'll need to call _bt_first anew in that other direction.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	Assert(BTScanPosIsValid(so->currPos));

	/*
	 * Advance to next tuple on current page; or if there's no more, try to
	 * step to the next page with data.
	 */
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}

	_bt_returnitem(scan, so);
	return true;
}
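
/*
 * Scan-loop sketch (hypothetical helper, simplified from the btgettuple
 * pattern): the first call in a given direction positions the scan with
 * _bt_first; subsequent calls just advance with _bt_next.
 */
#ifdef NOT_USED
static bool
example_get_next_match(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (!BTScanPosIsValid(so->currPos))
		return _bt_first(scan, dir);	/* descend and read the first page */

	return _bt_next(scan, dir);	/* continue within/beyond so->currPos */
}
#endif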
1625
1626/*
1627 * _bt_readpage() -- Load data from current index page into so->currPos
1628 *
1629 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1630 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1631 * they are updated as appropriate. All other fields of so->currPos are
1632 * initialized from scratch here.
1633 *
1634 * We scan the current page starting at offnum and moving in the indicated
1635 * direction. All items matching the scan keys are loaded into currPos.items.
1636 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1637 * that there can be no more matching tuples in the current scan direction
1638 * (could just be for the current primitive index scan when scan has arrays).
1639 *
1640 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1641 * prior to calling this function; this function will invoke
1642 * _bt_parallel_release before returning.
1643 *
1644 * Returns true if any matching items found on the page, false if none.
1645 */
1646static bool
1648 bool firstpage)
1649{
1650 Relation rel = scan->indexRelation;
1651 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1652 Page page;
1653 BTPageOpaque opaque;
1654 OffsetNumber minoff;
1655 OffsetNumber maxoff;
1656 BTReadPageState pstate;
1657 bool arrayKeys;
1658 int itemIndex,
1659 indnatts;
1660
1661 /* save the page/buffer block number, along with its sibling links */
1662 page = BufferGetPage(so->currPos.buf);
1663 opaque = BTPageGetOpaque(page);
1665 so->currPos.prevPage = opaque->btpo_prev;
1666 so->currPos.nextPage = opaque->btpo_next;
1667 /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
1668 so->currPos.dir = dir;
1669 so->currPos.nextTupleOffset = 0;
1670
1671 /* either moreRight or moreLeft should be set now (may be unset later) */
1673 so->currPos.moreLeft);
1674 Assert(!P_IGNORE(opaque));
1676 Assert(!so->needPrimScan);
1677
1678 if (scan->parallel_scan)
1679 {
1680 /* allow next/prev page to be read by other worker without delay */
1681 if (ScanDirectionIsForward(dir))
1683 so->currPos.currPage);
1684 else
1686 so->currPos.currPage);
1687 }
1688
1690
1691 /* initialize local variables */
1692 indnatts = IndexRelationGetNumberOfAttributes(rel);
1693 arrayKeys = so->numArrayKeys != 0;
1694 minoff = P_FIRSTDATAKEY(opaque);
1695 maxoff = PageGetMaxOffsetNumber(page);
1696
1697 /* initialize page-level state that we'll pass to _bt_checkkeys */
1698 pstate.minoff = minoff;
1699 pstate.maxoff = maxoff;
1700 pstate.finaltup = NULL;
1701 pstate.page = page;
1702 pstate.firstpage = firstpage;
1703 pstate.forcenonrequired = false;
1704 pstate.startikey = 0;
1705 pstate.offnum = InvalidOffsetNumber;
1706 pstate.skip = InvalidOffsetNumber;
1707 pstate.continuescan = true; /* default assumption */
1708 pstate.rechecks = 0;
1709 pstate.targetdistance = 0;
1710 pstate.nskipadvances = 0;
1711
1712 if (ScanDirectionIsForward(dir))
1713 {
1714 /* SK_SEARCHARRAY forward scans must provide high key up front */
1715 if (arrayKeys)
1716 {
1717 if (!P_RIGHTMOST(opaque))
1718 {
1719 ItemId iid = PageGetItemId(page, P_HIKEY);
1720
1721 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1722
1723 if (so->scanBehind &&
1724 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1725 {
1726 /* Schedule another primitive index scan after all */
1727 so->currPos.moreRight = false;
1728 so->needPrimScan = true;
1729 if (scan->parallel_scan)
1730 _bt_parallel_primscan_schedule(scan,
1731 so->currPos.currPage);
1732 return false;
1733 }
1734 }
1735
1736 so->scanBehind = so->oppositeDirCheck = false; /* reset */
1737 }
1738
1739 /*
1740 * Consider pstate.startikey optimization once the ongoing primitive
1741 * index scan has already read at least one page
1742 */
1743 if (!pstate.firstpage && minoff < maxoff)
1744 _bt_set_startikey(scan, &pstate);
1745
1746 /* load items[] in ascending order */
1747 itemIndex = 0;
1748
1749 offnum = Max(offnum, minoff);
1750
1751 while (offnum <= maxoff)
1752 {
1753 ItemId iid = PageGetItemId(page, offnum);
1754 IndexTuple itup;
1755 bool passes_quals;
1756
1757 /*
1758 * If the scan specifies not to return killed tuples, then we
1759 * treat a killed tuple as not passing the qual
1760 */
1761 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1762 {
1763 offnum = OffsetNumberNext(offnum);
1764 continue;
1765 }
1766
1767 itup = (IndexTuple) PageGetItem(page, iid);
1768 Assert(!BTreeTupleIsPivot(itup));
1769
1770 pstate.offnum = offnum;
1771 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1772 itup, indnatts);
1773
1774 /*
1775 * Check if we need to skip ahead to a later tuple (only possible
1776 * when the scan uses array keys)
1777 */
1778 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1779 {
1780 Assert(!passes_quals && pstate.continuescan);
1781 Assert(offnum < pstate.skip);
1782 Assert(!pstate.forcenonrequired);
1783
1784 offnum = pstate.skip;
1785 pstate.skip = InvalidOffsetNumber;
1786 continue;
1787 }
1788
1789 if (passes_quals)
1790 {
1791 /* tuple passes all scan key conditions */
1792 if (!BTreeTupleIsPosting(itup))
1793 {
1794 /* Remember it */
1795 _bt_saveitem(so, itemIndex, offnum, itup);
1796 itemIndex++;
1797 }
1798 else
1799 {
1800 int tupleOffset;
1801
1802 /*
1803 * Set up state to return posting list, and remember first
1804 * TID
1805 */
1806 tupleOffset =
1807 _bt_setuppostingitems(so, itemIndex, offnum,
1808 BTreeTupleGetPostingN(itup, 0),
1809 itup);
1810 itemIndex++;
1811 /* Remember additional TIDs */
1812 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1813 {
1814 _bt_savepostingitem(so, itemIndex, offnum,
1815 BTreeTupleGetPostingN(itup, i),
1816 tupleOffset);
1817 itemIndex++;
1818 }
1819 }
1820 }
1821 /* When !continuescan, there can't be any more matches, so stop */
1822 if (!pstate.continuescan)
1823 break;
1824
1825 offnum = OffsetNumberNext(offnum);
1826 }
1827
1828 /*
1829 * We don't need to visit the page to the right when the high key
1830 * indicates that no more matches will be found there.
1831 *
1832 * Checking the high key like this works out more often than you might
1833 * think. Leaf page splits pick a split point between the two most
1834 * dissimilar tuples (this is weighed against the need to evenly share
1835 * free space). Leaf pages with high key attribute values that can
1836 * only appear on non-pivot tuples on the right sibling page are
1837 * common.
1838 */
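 /*
 * Editor's example (hypothetical values): with a qual like "col <= 'b'",
 * every tuple on this page can pass while continuescan stays true; if the
 * high key is 'c', the _bt_checkkeys call below proves that the right
 * sibling holds only keys >= 'c', clearing continuescan so that moreRight
 * is unset and the right sibling is never visited.
 */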
1839 if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
1840 {
1841 ItemId iid = PageGetItemId(page, P_HIKEY);
1842 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1843 int truncatt;
1844
1845 /* Reset arrays, per _bt_set_startikey contract */
1846 if (pstate.forcenonrequired)
1847 _bt_start_array_keys(scan, dir);
1848 pstate.forcenonrequired = false;
1849 pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
1850
1851 truncatt = BTreeTupleGetNAtts(itup, rel);
1852 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1853 }
1854
1855 if (!pstate.continuescan)
1856 so->currPos.moreRight = false;
1857
1858 Assert(itemIndex <= MaxTIDsPerBTreePage);
1859 so->currPos.firstItem = 0;
1860 so->currPos.lastItem = itemIndex - 1;
1861 so->currPos.itemIndex = 0;
1862 }
1863 else
1864 {
1865 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1866 if (arrayKeys)
1867 {
1868 if (minoff <= maxoff && !P_LEFTMOST(opaque))
1869 {
1870 ItemId iid = PageGetItemId(page, minoff);
1871
1872 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1873
1874 if (so->scanBehind &&
1875 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1876 {
1877 /* Schedule another primitive index scan after all */
1878 so->currPos.moreLeft = false;
1879 so->needPrimScan = true;
1880 if (scan->parallel_scan)
1881 _bt_parallel_primscan_schedule(scan,
1882 so->currPos.currPage);
1883 return false;
1884 }
1885 }
1886
1887 so->scanBehind = so->oppositeDirCheck = false; /* reset */
1888 }
1889
1890 /*
1891 * Consider pstate.startikey optimization once the ongoing primitive
1892 * index scan has already read at least one page
1893 */
1894 if (!pstate.firstpage && minoff < maxoff)
1895 _bt_set_startikey(scan, &pstate);
1896
1897 /* load items[] in descending order */
1898 itemIndex = MaxTIDsPerBTreePage;
1899
1900 offnum = Min(offnum, maxoff);
1901
1902 while (offnum >= minoff)
1903 {
1904 ItemId iid = PageGetItemId(page, offnum);
1905 IndexTuple itup;
1906 bool tuple_alive;
1907 bool passes_quals;
1908
1909 /*
1910 * If the scan specifies not to return killed tuples, then we
1911 * treat a killed tuple as not passing the qual. Most of the
1912 * time, it's a win to not bother examining the tuple's index
1913 * keys, but just skip to the next tuple (previous, actually,
1914 * since we're scanning backwards). However, if this is the first
1915 * tuple on the page, we do check the index keys, to prevent
1916 * uselessly advancing to the page to the left. This is similar
1917 * to the high key optimization used by forward scans.
1918 */
1919 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1920 {
1921 if (offnum > minoff)
1922 {
1923 offnum = OffsetNumberPrev(offnum);
1924 continue;
1925 }
1926
1927 tuple_alive = false;
1928 }
1929 else
1930 tuple_alive = true;
1931
1932 itup = (IndexTuple) PageGetItem(page, iid);
1933 Assert(!BTreeTupleIsPivot(itup));
1934
1935 pstate.offnum = offnum;
1936 if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
1937 {
1938 /* Reset arrays, per _bt_set_startikey contract */
1939 pstate.forcenonrequired = false;
1940 pstate.startikey = 0;
1941 _bt_start_array_keys(scan, dir);
1942 }
1943 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1944 itup, indnatts);
1945
1946 if (arrayKeys && so->scanBehind)
1947 {
1948 /*
1949 * Done scanning this page, but not done with the current
1950 * primscan.
1951 *
1952 * Note: Forward scans don't check this explicitly, since they
1953 * prefer to reuse pstate.skip for this instead.
1954 */
1955 Assert(!passes_quals && pstate.continuescan);
1956 Assert(!pstate.forcenonrequired);
1957
1958 break;
1959 }
1960
1961 /*
1962 * Check if we need to skip ahead to a later tuple (only possible
1963 * when the scan uses array keys)
1964 */
1965 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1966 {
1967 Assert(!passes_quals && pstate.continuescan);
1968 Assert(offnum > pstate.skip);
1969 Assert(!pstate.forcenonrequired);
1970
1971 offnum = pstate.skip;
1972 pstate.skip = InvalidOffsetNumber;
1973 continue;
1974 }
1975
1976 if (passes_quals && tuple_alive)
1977 {
1978 /* tuple passes all scan key conditions */
1979 if (!BTreeTupleIsPosting(itup))
1980 {
1981 /* Remember it */
1982 itemIndex--;
1983 _bt_saveitem(so, itemIndex, offnum, itup);
1984 }
1985 else
1986 {
1987 int tupleOffset;
1988
1989 /*
1990 * Set up state to return posting list, and remember first
1991 * TID.
1992 *
1993 * Note that we deliberately save/return items from
1994 * posting lists in ascending heap TID order for backwards
1995 * scans. This allows _bt_killitems() to make a
1996 * consistent assumption about the order of items
1997 * associated with the same posting list tuple.
1998 */
1999 itemIndex--;
2000 tupleOffset =
2001 _bt_setuppostingitems(so, itemIndex, offnum,
2002 BTreeTupleGetPostingN(itup, 0),
2003 itup);
2004 /* Remember additional TIDs */
2005 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
2006 {
2007 itemIndex--;
2008 _bt_savepostingitem(so, itemIndex, offnum,
2009 BTreeTupleGetPostingN(itup, i),
2010 tupleOffset);
2011 }
2012 }
2013 }
2014 /* When !continuescan, there can't be any more matches, so stop */
2015 if (!pstate.continuescan)
2016 break;
2017
2018 offnum = OffsetNumberPrev(offnum);
2019 }
2020
2021 /*
2022 * We don't need to visit the page to the left when no more matches
2023 * will be found there
2024 */
2025 if (!pstate.continuescan)
2026 so->currPos.moreLeft = false;
2027
2028 Assert(itemIndex >= 0);
2029 so->currPos.firstItem = itemIndex;
2030 so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
2031 so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
2032 }
2033
2034 /*
2035 * If _bt_set_startikey told us to temporarily treat the scan's keys as
2036 * nonrequired (possible only during scans with array keys), there must be
2037 * no lasting consequences for the scan's array keys. The scan's arrays
2038 * should now have exactly the same elements as they would have had if the
2039 * nonrequired behavior had never been used. (In general, a scan's arrays
2040 * are expected to track its progress through the index's key space.)
2041 *
2042 * We are required (by _bt_set_startikey) to call _bt_checkkeys against
2043 * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
2044 * arrays to recover. Assert that that step hasn't been missed.
2045 */
2046 Assert(!pstate.forcenonrequired);
2047
2048 return (so->currPos.firstItem <= so->currPos.lastItem);
2049}
2050
2051/* Save an index item into so->currPos.items[itemIndex] */
2052static void
2053_bt_saveitem(BTScanOpaque so, int itemIndex,
2054 OffsetNumber offnum, IndexTuple itup)
2055{
2056 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2057
2058 Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
2059
2060 currItem->heapTid = itup->t_tid;
2061 currItem->indexOffset = offnum;
2062 if (so->currTuples)
2063 {
2064 Size itupsz = IndexTupleSize(itup);
2065
2066 currItem->tupleOffset = so->currPos.nextTupleOffset;
2067 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
2068 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
2069 }
2070}
2071
2072/*
2073 * Setup state to save TIDs/items from a single posting list tuple.
2074 *
2075 * Saves an index item into so->currPos.items[itemIndex] for the TID that
2076 * is returned to the scan first. Second and subsequent TIDs for the
2077 * posting list should be saved by calling _bt_savepostingitem().
2078 *
2079 * Returns the offset into the tuple storage space at which the main tuple
2080 * is stored, when tuple storage is in use (otherwise returns 0).
2081 */
2082static int
2083_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2084 const ItemPointerData *heapTid, IndexTuple itup)
2085{
2086 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2087
2088 Assert(BTreeTupleIsPosting(itup));
2089
2090 currItem->heapTid = *heapTid;
2091 currItem->indexOffset = offnum;
2092 if (so->currTuples)
2093 {
2094 /* Save base IndexTuple (truncate posting list) */
2095 IndexTuple base;
2096 Size itupsz = BTreeTupleGetPostingOffset(itup);
2097
2098 itupsz = MAXALIGN(itupsz);
2099 currItem->tupleOffset = so->currPos.nextTupleOffset;
2100 base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2101 memcpy(base, itup, itupsz);
2102 /* Defensively reduce work area index tuple header size */
2103 base->t_info &= ~INDEX_SIZE_MASK;
2104 base->t_info |= itupsz;
2105 so->currPos.nextTupleOffset += itupsz;
2106
2107 return currItem->tupleOffset;
2108 }
2109
2110 return 0;
2111}
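/*
 * Editor's sketch (not part of nbtsearch.c): the calling pattern for one
 * posting list tuple with several TIDs, as used by _bt_readpage above.
 * Assumes so, itemIndex, offnum, and itup are in scope.
 *
 *	off = _bt_setuppostingitems(so, itemIndex++, offnum,
 *								BTreeTupleGetPostingN(itup, 0), itup);
 *	for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
 *		_bt_savepostingitem(so, itemIndex++, offnum,
 *							BTreeTupleGetPostingN(itup, i), off);
 */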
2112
2113/*
2114 * Save an index item into so->currPos.items[itemIndex] for the current
2115 * posting list tuple.
2116 *
2117 * Assumes that _bt_setuppostingitems() has already been called for the
2118 * current posting list tuple. Caller passes its return value as tupleOffset.
2119 */
2120static inline void
2121_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2122 ItemPointer heapTid, int tupleOffset)
2123{
2124 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2125
2126 currItem->heapTid = *heapTid;
2127 currItem->indexOffset = offnum;
2128
2129 /*
2130 * Have index-only scans return the same base IndexTuple for every TID
2131 * that originates from the same posting list
2132 */
2133 if (so->currTuples)
2134 currItem->tupleOffset = tupleOffset;
2135}
2136
2137/*
2138 * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
2139 * index scan by setting the relevant fields in caller's index scan descriptor
2140 */
2141static inline void
2142_bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
2143{
2144 BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
2145
2146 /* Most recent _bt_readpage must have succeeded */
2147 Assert(BTScanPosIsValid(so->currPos));
2148 Assert(so->currPos.itemIndex >= so->currPos.firstItem);
2149 Assert(so->currPos.itemIndex <= so->currPos.lastItem);
2150
2151 /* Return next item, per amgettuple contract */
2152 scan->xs_heaptid = currItem->heapTid;
2153 if (so->currTuples)
2154 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2155}
2156
2157/*
2158 * _bt_steppage() -- Step to next page containing valid data for scan
2159 *
2160 * Wrapper on _bt_readnextpage that performs final steps for the current page.
2161 *
2162 * On entry, so->currPos must be valid. Its buffer will be pinned, though
2163 * never locked. (Actually, when so->dropPin there won't even be a pin held,
2164 * though so->currPos.currPage must still be set to a valid block number.)
2165 */
2166static bool
2167_bt_steppage(IndexScanDesc scan, ScanDirection dir)
2168{
2169 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2170 BlockNumber blkno,
2171 lastcurrblkno;
2172
2173 Assert(BTScanPosIsValid(so->currPos));
2174
2175 /* Before leaving current page, deal with any killed items */
2176 if (so->numKilled > 0)
2177 _bt_killitems(scan);
2178
2179 /*
2180 * Before we modify currPos, make a copy of the page data if there was a
2181 * mark position that needs it.
2182 */
2183 if (so->markItemIndex >= 0)
2184 {
2185 /* bump pin on current buffer for assignment to mark buffer */
2186 if (BTScanPosIsPinned(so->currPos))
2187 IncrBufferRefCount(so->currPos.buf);
2188 memcpy(&so->markPos, &so->currPos,
2189 offsetof(BTScanPosData, items[1]) +
2190 so->currPos.lastItem * sizeof(BTScanPosItem));
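 /*
  * Editor's note: offsetof(BTScanPosData, items[1]) covers the fixed header
  * plus items[0], so the memcpy size above copies exactly items[0..lastItem].
  */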
2191 if (so->markTuples)
2192 memcpy(so->markTuples, so->currTuples,
2193 so->currPos.nextTupleOffset);
2194
2195 so->markItemIndex = -1;
2196
2197 /*
2198 * If we're just about to start the next primitive index scan
2199 * (possible with a scan that has array keys, and needs to skip to
2200 * continue in the current scan direction), moreLeft/moreRight only
2201 * indicate the end of the current primitive index scan. They must
2202 * never be taken to indicate that the top-level index scan has ended
2203 * (that would be wrong).
2204 *
2205 * We could handle this case by treating the current array keys as
2206 * markPos state. But depending on the current array state like this
2207 * would add complexity. Instead, we just unset markPos's copy of
2208 * moreRight or moreLeft (whichever might be affected), while making
2209 * btrestrpos reset the scan's arrays to their initial scan positions.
2210 * In effect, btrestrpos leaves advancing the arrays up to the first
2211 * _bt_readpage call (that takes place after it has restored markPos).
2212 */
2213 if (so->needPrimScan)
2214 {
2215 if (ScanDirectionIsForward(so->currPos.dir))
2216 so->markPos.moreRight = true;
2217 else
2218 so->markPos.moreLeft = true;
2219 }
2220
2221 /* mark/restore not supported by parallel scans */
2222 Assert(!scan->parallel_scan);
2223 }
2224
2225 BTScanPosUnpinIfPinned(so->currPos);
2226
2227 /* Walk to the next page with data */
2228 if (ScanDirectionIsForward(dir))
2229 blkno = so->currPos.nextPage;
2230 else
2231 blkno = so->currPos.prevPage;
2232 lastcurrblkno = so->currPos.currPage;
2233
2234 /*
2235 * Cancel primitive index scans that were scheduled when the call to
2236 * _bt_readpage for currPos happened to use the opposite direction to the
2237 * one that we're stepping in now. (It's okay to leave the scan's array
2238 * keys as-is, since the next _bt_readpage will advance them.)
2239 */
2240 if (so->currPos.dir != dir)
2241 so->needPrimScan = false;
2242
2243 return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
2244}
2245
2246/*
2247 * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
2248 *
2249 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
2250 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
2251 * When we're passed an offnum past the end of the page, we might still manage
2252 * to stop the scan on this page by calling _bt_checkkeys against the high
2253 * key. See _bt_readpage for full details.
2254 *
2255 * On entry, so->currPos must be pinned and locked (so offnum stays valid).
2256 * Parallel scan callers must have seized the scan before calling here.
2257 *
2258 * On exit, we'll have updated so->currPos and retained locks and pins
2259 * according to the same rules as those laid out for _bt_readnextpage exit.
2260 * Like _bt_readnextpage, our return value indicates if there are any matching
2261 * records in the given direction.
2262 *
2263 * We always release the scan for a parallel scan caller, regardless of
2264 * success or failure; we'll call _bt_parallel_release as soon as possible.
2265 */
2266static bool
2267_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
2268{
2269 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2270
2271 so->numKilled = 0; /* just paranoia */
2272 so->markItemIndex = -1; /* ditto */
2273
2274 /* Initialize so->currPos for the first page (page in so->currPos.buf) */
2275 if (so->needPrimScan)
2276 {
2277 Assert(so->numArrayKeys);
2278
2279 so->currPos.moreLeft = true;
2280 so->currPos.moreRight = true;
2281 so->needPrimScan = false;
2282 }
2283 else if (ScanDirectionIsForward(dir))
2284 {
2285 so->currPos.moreLeft = false;
2286 so->currPos.moreRight = true;
2287 }
2288 else
2289 {
2290 so->currPos.moreLeft = true;
2291 so->currPos.moreRight = false;
2292 }
2293
2294 /*
2295 * Attempt to load matching tuples from the first page.
2296 *
2297 * Note that _bt_readpage will finish initializing the so->currPos fields.
2298 * _bt_readpage also releases parallel scan (even when it returns false).
2299 */
2300 if (_bt_readpage(scan, dir, offnum, true))
2301 {
2302 Relation rel = scan->indexRelation;
2303
2304 /*
2305 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2306 * so->currPos.buf in preparation for btgettuple returning tuples.
2307 */
2308 _bt_drop_lock_and_maybe_pin(rel, so);
2309
2310 return true;
2311 }
2312
2313 /* There's no actually-matching data on the page in so->currPos.buf */
2314 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2315
2316 /* Call _bt_readnextpage using its _bt_steppage wrapper function */
2317 if (!_bt_steppage(scan, dir))
2318 return false;
2319
2320 /* _bt_readpage for a later page (now in so->currPos) succeeded */
2321 return true;
2322}
2323
2324/*
2325 * _bt_readnextpage() -- Read next page containing valid data for _bt_next
2326 *
2327 * Caller's blkno is the next interesting page's link, taken from either the
2328 * previously-saved right link or left link. lastcurrblkno is the page that
2329 * was current at the point where the blkno link was saved, which we use to
2330 * reason about concurrent page splits/page deletions during backwards scans.
2331 * In the common case where seized=false, blkno is either so->currPos.nextPage
2332 * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
2333 *
2334 * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
2335 * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
2336 * won't need to be read again in almost all cases). Parallel scan callers
2337 * that seized the scan before calling here should pass seized=true; such a
2338 * caller's blkno and lastcurrblkno arguments come from the seized scan.
2339 * seized=false callers just pass us the blkno/lastcurrblkno taken from their
2340 * so->currPos, which (along with so->currPos itself) can be used to end the
2341 * scan. A seized=false caller's blkno can never be assumed to be the page
2342 * that must be read next during a parallel scan, though. We must figure that
2343 * part out for ourselves by seizing the scan (the correct page to read might
2344 * already be beyond the seized=false caller's blkno during a parallel scan,
2345 * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
2346 * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
2347 *
2348 * On success exit, so->currPos is updated to contain data from the next
2349 * interesting page, and we return true. We hold a pin on the buffer on
2350 * success exit (except during so->dropPin index scans, when we drop the pin
2351 * eagerly to avoid blocking VACUUM).
2352 *
2353 * If there are no more matching records in the given direction, we invalidate
2354 * so->currPos (while ensuring it retains no locks or pins), and return false.
2355 *
2356 * We always release the scan for a parallel scan caller, regardless of
2357 * success or failure; we'll call _bt_parallel_release as soon as possible.
2358 */
2359static bool
2360_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
2361 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
2362{
2363 Relation rel = scan->indexRelation;
2364 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2365
2366 Assert(so->currPos.currPage == lastcurrblkno || seized);
2367 Assert(!(blkno == P_NONE && seized));
2368 Assert(!BTScanPosIsPinned(so->currPos));
2369
2370 /*
2371 * Remember that the scan already read lastcurrblkno, a page to the left
2372 * of blkno (or remember reading a page to the right, for backwards scans)
2373 */
2374 if (ScanDirectionIsForward(dir))
2375 so->currPos.moreLeft = true;
2376 else
2377 so->currPos.moreRight = true;
2378
2379 for (;;)
2380 {
2381 Page page;
2382 BTPageOpaque opaque;
2383
2384 if (blkno == P_NONE ||
2385 (ScanDirectionIsForward(dir) ?
2386 !so->currPos.moreRight : !so->currPos.moreLeft))
2387 {
2388 /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
2389 Assert(so->currPos.currPage == lastcurrblkno && !seized);
2390 BTScanPosInvalidate(so->currPos);
2391 _bt_parallel_done(scan); /* iff !so->needPrimScan */
2392 return false;
2393 }
2394
2395 Assert(!so->needPrimScan);
2396
2397 /* parallel scan must never actually visit so->currPos blkno */
2398 if (!seized && scan->parallel_scan != NULL &&
2399 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
2400 {
2401 /* whole scan is now done (or another primitive scan required) */
2402 BTScanPosInvalidate(so->currPos);
2403 return false;
2404 }
2405
2406 if (ScanDirectionIsForward(dir))
2407 {
2408 /* read blkno, but check for interrupts first */
2409 CHECK_FOR_INTERRUPTS();
2410 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2411 }
2412 else
2413 {
2414 /* read blkno, avoiding race (also checks for interrupts) */
2415 so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
2416 lastcurrblkno);
2417 if (so->currPos.buf == InvalidBuffer)
2418 {
2419 /* must have been a concurrent deletion of leftmost page */
2420 BTScanPosInvalidate(so->currPos);
2421 _bt_parallel_done(scan);
2422 return false;
2423 }
2424 }
2425
2426 page = BufferGetPage(so->currPos.buf);
2427 opaque = BTPageGetOpaque(page);
2428 lastcurrblkno = blkno;
2429 if (likely(!P_IGNORE(opaque)))
2430 {
2431 /* see if there are any matches on this page */
2432 if (ScanDirectionIsForward(dir))
2433 {
2434 /* note that this will clear moreRight if we can stop */
2435 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
2436 break;
2437 blkno = so->currPos.nextPage;
2438 }
2439 else
2440 {
2441 /* note that this will clear moreLeft if we can stop */
2442 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
2443 break;
2444 blkno = so->currPos.prevPage;
2445 }
2446 }
2447 else
2448 {
2449 /* _bt_readpage not called, so do all this for ourselves */
2450 if (ScanDirectionIsForward(dir))
2451 blkno = opaque->btpo_next;
2452 else
2453 blkno = opaque->btpo_prev;
2454 if (scan->parallel_scan != NULL)
2455 _bt_parallel_release(scan, blkno, lastcurrblkno);
2456 }
2457
2458 /* no matching tuples on this page */
2459 _bt_relbuf(rel, so->currPos.buf);
2460 seized = false; /* released by _bt_readpage (or by us) */
2461 }
2462
2463 /*
2464 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2465 * so->currPos.buf in preparation for btgettuple returning tuples.
2466 */
2467 Assert(so->currPos.currPage == blkno);
2468 Assert(!so->needPrimScan);
2469 _bt_drop_lock_and_maybe_pin(rel, so);
2470
2471 return true;
2472}
2473
2474/*
2475 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
2476 * recovering from concurrent page splits/page deletions when necessary
2477 *
2478 * Called during backwards scans, to deal with their unique concurrency rules.
2479 *
2480 * blkno points to the block number of the page that we expect to move the
2481 * scan to. We'll successfully move the scan there when we find that its
2482 * right sibling link still points to lastcurrblkno (the page we just read).
2483 * Otherwise, we have to figure out which page is the correct one for the scan
2484 * to now read the hard way, reasoning about concurrent splits and deletions.
2485 * See nbtree/README.
2486 *
2487 * On return, we have both a pin and a read lock on the returned page, whose
2488 * block number will be set in *blkno. Returns InvalidBuffer if there is no
2489 * page to the left (no lock or pin is held in that case).
2490 *
2491 * It is possible for the returned leaf page to be half-dead; caller must
2492 * check that condition and step left again when required.
2493 */
2494static Buffer
2495_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
2496 BlockNumber lastcurrblkno)
2497{
2498 BlockNumber origblkno = *blkno; /* detects circular links */
2499
2500 for (;;)
2501 {
2502 Buffer buf;
2503 Page page;
2504 BTPageOpaque opaque;
2505 int tries;
2506
2507 /* check for interrupts while we're not holding any buffer lock */
2508 CHECK_FOR_INTERRUPTS();
2509 buf = _bt_getbuf(rel, *blkno, BT_READ);
2510 page = BufferGetPage(buf);
2511 opaque = BTPageGetOpaque(page);
2512
2513 /*
2514 * If this isn't the page we want, walk right till we find what we
2515 * want --- but go no more than four hops (an arbitrary limit). If we
2516 * don't find the correct page by then, the most likely bet is that
2517 * lastcurrblkno got deleted and isn't in the sibling chain at all
2518 * anymore, not that its left sibling got split more than four times.
2519 *
2520 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2521 * because half-dead pages are still in the sibling chain.
2522 */
2523 tries = 0;
2524 for (;;)
2525 {
2526 if (likely(!P_ISDELETED(opaque) &&
2527 opaque->btpo_next == lastcurrblkno))
2528 {
2529 /* Found desired page, return it */
2530 return buf;
2531 }
2532 if (P_RIGHTMOST(opaque) || ++tries > 4)
2533 break;
2534 /* step right */
2535 *blkno = opaque->btpo_next;
2536 buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2537 page = BufferGetPage(buf);
2538 opaque = BTPageGetOpaque(page);
2539 }
2540
2541 /*
2542 * Return to the original page (usually the page most recently read by
2543 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2544 * what's up with its prev sibling link
2545 */
2546 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2547 page = BufferGetPage(buf);
2548 opaque = BTPageGetOpaque(page);
2549 if (P_ISDELETED(opaque))
2550 {
2551 /*
2552 * It was deleted. Move right to first nondeleted page (there
2553 * must be one); that is the page that has acquired the deleted
2554 * one's keyspace, so stepping left from it will take us where we
2555 * want to be.
2556 */
2557 for (;;)
2558 {
2559 if (P_RIGHTMOST(opaque))
2560 elog(ERROR, "fell off the end of index \"%s\"",
2561 RelationGetRelationName(rel));
2562 lastcurrblkno = opaque->btpo_next;
2563 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2564 page = BufferGetPage(buf);
2565 opaque = BTPageGetOpaque(page);
2566 if (!P_ISDELETED(opaque))
2567 break;
2568 }
2569 }
2570 else
2571 {
2572 /*
2573 * Original lastcurrblkno wasn't deleted; the explanation had
2574 * better be that the page to the left got split or deleted.
2575 * Without this check, we risk going into an infinite loop.
2576 */
2577 if (opaque->btpo_prev == origblkno)
2578 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2579 lastcurrblkno, RelationGetRelationName(rel));
2580 /* Okay to try again, since left sibling link changed */
2581 }
2582
2583 /*
2584 * Original lastcurrblkno from caller was concurrently deleted (could
2585 * also have been a great many concurrent left sibling page splits).
2586 * Found a non-deleted page that should now act as our lastcurrblkno.
2587 */
2588 if (P_LEFTMOST(opaque))
2589 {
2590 /* New lastcurrblkno has no left sibling (concurrently deleted) */
2591 _bt_relbuf(rel, buf);
2592 break;
2593 }
2594
2595 /* Start from scratch with new lastcurrblkno's blkno/prev link */
2596 *blkno = origblkno = opaque->btpo_prev;
2597 _bt_relbuf(rel, buf);
2598 }
2599
2600 return InvalidBuffer;
2601}
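/*
 * Editor's example (hypothetical block labels): suppose the scan saved prev
 * link P while reading page C, after which P split, leaving P -> P2 -> C in
 * the sibling chain.  Locking P finds btpo_next != C, so the walk-right loop
 * above steps to P2, whose btpo_next == C, and returns P2 as the page that
 * the backwards scan should read next.
 */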
2602
2603/*
2604 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2605 *
2606 * If the index is empty, we will return InvalidBuffer; any other failure
2607 * condition causes ereport(). We will not return a dead page.
2608 *
2609 * The returned buffer is pinned and read-locked.
2610 */
2611Buffer
2612_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2613{
2614 Buffer buf;
2615 Page page;
2616 BTPageOpaque opaque;
2617 OffsetNumber offnum;
2618 BlockNumber blkno;
2619 IndexTuple itup;
2620
2621 /*
2622 * If we are looking for a leaf page, okay to descend from fast root;
2623 * otherwise better descend from true root. (There is no point in being
2624 * smarter about intermediate levels.)
2625 */
2626 if (level == 0)
2627 buf = _bt_getroot(rel, NULL, BT_READ);
2628 else
2629 buf = _bt_gettrueroot(rel);
2630
2631 if (!BufferIsValid(buf))
2632 return InvalidBuffer;
2633
2634 page = BufferGetPage(buf);
2635 opaque = BTPageGetOpaque(page);
2636
2637 for (;;)
2638 {
2639 /*
2640 * If we landed on a deleted page, step right to find a live page
2641 * (there must be one). Also, if we want the rightmost page, step
2642 * right if needed to get to it (this could happen if the page split
2643 * since we obtained a pointer to it).
2644 */
2645 while (P_IGNORE(opaque) ||
2646 (rightmost && !P_RIGHTMOST(opaque)))
2647 {
2648 blkno = opaque->btpo_next;
2649 if (blkno == P_NONE)
2650 elog(ERROR, "fell off the end of index \"%s\"",
2651 RelationGetRelationName(rel));
2652 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2653 page = BufferGetPage(buf);
2654 opaque = BTPageGetOpaque(page);
2655 }
2656
2657 /* Done? */
2658 if (opaque->btpo_level == level)
2659 break;
2660 if (opaque->btpo_level < level)
2661 ereport(ERROR,
2662 (errcode(ERRCODE_INDEX_CORRUPTED),
2663 errmsg_internal("btree level %u not found in index \"%s\"",
2664 level, RelationGetRelationName(rel))));
2665
2666 /* Descend to leftmost or rightmost child page */
2667 if (rightmost)
2668 offnum = PageGetMaxOffsetNumber(page);
2669 else
2670 offnum = P_FIRSTDATAKEY(opaque);
2671
2672 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2673 blkno = BTreeTupleGetDownLink(itup);
2674
2675 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2676 page = BufferGetPage(buf);
2677 opaque = BTPageGetOpaque(page);
2678 }
2679
2680 return buf;
2681}
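/*
 * Editor's sketch (not part of nbtsearch.c): passing level 0 yields a leaf
 * page, which is how _bt_endpoint (below) positions itself:
 *
 *	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
 */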
2682
2683/*
2684 * _bt_endpoint() -- Find the first or last page in the index, and scan
2685 * from there to the first key satisfying all the quals.
2686 *
2687 * This is used by _bt_first() to set up a scan when we've determined
2688 * that the scan must start at the beginning or end of the index (for
2689 * a forward or backward scan respectively).
2690 *
2691 * Parallel scan callers must have seized the scan before calling here.
2692 * Exit conditions are the same as for _bt_first().
2693 */
2694static bool
2695_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2696{
2697 Relation rel = scan->indexRelation;
2698 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2699 Page page;
2700 BTPageOpaque opaque;
2701 OffsetNumber start;
2702
2703 Assert(!BTScanPosIsValid(so->currPos));
2704 Assert(!so->needPrimScan);
2705
2706 /*
2707 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2708 * version of _bt_search().
2709 */
2710 so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2711
2712 if (!BufferIsValid(so->currPos.buf))
2713 {
2714 /*
2715 * Empty index. Lock the whole relation, as nothing finer to lock
2716 * exists.
2717 */
2718 PredicateLockRelation(rel, scan->xs_snapshot);
2719 _bt_parallel_done(scan);
2720 return false;
2721 }
2722
2723 page = BufferGetPage(so->currPos.buf);
2724 opaque = BTPageGetOpaque(page);
2725 Assert(P_ISLEAF(opaque));
2726
2727 if (ScanDirectionIsForward(dir))
2728 {
2729 /* There could be dead pages to the left, so not this: */
2730 /* Assert(P_LEFTMOST(opaque)); */
2731
2732 start = P_FIRSTDATAKEY(opaque);
2733 }
2734 else if (ScanDirectionIsBackward(dir))
2735 {
2736 Assert(P_RIGHTMOST(opaque));
2737
2738 start = PageGetMaxOffsetNumber(page);
2739 }
2740 else
2741 {
2742 elog(ERROR, "invalid scan direction: %d", (int) dir);
2743 start = 0; /* keep compiler quiet */
2744 }
2745
2746 /*
2747 * Now load data from the first page of the scan.
2748 */
2749 if (!_bt_readfirstpage(scan, start, dir))
2750 return false;
2751
2752 _bt_returnitem(scan, so);
2753 return true;
2754}
Definition: xact.h:53