PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "executor/instrument_node.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (Relation rel, BTScanOpaque so)
 
static Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static void _bt_returnitem (IndexScanDesc scan, BTScanOpaque so)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readfirstpage (IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
 
static Buffer _bt_lock_and_validate_left (Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 339 of file nbtsearch.c.

342{
343 Page page;
344 BTPageOpaque opaque;
345 OffsetNumber low,
346 high;
347 int32 result,
348 cmpval;
349
350 page = BufferGetPage(buf);
351 opaque = BTPageGetOpaque(page);
352
353 /* Requesting nextkey semantics while using scantid seems nonsensical */
354 Assert(!key->nextkey || key->scantid == NULL);
355 /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
356 Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
357
358 low = P_FIRSTDATAKEY(opaque);
359 high = PageGetMaxOffsetNumber(page);
360
361 /*
362 * If there are no keys on the page, return the first available slot. Note
363 * this covers two cases: the page is really empty (no keys), or it
364 * contains only a high key. The latter case is possible after vacuuming.
365 * This can never happen on an internal page, however, since they are
366 * never empty (an internal page must have at least one child).
367 */
368 if (unlikely(high < low))
369 return low;
370
371 /*
372 * Binary search to find the first key on the page >= scan key, or first
373 * key > scankey when nextkey is true.
374 *
375 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
376 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
377 *
378 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
379 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
380 *
381 * We can fall out when high == low.
382 */
383 high++; /* establish the loop invariant for high */
384
385 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
386
387 while (high > low)
388 {
389 OffsetNumber mid = low + ((high - low) / 2);
390
391 /* We have low <= mid < high, so mid points at a real slot */
392
393 result = _bt_compare(rel, key, page, mid);
394
395 if (result >= cmpval)
396 low = mid + 1;
397 else
398 high = mid;
399 }
400
401 /*
402 * At this point we have high == low.
403 *
404 * On a leaf page we always return the first non-pivot tuple >= scan key
405 * (resp. > scan key) for forward scan callers. For backward scans, it's
406 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
407 */
408 if (P_ISLEAF(opaque))
409 {
410 /*
411 * In the backward scan case we're supposed to locate the last
412 * matching tuple on the leaf level -- not the first matching tuple
413 * (the last tuple will be the first one returned by the scan).
414 *
415 * At this point we've located the first non-pivot tuple immediately
416 * after the last matching tuple (which might just be maxoff + 1).
417 * Compensate by stepping back.
418 */
419 if (key->backward)
420 return OffsetNumberPrev(low);
421
422 return low;
423 }
424
425 /*
426 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
427 * There must be one if _bt_compare() is playing by the rules.
428 *
429 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
430 * a truncated -inf heap TID is usually enough to prevent it altogether.
431 * Even omitted scan key entries are treated as > truncated attributes.
432 *
433 * However, during backward scans _bt_compare() interprets omitted scan
434 * key attributes as == corresponding truncated -inf attributes instead.
435 * This works just like < would work here. Under this scheme, < strategy
436 * backward scans will always directly descend to the correct leaf page.
437 * In particular, they will never incur an "extra" leaf page access with a
438 * scan key that happens to contain the same prefix of values as some
439 * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
440 * it uses a leaf page high key to "re-find" a page undergoing deletion.
441 */
442 Assert(low > P_FIRSTDATAKEY(opaque));
443
444 return OffsetNumberPrev(low);
445}
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
PageData * Page
Definition bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:371
#define Assert(condition)
Definition c.h:873
int32_t int32
Definition c.h:542
#define unlikely(x)
Definition c.h:412
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:684
uint16 OffsetNumber
Definition off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int fb(int x)

References _bt_compare(), Assert, BTPageGetOpaque, buf, BufferGetPage(), fb(), OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 470 of file nbtsearch.c.

471{
472 BTScanInsert key = insertstate->itup_key;
473 Page page;
474 BTPageOpaque opaque;
475 OffsetNumber low,
476 high,
477 stricthigh;
478 int32 result,
479 cmpval;
480
481 page = BufferGetPage(insertstate->buf);
482 opaque = BTPageGetOpaque(page);
483
484 Assert(P_ISLEAF(opaque));
485 Assert(!key->nextkey);
486 Assert(insertstate->postingoff == 0);
487
488 if (!insertstate->bounds_valid)
489 {
490 /* Start new binary search */
491 low = P_FIRSTDATAKEY(opaque);
492 high = PageGetMaxOffsetNumber(page);
493 }
494 else
495 {
496 /* Restore result of previous binary search against same page */
497 low = insertstate->low;
498 high = insertstate->stricthigh;
499 }
500
501 /* If there are no keys on the page, return the first available slot */
502 if (unlikely(high < low))
503 {
504 /* Caller can't reuse bounds */
506 insertstate->stricthigh = InvalidOffsetNumber;
507 insertstate->bounds_valid = false;
508 return low;
509 }
510
511 /*
512 * Binary search to find the first key on the page >= scan key. (nextkey
513 * is always false when inserting).
514 *
515 * The loop invariant is: all slots before 'low' are < scan key, all slots
516 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
517 * maintained to save additional search effort for caller.
518 *
519 * We can fall out when high == low.
520 */
521 if (!insertstate->bounds_valid)
522 high++; /* establish the loop invariant for high */
523 stricthigh = high; /* high initially strictly higher */
524
525 cmpval = 1; /* !nextkey comparison value */
526
527 while (high > low)
528 {
529 OffsetNumber mid = low + ((high - low) / 2);
530
531 /* We have low <= mid < high, so mid points at a real slot */
532
533 result = _bt_compare(rel, key, page, mid);
534
535 if (result >= cmpval)
536 low = mid + 1;
537 else
538 {
539 high = mid;
540 if (result != 0)
541 stricthigh = high;
542 }
543
544 /*
545 * If tuple at offset located by binary search is a posting list whose
546 * TID range overlaps with caller's scantid, perform posting list
547 * binary search to set postingoff for caller. Caller must split the
548 * posting list when postingoff is set. This should happen
549 * infrequently.
550 */
551 if (unlikely(result == 0 && key->scantid != NULL))
552 {
553 /*
554 * postingoff should never be set more than once per leaf page
555 * binary search. That would mean that there are duplicate table
556 * TIDs in the index, which is never okay. Check for that here.
557 */
558 if (insertstate->postingoff != 0)
561 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
564 low, stricthigh,
567
568 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
569 }
570 }
571
572 /*
573 * On a leaf page, a binary search always returns the first key >= scan
574 * key (at least in !nextkey case), which could be the last slot + 1. This
575 * is also the lower bound of cached search.
576 *
577 * stricthigh may also be the last slot + 1, which prevents caller from
578 * using bounds directly, but is still useful to us if we're called a
579 * second time with cached bounds (cached low will be < stricthigh when
580 * that happens).
581 */
582 insertstate->low = low;
583 insertstate->stricthigh = stricthigh;
584 insertstate->bounds_valid = true;
585
586 return low;
587}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4356
int errmsg_internal(const char *fmt,...)
Definition elog.c:1170
int errcode(int sqlerrcode)
Definition elog.c:863
#define ERROR
Definition elog.h:39
#define ereport(elevel,...)
Definition elog.h:150
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:598
#define InvalidOffsetNumber
Definition off.h:26
#define RelationGetRelationName(relation)
Definition rel.h:548

References _bt_binsrch_posting(), _bt_compare(), Assert, BTPageGetOpaque, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, fb(), InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), RelationGetRelationName, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 598 of file nbtsearch.c.

599{
600 IndexTuple itup;
601 ItemId itemid;
602 int low,
603 high,
604 mid,
605 res;
606
607 /*
608 * If this isn't a posting tuple, then the index must be corrupt (if it is
609 * an ordinary non-pivot tuple then there must be an existing tuple with a
610 * heap TID that equals inserter's new heap TID/scantid). Defensively
611 * check that tuple is a posting list tuple whose posting list range
612 * includes caller's scantid.
613 *
614 * (This is also needed because contrib/amcheck's rootdescend option needs
615 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
616 */
617 itemid = PageGetItemId(page, offnum);
618 itup = (IndexTuple) PageGetItem(page, itemid);
619 if (!BTreeTupleIsPosting(itup))
620 return 0;
621
622 Assert(key->heapkeyspace && key->allequalimage);
623
624 /*
625 * In the event that posting list tuple has LP_DEAD bit set, indicate this
626 * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
627 * second call to _bt_binsrch_insert() can take place when its caller has
628 * removed the dead item.
629 */
630 if (ItemIdIsDead(itemid))
631 return -1;
632
633 /* "high" is past end of posting list for loop invariant */
634 low = 0;
635 high = BTreeTupleGetNPosting(itup);
636 Assert(high >= 2);
637
638 while (high > low)
639 {
640 mid = low + ((high - low) / 2);
641 res = ItemPointerCompare(key->scantid,
642 BTreeTupleGetPostingN(itup, mid));
643
644 if (res > 0)
645 low = mid + 1;
646 else if (res < 0)
647 high = mid;
648 else
649 return mid;
650 }
651
652 /* Exact match not found */
653 return low;
654}
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:353
#define ItemIdIsDead(itemId)
Definition itemid.h:113
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
IndexTupleData * IndexTuple
Definition itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition nbtree.h:545
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), PageGetItem(), and PageGetItemId().

Referenced by _bt_binsrch_insert().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 684 of file nbtsearch.c.

688{
690 BTPageOpaque opaque = BTPageGetOpaque(page);
691 IndexTuple itup;
692 ItemPointer heapTid;
694 int ncmpkey;
695 int ntupatts;
696 int32 result;
697
698 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
700 Assert(key->heapkeyspace || key->scantid == NULL);
701
702 /*
703 * Force result ">" if target item is first data item on an internal page
704 * --- see NOTE above.
705 */
706 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
707 return 1;
708
709 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
710 ntupatts = BTreeTupleGetNAtts(itup, rel);
711
712 /*
713 * The scan key is set up with the attribute number associated with each
714 * term in the key. It is important that, if the index is multi-key, the
715 * scan contain the first k key attributes, and that they be in order. If
716 * you think about how multi-key ordering works, you'll understand why
717 * this is.
718 *
719 * We don't test for violation of this condition here, however. The
720 * initial setup for the index scan had better have gotten it right (see
721 * _bt_first).
722 */
723
724 ncmpkey = Min(ntupatts, key->keysz);
725 Assert(key->heapkeyspace || ncmpkey == key->keysz);
726 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
727 scankey = key->scankeys;
728 for (int i = 1; i <= ncmpkey; i++)
729 {
730 Datum datum;
731 bool isNull;
732
733 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
734
735 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
736 {
737 if (isNull)
738 result = 0; /* NULL "=" NULL */
739 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
740 result = -1; /* NULL "<" NOT_NULL */
741 else
742 result = 1; /* NULL ">" NOT_NULL */
743 }
744 else if (isNull) /* key is NOT_NULL and item is NULL */
745 {
746 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
747 result = 1; /* NOT_NULL ">" NULL */
748 else
749 result = -1; /* NOT_NULL "<" NULL */
750 }
751 else
752 {
753 /*
754 * The sk_func needs to be passed the index value as left arg and
755 * the sk_argument as right arg (they might be of different
756 * types). Since it is convenient for callers to think of
757 * _bt_compare as comparing the scankey to the index item, we have
758 * to flip the sign of the comparison result. (Unless it's a DESC
759 * column, in which case we *don't* flip the sign.)
760 */
761 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
762 scankey->sk_collation,
763 datum,
764 scankey->sk_argument));
765
766 if (!(scankey->sk_flags & SK_BT_DESC))
767 INVERT_COMPARE_RESULT(result);
768 }
769
770 /* if the keys are unequal, return the difference */
771 if (result != 0)
772 return result;
773
774 scankey++;
775 }
776
777 /*
778 * All non-truncated attributes (other than heap TID) were found to be
779 * equal. Treat truncated attributes as minus infinity when scankey has a
780 * key attribute value that would otherwise be compared directly.
781 *
782 * Note: it doesn't matter if ntupatts includes non-key attributes;
783 * scankey won't, so explicitly excluding non-key attributes isn't
784 * necessary.
785 */
786 if (key->keysz > ntupatts)
787 return 1;
788
789 /*
790 * Use the heap TID attribute and scantid to try to break the tie. The
791 * rules are the same as any other key attribute -- only the
792 * representation differs.
793 */
794 heapTid = BTreeTupleGetHeapTID(itup);
795 if (key->scantid == NULL)
796 {
797 /*
798 * Forward scans have a scankey that is considered greater than a
799 * truncated pivot tuple if and when the scankey has equal values for
800 * attributes up to and including the least significant untruncated
801 * attribute in tuple. Even attributes that were omitted from the
802 * scan key are considered greater than -inf truncated attributes.
803 * (See _bt_binsrch for an explanation of our backward scan behavior.)
804 *
805 * For example, if an index has the minimum two attributes (single
806 * user key attribute, plus heap TID attribute), and a page's high key
807 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
808 * will not descend to the page to the left. The search will descend
809 * right instead. The truncated attribute in pivot tuple means that
810 * all non-pivot tuples on the page to the left are strictly < 'foo',
811 * so it isn't necessary to descend left. In other words, search
812 * doesn't have to descend left because it isn't interested in a match
813 * that has a heap TID value of -inf.
814 *
815 * Note: the heap TID part of the test ensures that scankey is being
816 * compared to a pivot tuple with one or more truncated -inf key
817 * attributes. The heap TID attribute is the last key attribute in
818 * every index, of course, but other than that it isn't special.
819 */
820 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
821 key->heapkeyspace)
822 return 1;
823
824 /* All provided scankey arguments found to be equal */
825 return 0;
826 }
827
828 /*
829 * Treat truncated heap TID as minus infinity, since scankey has a key
830 * attribute value (scantid) that would otherwise be compared directly
831 */
833 if (heapTid == NULL)
834 return 1;
835
836 /*
837 * Scankey must be treated as equal to a posting list tuple if its scantid
838 * value falls within the range of the posting list. In all other cases
839 * there can only be a single heap TID value, which is compared directly
840 * with scantid.
841 */
843 result = ItemPointerCompare(key->scantid, heapTid);
844 if (result <= 0 || !BTreeTupleIsPosting(itup))
845 return result;
846 else
847 {
848 result = ItemPointerCompare(key->scantid,
850 if (result > 0)
851 return 1;
852 }
853
854 return 0;
855}
#define Min(x, y)
Definition c.h:997
#define INVERT_COMPARE_RESULT(var)
Definition c.h:1099
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition fmgr.c:1150
int i
Definition isn.c:77
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition itup.h:131
#define SK_BT_NULLS_FIRST
Definition nbtree.h:1117
#define SK_BT_DESC
Definition nbtree.h:1116
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition nbtree.h:665
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition nbtree.h:639
#define BTreeTupleGetNAtts(itup, rel)
Definition nbtree.h:578
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition nbtutils.c:964
uint64_t Datum
Definition postgres.h:70
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
#define RelationGetDescr(relation)
Definition rel.h:540
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
#define SK_ISNULL
Definition skey.h:115

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), fb(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, SK_BT_DESC, SK_BT_NULLS_FIRST, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( Relation  rel,
BTScanOpaque  so 
)
inlinestatic

Definition at line 55 of file nbtsearch.c.

56{
57 if (!so->dropPin)
58 {
59 /* Just drop the lock (not the pin) */
60 _bt_unlockbuf(rel, so->currPos.buf);
61 return;
62 }
63
64 /*
65 * Drop both the lock and the pin.
66 *
67 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
68 * when concurrent heap TID recycling by VACUUM might have taken place.
69 */
71 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
72 _bt_relbuf(rel, so->currPos.buf);
73 so->currPos.buf = InvalidBuffer;
74}
#define InvalidBuffer
Definition buf.h:25
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4634
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1024
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1071
#define RelationNeedsWAL(relation)
Definition rel.h:637

References _bt_relbuf(), _bt_unlockbuf(), Assert, BufferGetLSNAtomic(), fb(), InvalidBuffer, and RelationNeedsWAL.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2178 of file nbtsearch.c.

2179{
2180 Relation rel = scan->indexRelation;
2182 Page page;
2183 BTPageOpaque opaque;
2185
2186 Assert(!BTScanPosIsValid(so->currPos));
2187 Assert(!so->needPrimScan);
2188
2189 /*
2190 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2191 * version of _bt_search().
2192 */
2193 so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2194
2195 if (!BufferIsValid(so->currPos.buf))
2196 {
2197 /*
2198 * Empty index. Lock the whole relation, as nothing finer to lock
2199 * exists.
2200 */
2202 _bt_parallel_done(scan);
2203 return false;
2204 }
2205
2206 page = BufferGetPage(so->currPos.buf);
2207 opaque = BTPageGetOpaque(page);
2208 Assert(P_ISLEAF(opaque));
2209
2210 if (ScanDirectionIsForward(dir))
2211 {
2212 /* There could be dead pages to the left, so not this: */
2213 /* Assert(P_LEFTMOST(opaque)); */
2214
2215 start = P_FIRSTDATAKEY(opaque);
2216 }
2217 else if (ScanDirectionIsBackward(dir))
2218 {
2219 Assert(P_RIGHTMOST(opaque));
2220
2222 }
2223 else
2224 {
2225 elog(ERROR, "invalid scan direction: %d", (int) dir);
2226 start = 0; /* keep compiler quiet */
2227 }
2228
2229 /*
2230 * Now load data from the first page of the scan.
2231 */
2232 if (!_bt_readfirstpage(scan, start, dir))
2233 return false;
2234
2235 _bt_returnitem(scan, so);
2236 return true;
2237}
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
#define elog(elevel,...)
Definition elog.h:226
return str start
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1041
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition nbtsearch.c:2092
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition nbtsearch.c:1747
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition nbtsearch.c:1622
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2574
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
Relation indexRelation
Definition relscan.h:138
struct SnapshotData * xs_snapshot
Definition relscan.h:139

References _bt_get_endpoint(), _bt_parallel_done(), _bt_readfirstpage(), _bt_returnitem(), Assert, BTPageGetOpaque, BTScanPosIsValid, BufferGetPage(), BufferIsValid(), elog, ERROR, fb(), IndexScanDescData::indexRelation, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, start, and IndexScanDescData::xs_snapshot.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 878 of file nbtsearch.c.

879{
880 Relation rel = scan->indexRelation;
882 BTStack stack;
883 OffsetNumber offnum;
884 BTScanInsertData inskey;
887 int keysz = 0;
891
892 Assert(!BTScanPosIsValid(so->currPos));
893
894 /*
895 * Examine the scan keys and eliminate any redundant keys; also mark the
896 * keys that must be matched to continue the scan.
897 */
899
900 /*
901 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
902 * never be satisfied (eg, x == 1 AND x > 2).
903 */
904 if (!so->qual_ok)
905 {
906 Assert(!so->needPrimScan);
907 _bt_parallel_done(scan);
908 return false;
909 }
910
911 /*
912 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
913 * will likely release the parallel scan later on.
914 */
915 if (scan->parallel_scan != NULL &&
916 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
917 return false;
918
919 /*
920 * Initialize the scan's arrays (if any) for the current scan direction
921 * (except when they were already set to later values as part of
922 * scheduling the primitive index scan that is now underway)
923 */
924 if (so->numArrayKeys && !so->needPrimScan)
925 _bt_start_array_keys(scan, dir);
926
927 if (blkno != InvalidBlockNumber)
928 {
929 /*
930 * We anticipated calling _bt_search, but another worker bet us to it.
931 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
932 */
933 Assert(scan->parallel_scan != NULL);
934 Assert(!so->needPrimScan);
935 Assert(blkno != P_NONE);
936
937 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
938 return false;
939
940 _bt_returnitem(scan, so);
941 return true;
942 }
943
944 /*
945 * Count an indexscan for stats, now that we know that we'll call
946 * _bt_search/_bt_endpoint below
947 */
949 if (scan->instrument)
950 scan->instrument->nsearches++;
951
952 /*----------
953 * Examine the scan keys to discover where we need to start the scan.
954 * The selected scan keys (at most one per index column) are remembered by
955 * storing their addresses into the local startKeys[] array. The final
956 * startKeys[] entry's strategy is set in strat_total. (Actually, there
957 * are a couple of cases where we force a less/more restrictive strategy.)
958 *
959 * We must use the key that was marked required (in the direction opposite
960 * our own scan's) during preprocessing. Each index attribute can only
961 * have one such required key. In general, the keys that we use to find
962 * an initial position when scanning forwards are the same keys that end
963 * the scan on the leaf level when scanning backwards (and vice-versa).
964 *
965 * When the scan keys include cross-type operators, _bt_preprocess_keys
966 * may not be able to eliminate redundant keys; in such cases it will
967 * arbitrarily pick a usable key for each attribute (and scan direction),
968 * ensuring that there is no more than one key required in each direction.
969 * We stop considering further keys once we reach the first nonrequired
970 * key (which must come after all required keys), so this can't affect us.
971 *
972 * The required keys that we use as starting boundaries have to be =, >,
973 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
974 * We can use keys for multiple attributes so long as the prior attributes
975 * had only =, >= (resp. =, <=) keys. These rules are very similar to the
976 * rules that preprocessing used to determine which keys to mark required.
977 * We cannot always use every required key as a positioning key, though.
978 * Skip arrays necessitate independently applying our own rules here.
979 * Skip arrays are always generally considered = array keys, but we'll
980 * nevertheless treat them as inequalities at certain points of the scan.
981 * When that happens, it _might_ have implications for the number of
982 * required keys that we can safely use for initial positioning purposes.
983 *
984 * For example, a forward scan with a skip array on its leading attribute
985 * (with no low_compare/high_compare) will have at least two required scan
986 * keys, but we won't use any of them as boundary keys during the scan's
987 * initial call here. Our positioning key during the first call here can
988 * be thought of as representing "> -infinity". Similarly, if such a skip
989 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
990 * during the scan's initial call here; a lower-order key such as "b = 42"
991 * can't be used until the "a" array advances beyond MINVAL/low_compare.
992 *
993 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
994 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
995 * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
996 * that we treat = and >= as equivalent when scanning forwards (just as we
997 * treat = and <= as equivalent when scanning backwards). We effectively
998 * do the same thing (though with a distinct "a" element/value) each time.
999 *
1000 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1001 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1002 * If the index stores nulls at the end of the index we'll be starting
1003 * from, and we have no boundary key for the column (which means the key
1004 * we deduced NOT NULL from is an inequality key that constrains the other
1005 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1006 * use as a boundary key. If we didn't do this, we might find ourselves
1007 * traversing a lot of null entries at the start of the scan.
1008 *
1009 * In this loop, row-comparison keys are treated the same as keys on their
1010 * first (leftmost) columns. We'll add all lower-order columns of the row
1011 * comparison that were marked required during preprocessing below.
1012 *
1013 * _bt_advance_array_keys needs to know exactly how we'll reposition the
1014 * scan (should it opt to schedule another primitive index scan). It is
1015 * critical that primscans only be scheduled when they'll definitely make
1016 * some useful progress. _bt_advance_array_keys does this by calling
1017 * _bt_checkkeys routines that report whether a tuple is past the end of
1018 * matches for the scan's keys (given the scan's current array elements).
1019 * If the page's final tuple is "after the end of matches" for a scan that
1020 * uses the *opposite* scan direction, then it must follow that it's also
1021 * "before the start of matches" for the actual current scan direction.
1022 * It is therefore essential that all of our initial positioning rules are
1023 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1024 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1025 * need to be kept in sync.
1026 *----------
1027 */
1028 if (so->numberOfKeys > 0)
1029 {
1031 ScanKey bkey;
1033 ScanKey cur;
1034
1035 /*
1036 * bkey will be set to the key that preprocessing left behind as the
1037 * boundary key for this attribute, in this scan direction (if any)
1038 */
1039 cur = so->keyData;
1040 curattr = 1;
1041 bkey = NULL;
1042 /* Also remember any scankey that implies a NOT NULL constraint */
1043 impliesNN = NULL;
1044
1045 /*
1046 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1047 * pass to handle after-last-key processing. Actual exit from the
1048 * loop is at one of the "break" statements below.
1049 */
1050 for (int i = 0;; cur++, i++)
1051 {
1052 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1053 {
1054 /* Done looking for the curattr boundary key */
1055 Assert(bkey == NULL ||
1056 (bkey->sk_attno == curattr &&
1057 (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1058 Assert(impliesNN == NULL ||
1059 (impliesNN->sk_attno == curattr &&
1060 (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1061
1062 /*
1063 * If this is a scan key for a skip array whose current
1064 * element is MINVAL, choose low_compare (when scanning
1065 * backwards it'll be MAXVAL, and we'll choose high_compare).
1066 *
1067 * Note: if the array's low_compare key makes 'bkey' NULL,
1068 * then we behave as if the array's first element is -inf,
1069 * except when !array->null_elem implies a usable NOT NULL
1070 * constraint.
1071 */
1072 if (bkey != NULL &&
1073 (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1074 {
1075 int ikey = bkey - so->keyData;
1077 BTArrayKeyInfo *array = NULL;
1078
1079 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1080 {
1081 array = &so->arrayKeys[arridx];
1082 if (array->scan_key == ikey)
1083 break;
1084 }
1085
1086 if (ScanDirectionIsForward(dir))
1087 {
1088 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1089 bkey = array->low_compare;
1090 }
1091 else
1092 {
1093 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1094 bkey = array->high_compare;
1095 }
1096
1097 Assert(bkey == NULL ||
1098 bkey->sk_attno == skipequalitykey->sk_attno);
1099
1100 if (!array->null_elem)
1102 else
1103 Assert(bkey == NULL && impliesNN == NULL);
1104 }
1105
1106 /*
1107 * If we didn't find a usable boundary key, see if we can
1108 * deduce a NOT NULL key
1109 */
1110 if (bkey == NULL && impliesNN != NULL &&
1111 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1114 {
1115 /* Final startKeys[] entry will be deduced NOT NULL key */
1116 bkey = &notnullkey;
1119 (impliesNN->sk_flags &
1121 curattr,
1124 InvalidOid,
1125 InvalidOid,
1126 InvalidOid,
1127 (Datum) 0);
1128 }
1129
1130 /*
1131 * If preprocessing didn't leave a usable boundary key, quit;
1132 * else save the boundary key pointer in startKeys[]
1133 */
1134 if (bkey == NULL)
1135 break;
1136 startKeys[keysz++] = bkey;
1137
1138 /*
1139 * We can only consider adding more boundary keys when the one
1140 * that we just chose to add uses either the = or >= strategy
1141 * (during backwards scans we can only do so when the key that
1142 * we just added to startKeys[] uses the = or <= strategy)
1143 */
1144 strat_total = bkey->sk_strategy;
1147 break;
1148
1149 /*
1150 * If the key that we just added to startKeys[] is a skip
1151 * array = key whose current element is marked NEXT or PRIOR,
1152 * make strat_total > or < (and stop adding boundary keys).
1153 * This can only happen with opclasses that lack skip support.
1154 */
1155 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1156 {
1157 Assert(bkey->sk_flags & SK_BT_SKIP);
1159
1160 if (ScanDirectionIsForward(dir))
1161 {
1162 Assert(!(bkey->sk_flags & SK_BT_PRIOR));
1164 }
1165 else
1166 {
1167 Assert(!(bkey->sk_flags & SK_BT_NEXT));
1169 }
1170
1171 /*
1172 * We're done. We'll never find an exact = match for a
1173 * NEXT or PRIOR sentinel sk_argument value. There's no
1174 * sense in trying to add more keys to startKeys[].
1175 */
1176 break;
1177 }
1178
1179 /*
1180 * Done if that was the last scan key output by preprocessing.
1181 * Also done if we've now examined all keys marked required.
1182 */
1183 if (i >= so->numberOfKeys ||
1184 !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1185 break;
1186
1187 /*
1188 * Reset for next attr.
1189 */
1190 Assert(cur->sk_attno == curattr + 1);
1191 curattr = cur->sk_attno;
1192 bkey = NULL;
1193 impliesNN = NULL;
1194 }
1195
1196 /*
1197 * If we've located the starting boundary key for curattr, we have
1198 * no interest in curattr's other required key
1199 */
1200 if (bkey != NULL)
1201 continue;
1202
1203 /*
1204 * Is this key the starting boundary key for curattr?
1205 *
1206 * If not, does it imply a NOT NULL constraint? (Because
1207 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1208 * *any* inequality key works for that; we need not test.)
1209 */
1210 switch (cur->sk_strategy)
1211 {
1214 if (ScanDirectionIsBackward(dir))
1215 bkey = cur;
1216 else if (impliesNN == NULL)
1217 impliesNN = cur;
1218 break;
1220 bkey = cur;
1221 break;
1224 if (ScanDirectionIsForward(dir))
1225 bkey = cur;
1226 else if (impliesNN == NULL)
1227 impliesNN = cur;
1228 break;
1229 }
1230 }
1231 }
1232
1233 /*
1234 * If we found no usable boundary keys, we have to start from one end of
1235 * the tree. Walk down that edge to the first or last key, and scan from
1236 * there.
1237 *
1238 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1239 */
1240 if (keysz == 0)
1241 return _bt_endpoint(scan, dir);
1242
1243 /*
1244 * We want to start the scan somewhere within the index. Set up an
1245 * insertion scankey we can use to search for the boundary point we
1246 * identified above. The insertion scankey is built using the keys
1247 * identified by startKeys[]. (Remaining insertion scankey fields are
1248 * initialized after initial-positioning scan keys are finalized.)
1249 */
1250 Assert(keysz <= INDEX_MAX_KEYS);
1251 for (int i = 0; i < keysz; i++)
1252 {
1254
1255 Assert(bkey->sk_attno == i + 1);
1256
1257 if (bkey->sk_flags & SK_ROW_HEADER)
1258 {
1259 /*
1260 * Row comparison header: look to the first row member instead
1261 */
1262 ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1263 bool loosen_strat = false,
1264 tighten_strat = false;
1265
1266 /*
1267 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1268 * would've marked the qual as unsatisfiable, preventing us from
1269 * ever getting this far
1270 */
1271 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1272 Assert(subkey->sk_attno == bkey->sk_attno);
1273 Assert(!(subkey->sk_flags & SK_ISNULL));
1274
1275 /*
1276 * This is either a > or >= key (during backwards scans it is
1277 * either < or <=) that was marked required during preprocessing.
1278 * Later so->keyData[] keys can't have been marked required, so
1279 * our row compare header key must be the final startKeys[] entry.
1280 */
1281 Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
1282 Assert(subkey->sk_strategy == bkey->sk_strategy);
1283 Assert(subkey->sk_strategy == strat_total);
1284 Assert(i == keysz - 1);
1285
1286 /*
1287 * The member scankeys are already in insertion format (ie, they
1288 * have sk_func = 3-way-comparison function)
1289 */
1290 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1291
1292 /*
1293 * Now look to later row compare members.
1294 *
1295 * If there's an "index attribute gap" between two row compare
1296 * members, the second member won't have been marked required, and
1297 * so can't be used as a starting boundary key here. The part of
1298 * the row comparison that we do still use has to be treated as a
1299 * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1300 * with an omitted intervening index attribute "b" will use an
1301 * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1302 * the leaf level might satisfy the row compare qual.
1303 *
1304 * We're able to use a _more_ restrictive strategy when we reach a
1305 * NULL row compare member, since they're always unsatisfiable.
1306 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1307 * insertion scan key "a > 1". All tuples where "a = 1" cannot
1308 * possibly satisfy the row compare qual, so this is safe.
1309 */
1310 Assert(!(subkey->sk_flags & SK_ROW_END));
1311 for (;;)
1312 {
1313 subkey++;
1314 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1315
1316 if (subkey->sk_flags & SK_ISNULL)
1317 {
1318 /*
1319 * NULL member key, can only use earlier keys.
1320 *
1321 * We deliberately avoid checking if this key is marked
1322 * required. All earlier keys are required, and this key
1323 * is unsatisfiable either way, so we can't miss anything.
1324 */
1325 tighten_strat = true;
1326 break;
1327 }
1328
1329 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1330 {
1331 /* nonrequired member key, can only use earlier keys */
1332 loosen_strat = true;
1333 break;
1334 }
1335
1336 Assert(subkey->sk_attno == keysz + 1);
1337 Assert(subkey->sk_strategy == bkey->sk_strategy);
1338 Assert(keysz < INDEX_MAX_KEYS);
1339
1340 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
1341 keysz++;
1342
1343 if (subkey->sk_flags & SK_ROW_END)
1344 break;
1345 }
1347 if (loosen_strat)
1348 {
1349 /* Use less restrictive strategy (and fewer member keys) */
1350 switch (strat_total)
1351 {
1354 break;
1357 break;
1358 }
1359 }
1360 if (tighten_strat)
1361 {
1362 /* Use more restrictive strategy (and fewer member keys) */
1363 switch (strat_total)
1364 {
1367 break;
1370 break;
1371 }
1372 }
1373
1374 /* Done (row compare header key is always last startKeys[] key) */
1375 break;
1376 }
1377
1378 /*
1379 * Ordinary comparison key/search-style key.
1380 *
1381 * Transform the search-style scan key to an insertion scan key by
1382 * replacing the sk_func with the appropriate btree 3-way-comparison
1383 * function.
1384 *
1385 * If scankey operator is not a cross-type comparison, we can use the
1386 * cached comparison function; otherwise gotta look it up in the
1387 * catalogs. (That can't lead to infinite recursion, since no
1388 * indexscan initiated by syscache lookup will use cross-data-type
1389 * operators.)
1390 *
1391 * We support the convention that sk_subtype == InvalidOid means the
1392 * opclass input type; this hack simplifies life for ScanKeyInit().
1393 */
1394 if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1395 bkey->sk_subtype == InvalidOid)
1396 {
1398
1399 procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1401 bkey->sk_flags,
1402 bkey->sk_attno,
1404 bkey->sk_subtype,
1405 bkey->sk_collation,
1406 procinfo,
1407 bkey->sk_argument);
1408 }
1409 else
1410 {
1411 RegProcedure cmp_proc;
1412
1413 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1414 rel->rd_opcintype[i],
1415 bkey->sk_subtype, BTORDER_PROC);
1416 if (!RegProcedureIsValid(cmp_proc))
1417 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1418 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1419 bkey->sk_attno, RelationGetRelationName(rel));
1421 bkey->sk_flags,
1422 bkey->sk_attno,
1424 bkey->sk_subtype,
1425 bkey->sk_collation,
1426 cmp_proc,
1427 bkey->sk_argument);
1428 }
1429 }
1430
1431 /*----------
1432 * Examine the selected initial-positioning strategy to determine exactly
1433 * where we need to start the scan, and set flag variables to control the
1434 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1435 * page _bt_search returns).
1436 *----------
1437 */
1438 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1439 inskey.anynullkeys = false; /* unused */
1440 inskey.scantid = NULL;
1441 inskey.keysz = keysz;
1442 switch (strat_total)
1443 {
1445
1446 inskey.nextkey = false;
1447 inskey.backward = true;
1448 break;
1449
1451
1452 inskey.nextkey = true;
1453 inskey.backward = true;
1454 break;
1455
1457
1458 /*
1459 * If a backward scan was specified, need to start with last equal
1460 * item not first one.
1461 */
1462 if (ScanDirectionIsBackward(dir))
1463 {
1464 /*
1465 * This is the same as the <= strategy
1466 */
1467 inskey.nextkey = true;
1468 inskey.backward = true;
1469 }
1470 else
1471 {
1472 /*
1473 * This is the same as the >= strategy
1474 */
1475 inskey.nextkey = false;
1476 inskey.backward = false;
1477 }
1478 break;
1479
1481
1482 /*
1483 * Find first item >= scankey
1484 */
1485 inskey.nextkey = false;
1486 inskey.backward = false;
1487 break;
1488
1490
1491 /*
1492 * Find first item > scankey
1493 */
1494 inskey.nextkey = true;
1495 inskey.backward = false;
1496 break;
1497
1498 default:
1499 /* can't get here, but keep compiler quiet */
1500 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1501 return false;
1502 }
1503
1504 /*
1505 * Use the manufactured insertion scan key to descend the tree and
1506 * position ourselves on the target leaf page.
1507 */
1508 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1509 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1510
1511 /* don't need to keep the stack around... */
1512 _bt_freestack(stack);
1513
1514 if (!BufferIsValid(so->currPos.buf))
1515 {
1516 Assert(!so->needPrimScan);
1517
1518 /*
1519 * We only get here if the index is completely empty. Lock relation
1520 * because nothing finer to lock exists. Without a buffer lock, it's
1521 * possible for another transaction to insert data between
1522 * _bt_search() and PredicateLockRelation(). We have to try again
1523 * after taking the relation-level predicate lock, to close a narrow
1524 * window where we wouldn't scan concurrently inserted tuples, but the
1525 * writer wouldn't see our predicate lock.
1526 */
1528 {
1530 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1531 _bt_freestack(stack);
1532 }
1533
1534 if (!BufferIsValid(so->currPos.buf))
1535 {
1536 _bt_parallel_done(scan);
1537 return false;
1538 }
1539 }
1540
1541 /* position to the precise item on the page */
1542 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1543
1544 /*
1545 * Now load data from the first page of the scan (usually the page
1546 * currently in so->currPos.buf).
1547 *
1548 * If inskey.nextkey = false and inskey.backward = false, offnum is
1549 * positioned at the first non-pivot tuple >= inskey.scankeys.
1550 *
1551 * If inskey.nextkey = false and inskey.backward = true, offnum is
1552 * positioned at the last non-pivot tuple < inskey.scankeys.
1553 *
1554 * If inskey.nextkey = true and inskey.backward = false, offnum is
1555 * positioned at the first non-pivot tuple > inskey.scankeys.
1556 *
1557 * If inskey.nextkey = true and inskey.backward = true, offnum is
1558 * positioned at the last non-pivot tuple <= inskey.scankeys.
1559 *
1560 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1561 * for the page. For example, when inskey is both < the leaf page's high
1562 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1563 */
1564 if (!_bt_readfirstpage(scan, offnum, dir))
1565 return false;
1566
1567 _bt_returnitem(scan, so);
1568 return true;
1569}
int16 AttrNumber
Definition attnum.h:21
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
#define RegProcedureIsValid(p)
Definition c.h:792
regproc RegProcedure
Definition c.h:664
struct cursor * cur
Definition ecpg.c:29
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition indexam.c:917
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition lsyscache.c:872
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition nbtpage.c:740
void _bt_preprocess_keys(IndexScanDesc scan)
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition nbtree.c:876
#define SK_BT_SKIP
Definition nbtree.h:1106
#define BTORDER_PROC
Definition nbtree.h:717
#define SK_BT_PRIOR
Definition nbtree.h:1112
#define SK_BT_NEXT
Definition nbtree.h:1111
#define P_NONE
Definition nbtree.h:213
#define SK_BT_REQBKWD
Definition nbtree.h:1105
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define BT_READ
Definition nbtree.h:730
#define SK_BT_REQFWD
Definition nbtree.h:1104
#define SK_BT_MINVAL
Definition nbtree.h:1109
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition nbtsearch.c:1840
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition nbtsearch.c:339
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:2178
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition nbtsearch.c:98
void _bt_freestack(BTStack stack)
Definition nbtutils.c:151
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition pgstat.h:705
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
#define InvalidOid
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition scankey.c:101
#define SK_ROW_HEADER
Definition skey.h:117
#define SK_ROW_MEMBER
Definition skey.h:118
#define SK_SEARCHNOTNULL
Definition skey.h:122
#define SK_ROW_END
Definition skey.h:119
ScanKeyData * ScanKey
Definition skey.h:75
uint16 StrategyNumber
Definition stratnum.h:22
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTEqualStrategyNumber
Definition stratnum.h:31
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32
ScanKey high_compare
Definition nbtree.h:1050
ScanKey low_compare
Definition nbtree.h:1049
ItemPointer scantid
Definition nbtree.h:802
bool allequalimage
Definition nbtree.h:798
bool heapkeyspace
Definition nbtree.h:797
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition nbtree.h:804
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:192
struct IndexScanInstrumentation * instrument
Definition relscan.h:160
int sk_flags
Definition skey.h:66
#define IsolationIsSerializable()
Definition xact.h:53

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), BTScanInsertData::allequalimage, BTScanInsertData::anynullkeys, Assert, BTScanInsertData::backward, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BufferIsValid(), cur, DatumGetPointer(), elog, ERROR, fb(), get_opfamily_proc(), BTScanInsertData::heapkeyspace, BTArrayKeyInfo::high_compare, i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, IndexScanDescData::instrument, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanInsertData::keysz, BTArrayKeyInfo::low_compare, BTScanInsertData::nextkey, IndexScanInstrumentation::nsearches, BTArrayKeyInfo::null_elem, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), BTScanInsertData::scankeys, BTScanInsertData::scantid, SK_BT_DESC, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_NEXT, SK_BT_NULLS_FIRST, SK_BT_PRIOR, SK_BT_REQBKWD, SK_BT_REQFWD, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2092 of file nbtsearch.c.

2093{
2094 Buffer buf;
2095 Page page;
2096 BTPageOpaque opaque;
2097 OffsetNumber offnum;
2098 BlockNumber blkno;
2099 IndexTuple itup;
2100
2101 /*
2102 * If we are looking for a leaf page, okay to descend from fast root;
2103 * otherwise better descend from true root. (There is no point in being
2104 * smarter about intermediate levels.)
2105 */
2106 if (level == 0)
2107 buf = _bt_getroot(rel, NULL, BT_READ);
2108 else
2109 buf = _bt_gettrueroot(rel);
2110
2111 if (!BufferIsValid(buf))
2112 return InvalidBuffer;
2113
2114 page = BufferGetPage(buf);
2115 opaque = BTPageGetOpaque(page);
2116
2117 for (;;)
2118 {
2119 /*
2120 * If we landed on a deleted page, step right to find a live page
2121 * (there must be one). Also, if we want the rightmost page, step
2122 * right if needed to get to it (this could happen if the page split
2123 * since we obtained a pointer to it).
2124 */
2125 while (P_IGNORE(opaque) ||
2126 (rightmost && !P_RIGHTMOST(opaque)))
2127 {
2128 blkno = opaque->btpo_next;
2129 if (blkno == P_NONE)
2130 elog(ERROR, "fell off the end of index \"%s\"",
2132 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2133 page = BufferGetPage(buf);
2134 opaque = BTPageGetOpaque(page);
2135 }
2136
2137 /* Done? */
2138 if (opaque->btpo_level == level)
2139 break;
2140 if (opaque->btpo_level < level)
2141 ereport(ERROR,
2143 errmsg_internal("btree level %u not found in index \"%s\"",
2144 level, RelationGetRelationName(rel))));
2145
2146 /* Descend to leftmost or rightmost child page */
2147 if (rightmost)
2148 offnum = PageGetMaxOffsetNumber(page);
2149 else
2150 offnum = P_FIRSTDATAKEY(opaque);
2151
2153 elog(PANIC, "offnum out of range");
2154
2155 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2156 blkno = BTreeTupleGetDownLink(itup);
2157
2158 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2159 page = BufferGetPage(buf);
2160 opaque = BTPageGetOpaque(page);
2161 }
2162
2163 return buf;
2164}
int Buffer
Definition buf.h:23
#define PANIC
Definition elog.h:42
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition nbtpage.c:1004
Buffer _bt_gettrueroot(Relation rel)
Definition nbtpage.c:581
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition nbtpage.c:345
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition nbtree.h:557
#define P_IGNORE(opaque)
Definition nbtree.h:226
BlockNumber btpo_next
Definition nbtree.h:66
uint32 btpo_level
Definition nbtree.h:67

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, fb(), InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PANIC, and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_lock_and_validate_left()

static Buffer _bt_lock_and_validate_left ( Relation  rel,
BlockNumber blkno,
BlockNumber  lastcurrblkno 
)
static

Definition at line 1975 of file nbtsearch.c.

1977{
1978 BlockNumber origblkno = *blkno; /* detects circular links */
1979
1980 for (;;)
1981 {
1982 Buffer buf;
1983 Page page;
1984 BTPageOpaque opaque;
1985 int tries;
1986
1987 /* check for interrupts while we're not holding any buffer lock */
1989 buf = _bt_getbuf(rel, *blkno, BT_READ);
1990 page = BufferGetPage(buf);
1991 opaque = BTPageGetOpaque(page);
1992
1993 /*
1994 * If this isn't the page we want, walk right till we find what we
1995 * want --- but go no more than four hops (an arbitrary limit). If we
1996 * don't find the correct page by then, the most likely bet is that
1997 * lastcurrblkno got deleted and isn't in the sibling chain at all
1998 * anymore, not that its left sibling got split more than four times.
1999 *
2000 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2001 * because half-dead pages are still in the sibling chain.
2002 */
2003 tries = 0;
2004 for (;;)
2005 {
2006 if (likely(!P_ISDELETED(opaque) &&
2007 opaque->btpo_next == lastcurrblkno))
2008 {
2009 /* Found desired page, return it */
2010 return buf;
2011 }
2012 if (P_RIGHTMOST(opaque) || ++tries > 4)
2013 break;
2014 /* step right */
2015 *blkno = opaque->btpo_next;
2016 buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2017 page = BufferGetPage(buf);
2018 opaque = BTPageGetOpaque(page);
2019 }
2020
2021 /*
2022 * Return to the original page (usually the page most recently read by
2023 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2024 * what's up with its prev sibling link
2025 */
2027 page = BufferGetPage(buf);
2028 opaque = BTPageGetOpaque(page);
2029 if (P_ISDELETED(opaque))
2030 {
2031 /*
2032 * It was deleted. Move right to first nondeleted page (there
2033 * must be one); that is the page that has acquired the deleted
2034 * one's keyspace, so stepping left from it will take us where we
2035 * want to be.
2036 */
2037 for (;;)
2038 {
2039 if (P_RIGHTMOST(opaque))
2040 elog(ERROR, "fell off the end of index \"%s\"",
2042 lastcurrblkno = opaque->btpo_next;
2044 page = BufferGetPage(buf);
2045 opaque = BTPageGetOpaque(page);
2046 if (!P_ISDELETED(opaque))
2047 break;
2048 }
2049 }
2050 else
2051 {
2052 /*
2053 * Original lastcurrblkno wasn't deleted; the explanation had
2054 * better be that the page to the left got split or deleted.
2055 * Without this check, we risk going into an infinite loop.
2056 */
2057 if (opaque->btpo_prev == origblkno)
2058 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2060 /* Okay to try again, since left sibling link changed */
2061 }
2062
2063 /*
2064 * Original lastcurrblkno from caller was concurrently deleted (could
2065 * also have been a great many concurrent left sibling page splits).
2066 * Found a non-deleted page that should now act as our lastcurrblkno.
2067 */
2068 if (P_LEFTMOST(opaque))
2069 {
2070 /* New lastcurrblkno has no left sibling (concurrently deleted) */
2071 _bt_relbuf(rel, buf);
2072 break;
2073 }
2074
2075 /* Start from scratch with new lastcurrblkno's blkno/prev link */
2076 *blkno = origblkno = opaque->btpo_prev;
2077 _bt_relbuf(rel, buf);
2078 }
2079
2080 return InvalidBuffer;
2081}
#define likely(x)
Definition c.h:411
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition nbtpage.c:846
#define P_LEFTMOST(opaque)
Definition nbtree.h:219
#define P_ISDELETED(opaque)
Definition nbtree.h:223
BlockNumber btpo_prev
Definition nbtree.h:65

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetPage(), CHECK_FOR_INTERRUPTS, elog, ERROR, fb(), InvalidBuffer, likely, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().

◆ _bt_moveright()

static Buffer _bt_moveright ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access 
)
static

Definition at line 237 of file nbtsearch.c.

244{
245 Page page;
246 BTPageOpaque opaque;
248
249 Assert(!forupdate || heaprel != NULL);
250
251 /*
252 * When nextkey = false (normal case): if the scan key that brought us to
253 * this page is > the high key stored on the page, then the page has split
254 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
255 * have some duplicates to the right as well as the left, but that's
256 * something that's only ever dealt with on the leaf level, after
257 * _bt_search has found an initial leaf page.)
258 *
259 * When nextkey = true: move right if the scan key is >= page's high key.
260 * (Note that key.scantid cannot be set in this case.)
261 *
262 * The page could even have split more than once, so scan as far as
263 * needed.
264 *
265 * We also have to move right if we followed a link that brought us to a
266 * dead page.
267 */
268 cmpval = key->nextkey ? 0 : 1;
269
270 for (;;)
271 {
272 page = BufferGetPage(buf);
273 opaque = BTPageGetOpaque(page);
274
275 if (P_RIGHTMOST(opaque))
276 break;
277
278 /*
279 * Finish any incomplete splits we encounter along the way.
280 */
281 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
282 {
284
285 /* upgrade our lock if necessary */
286 if (access == BT_READ)
287 {
288 _bt_unlockbuf(rel, buf);
289 _bt_lockbuf(rel, buf, BT_WRITE);
290 }
291
292 if (P_INCOMPLETE_SPLIT(opaque))
293 _bt_finish_split(rel, heaprel, buf, stack);
294 else
295 _bt_relbuf(rel, buf);
296
297 /* re-acquire the lock in the right mode, and re-check */
298 buf = _bt_getbuf(rel, blkno, access);
299 continue;
300 }
301
302 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
303 {
304 /* step right one page */
305 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
306 continue;
307 }
308 else
309 break;
310 }
311
312 if (P_IGNORE(opaque))
313 elog(ERROR, "fell off the end of index \"%s\"",
315
316 return buf;
317}
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition nbtinsert.c:2256
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1040
#define P_HIKEY
Definition nbtree.h:368
#define P_INCOMPLETE_SPLIT(opaque)
Definition nbtree.h:228
#define BT_WRITE
Definition nbtree.h:731
short access

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog, ERROR, fb(), P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1586 of file nbtsearch.c.

1587{
1589
1590 Assert(BTScanPosIsValid(so->currPos));
1591
1592 /*
1593 * Advance to next tuple on current page; or if there's no more, try to
1594 * step to the next page with data.
1595 */
1596 if (ScanDirectionIsForward(dir))
1597 {
1598 if (++so->currPos.itemIndex > so->currPos.lastItem)
1599 {
1600 if (!_bt_steppage(scan, dir))
1601 return false;
1602 }
1603 }
1604 else
1605 {
1606 if (--so->currPos.itemIndex < so->currPos.firstItem)
1607 {
1608 if (!_bt_steppage(scan, dir))
1609 return false;
1610 }
1611 }
1612
1613 _bt_returnitem(scan, so);
1614 return true;
1615}
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1647

References _bt_returnitem(), _bt_steppage(), Assert, BTScanPosIsValid, fb(), IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_readfirstpage()

static bool _bt_readfirstpage ( IndexScanDesc  scan,
OffsetNumber  offnum,
ScanDirection  dir 
)
static

Definition at line 1747 of file nbtsearch.c.

1748{
1750
1751 so->numKilled = 0; /* just paranoia */
1752 so->markItemIndex = -1; /* ditto */
1753
1754 /* Initialize so->currPos for the first page (page in so->currPos.buf) */
1755 if (so->needPrimScan)
1756 {
1757 Assert(so->numArrayKeys);
1758
1759 so->currPos.moreLeft = true;
1760 so->currPos.moreRight = true;
1761 so->needPrimScan = false;
1762 }
1763 else if (ScanDirectionIsForward(dir))
1764 {
1765 so->currPos.moreLeft = false;
1766 so->currPos.moreRight = true;
1767 }
1768 else
1769 {
1770 so->currPos.moreLeft = true;
1771 so->currPos.moreRight = false;
1772 }
1773
1774 /*
1775 * Attempt to load matching tuples from the first page.
1776 *
1777 * Note that _bt_readpage will finish initializing the so->currPos fields.
1778 * _bt_readpage also releases parallel scan (even when it returns false).
1779 */
1780 if (_bt_readpage(scan, dir, offnum, true))
1781 {
1782 Relation rel = scan->indexRelation;
1783
1784 /*
1785 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1786 * so->currPos.buf in preparation for btgettuple returning tuples.
1787 */
1788 Assert(BTScanPosIsPinned(so->currPos));
1790 return true;
1791 }
1792
1793 /* There's no actually-matching data on the page in so->currPos.buf */
1794 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1795
1796 /* Call _bt_readnextpage using its _bt_steppage wrapper function */
1797 if (!_bt_steppage(scan, dir))
1798 return false;
1799
1800 /* _bt_readpage for a later page (now in so->currPos) succeeded */
1801 return true;
1802}
bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004
static void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
Definition nbtsearch.c:55

References _bt_drop_lock_and_maybe_pin(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert, BTScanPosIsPinned, fb(), IndexScanDescData::indexRelation, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), and _bt_first().

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
BlockNumber  lastcurrblkno,
ScanDirection  dir,
bool  seized 
)
static

Definition at line 1840 of file nbtsearch.c.

1842{
1843 Relation rel = scan->indexRelation;
1845
1846 Assert(so->currPos.currPage == lastcurrblkno || seized);
1847 Assert(!(blkno == P_NONE && seized));
1848 Assert(!BTScanPosIsPinned(so->currPos));
1849
1850 /*
1851 * Remember that the scan already read lastcurrblkno, a page to the left
1852 * of blkno (or remember reading a page to the right, for backwards scans)
1853 */
1854 if (ScanDirectionIsForward(dir))
1855 so->currPos.moreLeft = true;
1856 else
1857 so->currPos.moreRight = true;
1858
1859 for (;;)
1860 {
1861 Page page;
1862 BTPageOpaque opaque;
1863
1864 if (blkno == P_NONE ||
1866 !so->currPos.moreRight : !so->currPos.moreLeft))
1867 {
1868 /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
1869 Assert(so->currPos.currPage == lastcurrblkno && !seized);
1870 BTScanPosInvalidate(so->currPos);
1871 _bt_parallel_done(scan); /* iff !so->needPrimScan */
1872 return false;
1873 }
1874
1875 Assert(!so->needPrimScan);
1876
1877 /* parallel scan must never actually visit so->currPos blkno */
1878 if (!seized && scan->parallel_scan != NULL &&
1879 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
1880 {
1881 /* whole scan is now done (or another primitive scan required) */
1882 BTScanPosInvalidate(so->currPos);
1883 return false;
1884 }
1885
1886 if (ScanDirectionIsForward(dir))
1887 {
1888 /* read blkno, but check for interrupts first */
1890 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1891 }
1892 else
1893 {
1894 /* read blkno, avoiding race (also checks for interrupts) */
1895 so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
1897 if (so->currPos.buf == InvalidBuffer)
1898 {
1899 /* must have been a concurrent deletion of leftmost page */
1900 BTScanPosInvalidate(so->currPos);
1901 _bt_parallel_done(scan);
1902 return false;
1903 }
1904 }
1905
1906 page = BufferGetPage(so->currPos.buf);
1907 opaque = BTPageGetOpaque(page);
1908 lastcurrblkno = blkno;
1909 if (likely(!P_IGNORE(opaque)))
1910 {
1911 /* see if there are any matches on this page */
1912 if (ScanDirectionIsForward(dir))
1913 {
1914 /* note that this will clear moreRight if we can stop */
1915 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
1916 break;
1917 blkno = so->currPos.nextPage;
1918 }
1919 else
1920 {
1921 /* note that this will clear moreLeft if we can stop */
1922 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
1923 break;
1924 blkno = so->currPos.prevPage;
1925 }
1926 }
1927 else
1928 {
1929 /* _bt_readpage not called, so do all this for ourselves */
1930 if (ScanDirectionIsForward(dir))
1931 blkno = opaque->btpo_next;
1932 else
1933 blkno = opaque->btpo_prev;
1934 if (scan->parallel_scan != NULL)
1935 _bt_parallel_release(scan, blkno, lastcurrblkno);
1936 }
1937
1938 /* no matching tuples on this page */
1939 _bt_relbuf(rel, so->currPos.buf);
1940 seized = false; /* released by _bt_readpage (or by us) */
1941 }
1942
1943 /*
1944 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1945 * so->currPos.buf in preparation for btgettuple returning tuples.
1946 */
1947 Assert(so->currPos.currPage == blkno);
1948 Assert(BTScanPosIsPinned(so->currPos));
1950
1951 return true;
1952}
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition nbtree.c:1014
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition nbtsearch.c:1975

References _bt_drop_lock_and_maybe_pin(), _bt_getbuf(), _bt_lock_and_validate_left(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), Assert, BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTScanPosInvalidate, BTScanPosIsPinned, BufferGetPage(), CHECK_FOR_INTERRUPTS, fb(), IndexScanDescData::indexRelation, InvalidBuffer, likely, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_first(), and _bt_steppage().

◆ _bt_returnitem()

static void _bt_returnitem ( IndexScanDesc  scan,
BTScanOpaque  so 
)
inlinestatic

Definition at line 1622 of file nbtsearch.c.

1623{
1624 BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1625
1626 /* Most recent _bt_readpage must have succeeded */
1627 Assert(BTScanPosIsValid(so->currPos));
1628 Assert(so->currPos.itemIndex >= so->currPos.firstItem);
1629 Assert(so->currPos.itemIndex <= so->currPos.lastItem);
1630
1631 /* Return next item, per amgettuple contract */
1632 scan->xs_heaptid = currItem->heapTid;
1633 if (so->currTuples)
1634 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1635}
IndexTuple xs_itup
Definition relscan.h:168
ItemPointerData xs_heaptid
Definition relscan.h:173

References Assert, BTScanPosIsValid, fb(), IndexScanDescData::xs_heaptid, and IndexScanDescData::xs_itup.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 98 of file nbtsearch.c.

100{
102 int page_access = BT_READ;
103
104 /* heaprel must be set whenever _bt_allocbuf is reachable */
106 Assert(access == BT_READ || heaprel != NULL);
107
108 /* Get the root page to start with */
109 *bufP = _bt_getroot(rel, heaprel, access);
110
111 /* If index is empty and access = BT_READ, no root page is created. */
112 if (!BufferIsValid(*bufP))
113 return (BTStack) NULL;
114
115 /* Loop iterates once per level descended in the tree */
116 for (;;)
117 {
118 Page page;
119 BTPageOpaque opaque;
120 OffsetNumber offnum;
121 ItemId itemid;
122 IndexTuple itup;
123 BlockNumber child;
125
126 /*
127 * Race -- the page we just grabbed may have split since we read its
128 * downlink in its parent page (or the metapage). If it has, we may
129 * need to move right to its new sibling. Do that.
130 *
131 * In write-mode, allow _bt_moveright to finish any incomplete splits
132 * along the way. Strictly speaking, we'd only need to finish an
133 * incomplete split on the leaf page we're about to insert to, not on
134 * any of the upper levels (internal pages with incomplete splits are
135 * also taken care of in _bt_getstackbuf). But this is a good
136 * opportunity to finish splits of internal pages too.
137 */
138 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
140
141 /* if this is a leaf page, we're done */
142 page = BufferGetPage(*bufP);
143 opaque = BTPageGetOpaque(page);
144 if (P_ISLEAF(opaque))
145 break;
146
147 /*
148 * Find the appropriate pivot tuple on this page. Its downlink points
149 * to the child page that we're about to descend to.
150 */
151 offnum = _bt_binsrch(rel, key, *bufP);
152 itemid = PageGetItemId(page, offnum);
153 itup = (IndexTuple) PageGetItem(page, itemid);
154 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
155 child = BTreeTupleGetDownLink(itup);
156
157 /*
158 * We need to save the location of the pivot tuple we chose in a new
159 * stack entry for this page/level. If caller ends up splitting a
160 * page one level down, it usually ends up inserting a new pivot
161 * tuple/downlink immediately after the location recorded here.
162 */
164 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
165 new_stack->bts_offset = offnum;
166 new_stack->bts_parent = stack_in;
167
168 /*
169 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
170 * we're on the level 1 and asked to lock leaf page in write mode,
171 * then lock next page in write mode, because it must be a leaf.
172 */
173 if (opaque->btpo_level == 1 && access == BT_WRITE)
175
176 /* drop the read lock on the page, then acquire one on its child */
177 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
178
179 /* okay, all set to move down a level */
181 }
182
183 /*
184 * If we're asked to lock leaf in write mode, but didn't manage to, then
185 * relock. This should only happen when the root page is a leaf page (and
186 * the only page in the index other than the metapage).
187 */
188 if (access == BT_WRITE && page_access == BT_READ)
189 {
190 /* trade in our read lock for a write lock */
191 _bt_unlockbuf(rel, *bufP);
192 _bt_lockbuf(rel, *bufP, BT_WRITE);
193
194 /*
195 * Race -- the leaf page may have split after we dropped the read lock
196 * but before we acquired a write lock. If it has, we may need to
197 * move right to its new sibling. Do that.
198 */
199 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
200 }
201
202 return stack_in;
203}
#define palloc_object(type)
Definition fe_memutils.h:74
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
BTStackData * BTStack
Definition nbtree.h:750
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition nbtsearch.c:237

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), fb(), P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc_object.

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 1647 of file nbtsearch.c.

1648{
1650 BlockNumber blkno,
1652
1653 Assert(BTScanPosIsValid(so->currPos));
1654
1655 /* Before leaving current page, deal with any killed items */
1656 if (so->numKilled > 0)
1657 _bt_killitems(scan);
1658
1659 /*
1660 * Before we modify currPos, make a copy of the page data if there was a
1661 * mark position that needs it.
1662 */
1663 if (so->markItemIndex >= 0)
1664 {
1665 /* bump pin on current buffer for assignment to mark buffer */
1666 if (BTScanPosIsPinned(so->currPos))
1667 IncrBufferRefCount(so->currPos.buf);
1668 memcpy(&so->markPos, &so->currPos,
1670 so->currPos.lastItem * sizeof(BTScanPosItem));
1671 if (so->markTuples)
1672 memcpy(so->markTuples, so->currTuples,
1673 so->currPos.nextTupleOffset);
1674 so->markPos.itemIndex = so->markItemIndex;
1675 so->markItemIndex = -1;
1676
1677 /*
1678 * If we're just about to start the next primitive index scan
1679 * (possible with a scan that has arrays keys, and needs to skip to
1680 * continue in the current scan direction), moreLeft/moreRight only
1681 * indicate the end of the current primitive index scan. They must
1682 * never be taken to indicate that the top-level index scan has ended
1683 * (that would be wrong).
1684 *
1685 * We could handle this case by treating the current array keys as
1686 * markPos state. But depending on the current array state like this
1687 * would add complexity. Instead, we just unset markPos's copy of
1688 * moreRight or moreLeft (whichever might be affected), while making
1689 * btrestrpos reset the scan's arrays to their initial scan positions.
1690 * In effect, btrestrpos leaves advancing the arrays up to the first
1691 * _bt_readpage call (that takes place after it has restored markPos).
1692 */
1693 if (so->needPrimScan)
1694 {
1695 if (ScanDirectionIsForward(so->currPos.dir))
1696 so->markPos.moreRight = true;
1697 else
1698 so->markPos.moreLeft = true;
1699 }
1700
1701 /* mark/restore not supported by parallel scans */
1702 Assert(!scan->parallel_scan);
1703 }
1704
1705 BTScanPosUnpinIfPinned(so->currPos);
1706
1707 /* Walk to the next page with data */
1708 if (ScanDirectionIsForward(dir))
1709 blkno = so->currPos.nextPage;
1710 else
1711 blkno = so->currPos.prevPage;
1712 lastcurrblkno = so->currPos.currPage;
1713
1714 /*
1715 * Cancel primitive index scans that were scheduled when the call to
1716 * _bt_readpage for currPos happened to use the opposite direction to the
1717 * one that we're stepping in now. (It's okay to leave the scan's array
1718 * keys as-is, since the next _bt_readpage will advance them.)
1719 */
1720 if (so->currPos.dir != dir)
1721 so->needPrimScan = false;
1722
1723 return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
1724}
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5533
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:205
static ItemArray items

References _bt_killitems(), _bt_readnextpage(), Assert, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), IncrBufferRefCount(), items, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ScanDirectionIsForward.

Referenced by _bt_next(), and _bt_readfirstpage().