PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
nbtsearch.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
Include dependency graph for nbtsearch.c:

Go to the source code of this file.

Functions

static void _bt_drop_lock_and_maybe_pin (IndexScanDesc scan, BTScanPos sp)
 
static Buffer _bt_moveright (Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
 
static OffsetNumber _bt_binsrch (Relation rel, BTScanInsert key, Buffer buf)
 
static int _bt_binsrch_posting (BTScanInsert key, Page page, OffsetNumber offnum)
 
static bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
 
static void _bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
 
static int _bt_setuppostingitems (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
 
static void _bt_savepostingitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
 
static void _bt_returnitem (IndexScanDesc scan, BTScanOpaque so)
 
static bool _bt_steppage (IndexScanDesc scan, ScanDirection dir)
 
static bool _bt_readfirstpage (IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 
static bool _bt_readnextpage (IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
 
static Buffer _bt_lock_and_validate_left (Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
 
static bool _bt_endpoint (IndexScanDesc scan, ScanDirection dir)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 

Function Documentation

◆ _bt_binsrch()

static OffsetNumber _bt_binsrch ( Relation  rel,
BTScanInsert  key,
Buffer  buf 
)
static

Definition at line 343 of file nbtsearch.c.

346{
347 Page page;
348 BTPageOpaque opaque;
349 OffsetNumber low,
350 high;
351 int32 result,
352 cmpval;
353
354 page = BufferGetPage(buf);
355 opaque = BTPageGetOpaque(page);
356
357 /* Requesting nextkey semantics while using scantid seems nonsensical */
358 Assert(!key->nextkey || key->scantid == NULL);
359 /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
360 Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
361
362 low = P_FIRSTDATAKEY(opaque);
363 high = PageGetMaxOffsetNumber(page);
364
365 /*
366 * If there are no keys on the page, return the first available slot. Note
367 * this covers two cases: the page is really empty (no keys), or it
368 * contains only a high key. The latter case is possible after vacuuming.
369 * This can never happen on an internal page, however, since they are
370 * never empty (an internal page must have at least one child).
371 */
372 if (unlikely(high < low))
373 return low;
374
375 /*
376 * Binary search to find the first key on the page >= scan key, or first
377 * key > scankey when nextkey is true.
378 *
379 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
380 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
381 *
382 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
383 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
384 *
385 * We can fall out when high == low.
386 */
387 high++; /* establish the loop invariant for high */
388
389 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
390
391 while (high > low)
392 {
393 OffsetNumber mid = low + ((high - low) / 2);
394
395 /* We have low <= mid < high, so mid points at a real slot */
396
397 result = _bt_compare(rel, key, page, mid);
398
399 if (result >= cmpval)
400 low = mid + 1;
401 else
402 high = mid;
403 }
404
405 /*
406 * At this point we have high == low.
407 *
408 * On a leaf page we always return the first non-pivot tuple >= scan key
409 * (resp. > scan key) for forward scan callers. For backward scans, it's
410 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
411 */
412 if (P_ISLEAF(opaque))
413 {
414 /*
415 * In the backward scan case we're supposed to locate the last
416 * matching tuple on the leaf level -- not the first matching tuple
417 * (the last tuple will be the first one returned by the scan).
418 *
419 * At this point we've located the first non-pivot tuple immediately
420 * after the last matching tuple (which might just be maxoff + 1).
421 * Compensate by stepping back.
422 */
423 if (key->backward)
424 return OffsetNumberPrev(low);
425
426 return low;
427 }
428
429 /*
430 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
431 * There must be one if _bt_compare() is playing by the rules.
432 *
433 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
434 * a truncated -inf heap TID is usually enough to prevent it altogether.
435 * Even omitted scan key entries are treated as > truncated attributes.
436 *
437 * However, during backward scans _bt_compare() interprets omitted scan
438 * key attributes as == corresponding truncated -inf attributes instead.
439 * This works just like < would work here. Under this scheme, < strategy
440 * backward scans will always directly descend to the correct leaf page.
441 * In particular, they will never incur an "extra" leaf page access with a
442 * scan key that happens to contain the same prefix of values as some
443 * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
444 * it uses a leaf page high key to "re-find" a page undergoing deletion.
445 */
446 Assert(low > P_FIRSTDATAKEY(opaque));
447
448 return OffsetNumberPrev(low);
449}
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
PageData * Page
Definition: bufpage.h:82
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
int32_t int32
Definition: c.h:498
#define unlikely(x)
Definition: c.h:347
Assert(PointerIsAligned(start, uint64))
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:72

References _bt_compare(), Assert(), BTPageGetOpaque, buf, BufferGetPage(), sort-test::key, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), and unlikely.

Referenced by _bt_first(), and _bt_search().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 474 of file nbtsearch.c.

475{
476 BTScanInsert key = insertstate->itup_key;
477 Page page;
478 BTPageOpaque opaque;
479 OffsetNumber low,
480 high,
481 stricthigh;
482 int32 result,
483 cmpval;
484
485 page = BufferGetPage(insertstate->buf);
486 opaque = BTPageGetOpaque(page);
487
488 Assert(P_ISLEAF(opaque));
489 Assert(!key->nextkey);
490 Assert(insertstate->postingoff == 0);
491
492 if (!insertstate->bounds_valid)
493 {
494 /* Start new binary search */
495 low = P_FIRSTDATAKEY(opaque);
496 high = PageGetMaxOffsetNumber(page);
497 }
498 else
499 {
500 /* Restore result of previous binary search against same page */
501 low = insertstate->low;
502 high = insertstate->stricthigh;
503 }
504
505 /* If there are no keys on the page, return the first available slot */
506 if (unlikely(high < low))
507 {
508 /* Caller can't reuse bounds */
509 insertstate->low = InvalidOffsetNumber;
510 insertstate->stricthigh = InvalidOffsetNumber;
511 insertstate->bounds_valid = false;
512 return low;
513 }
514
515 /*
516 * Binary search to find the first key on the page >= scan key. (nextkey
517 * is always false when inserting).
518 *
519 * The loop invariant is: all slots before 'low' are < scan key, all slots
520 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
521 * maintained to save additional search effort for caller.
522 *
523 * We can fall out when high == low.
524 */
525 if (!insertstate->bounds_valid)
526 high++; /* establish the loop invariant for high */
527 stricthigh = high; /* high initially strictly higher */
528
529 cmpval = 1; /* !nextkey comparison value */
530
531 while (high > low)
532 {
533 OffsetNumber mid = low + ((high - low) / 2);
534
535 /* We have low <= mid < high, so mid points at a real slot */
536
537 result = _bt_compare(rel, key, page, mid);
538
539 if (result >= cmpval)
540 low = mid + 1;
541 else
542 {
543 high = mid;
544 if (result != 0)
545 stricthigh = high;
546 }
547
548 /*
549 * If tuple at offset located by binary search is a posting list whose
550 * TID range overlaps with caller's scantid, perform posting list
551 * binary search to set postingoff for caller. Caller must split the
552 * posting list when postingoff is set. This should happen
553 * infrequently.
554 */
555 if (unlikely(result == 0 && key->scantid != NULL))
556 {
557 /*
558 * postingoff should never be set more than once per leaf page
559 * binary search. That would mean that there are duplicate table
560 * TIDs in the index, which is never okay. Check for that here.
561 */
562 if (insertstate->postingoff != 0)
564 (errcode(ERRCODE_INDEX_CORRUPTED),
565 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
568 low, stricthigh,
569 BufferGetBlockNumber(insertstate->buf),
571
572 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
573 }
574 }
575
576 /*
577 * On a leaf page, a binary search always returns the first key >= scan
578 * key (at least in !nextkey case), which could be the last slot + 1. This
579 * is also the lower bound of cached search.
580 *
581 * stricthigh may also be the last slot + 1, which prevents caller from
582 * using bounds directly, but is still useful to us if we're called a
583 * second time with cached bounds (cached low will be < stricthigh when
584 * that happens).
585 */
586 insertstate->low = low;
587 insertstate->stricthigh = stricthigh;
588 insertstate->bounds_valid = true;
589
590 return low;
591}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4231
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errcode(int sqlerrcode)
Definition: elog.c:854
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:602
#define InvalidOffsetNumber
Definition: off.h:26
#define RelationGetRelationName(relation)
Definition: rel.h:550
OffsetNumber stricthigh
Definition: nbtree.h:836
bool bounds_valid
Definition: nbtree.h:834
OffsetNumber low
Definition: nbtree.h:835
BTScanInsert itup_key
Definition: nbtree.h:824

References _bt_binsrch_posting(), _bt_compare(), Assert(), BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_binsrch_posting()

static int _bt_binsrch_posting ( BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 602 of file nbtsearch.c.

603{
604 IndexTuple itup;
605 ItemId itemid;
606 int low,
607 high,
608 mid,
609 res;
610
611 /*
612 * If this isn't a posting tuple, then the index must be corrupt (if it is
613 * an ordinary non-pivot tuple then there must be an existing tuple with a
614 * heap TID that equals inserter's new heap TID/scantid). Defensively
615 * check that tuple is a posting list tuple whose posting list range
616 * includes caller's scantid.
617 *
618 * (This is also needed because contrib/amcheck's rootdescend option needs
619 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
620 */
621 itemid = PageGetItemId(page, offnum);
622 itup = (IndexTuple) PageGetItem(page, itemid);
623 if (!BTreeTupleIsPosting(itup))
624 return 0;
625
626 Assert(key->heapkeyspace && key->allequalimage);
627
628 /*
629 * In the event that posting list tuple has LP_DEAD bit set, indicate this
630 * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
631 * second call to _bt_binsrch_insert() can take place when its caller has
632 * removed the dead item.
633 */
634 if (ItemIdIsDead(itemid))
635 return -1;
636
637 /* "high" is past end of posting list for loop invariant */
638 low = 0;
639 high = BTreeTupleGetNPosting(itup);
640 Assert(high >= 2);
641
642 while (high > low)
643 {
644 mid = low + ((high - low) / 2);
645 res = ItemPointerCompare(key->scantid,
646 BTreeTupleGetPostingN(itup, mid));
647
648 if (res > 0)
649 low = mid + 1;
650 else if (res < 0)
651 high = mid;
652 else
653 return mid;
654 }
655
656 /* Exact match not found */
657 return low;
658}
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), ItemIdIsDead, ItemPointerCompare(), sort-test::key, PageGetItem(), and PageGetItemId().

Referenced by _bt_binsrch_insert().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 688 of file nbtsearch.c.

692{
693 TupleDesc itupdesc = RelationGetDescr(rel);
694 BTPageOpaque opaque = BTPageGetOpaque(page);
695 IndexTuple itup;
696 ItemPointer heapTid;
697 ScanKey scankey;
698 int ncmpkey;
699 int ntupatts;
700 int32 result;
701
702 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
704 Assert(key->heapkeyspace || key->scantid == NULL);
705
706 /*
707 * Force result ">" if target item is first data item on an internal page
708 * --- see NOTE above.
709 */
710 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
711 return 1;
712
713 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
714 ntupatts = BTreeTupleGetNAtts(itup, rel);
715
716 /*
717 * The scan key is set up with the attribute number associated with each
718 * term in the key. It is important that, if the index is multi-key, the
719 * scan contain the first k key attributes, and that they be in order. If
720 * you think about how multi-key ordering works, you'll understand why
721 * this is.
722 *
723 * We don't test for violation of this condition here, however. The
724 * initial setup for the index scan had better have gotten it right (see
725 * _bt_first).
726 */
727
728 ncmpkey = Min(ntupatts, key->keysz);
729 Assert(key->heapkeyspace || ncmpkey == key->keysz);
730 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
731 scankey = key->scankeys;
732 for (int i = 1; i <= ncmpkey; i++)
733 {
734 Datum datum;
735 bool isNull;
736
737 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
738
739 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
740 {
741 if (isNull)
742 result = 0; /* NULL "=" NULL */
743 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
744 result = -1; /* NULL "<" NOT_NULL */
745 else
746 result = 1; /* NULL ">" NOT_NULL */
747 }
748 else if (isNull) /* key is NOT_NULL and item is NULL */
749 {
750 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
751 result = 1; /* NOT_NULL ">" NULL */
752 else
753 result = -1; /* NOT_NULL "<" NULL */
754 }
755 else
756 {
757 /*
758 * The sk_func needs to be passed the index value as left arg and
759 * the sk_argument as right arg (they might be of different
760 * types). Since it is convenient for callers to think of
761 * _bt_compare as comparing the scankey to the index item, we have
762 * to flip the sign of the comparison result. (Unless it's a DESC
763 * column, in which case we *don't* flip the sign.)
764 */
765 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
766 scankey->sk_collation,
767 datum,
768 scankey->sk_argument));
769
770 if (!(scankey->sk_flags & SK_BT_DESC))
771 INVERT_COMPARE_RESULT(result);
772 }
773
774 /* if the keys are unequal, return the difference */
775 if (result != 0)
776 return result;
777
778 scankey++;
779 }
780
781 /*
782 * All non-truncated attributes (other than heap TID) were found to be
783 * equal. Treat truncated attributes as minus infinity when scankey has a
784 * key attribute value that would otherwise be compared directly.
785 *
786 * Note: it doesn't matter if ntupatts includes non-key attributes;
787 * scankey won't, so explicitly excluding non-key attributes isn't
788 * necessary.
789 */
790 if (key->keysz > ntupatts)
791 return 1;
792
793 /*
794 * Use the heap TID attribute and scantid to try to break the tie. The
795 * rules are the same as any other key attribute -- only the
796 * representation differs.
797 */
798 heapTid = BTreeTupleGetHeapTID(itup);
799 if (key->scantid == NULL)
800 {
801 /*
802 * Forward scans have a scankey that is considered greater than a
803 * truncated pivot tuple if and when the scankey has equal values for
804 * attributes up to and including the least significant untruncated
805 * attribute in tuple. Even attributes that were omitted from the
806 * scan key are considered greater than -inf truncated attributes.
807 * (See _bt_binsrch for an explanation of our backward scan behavior.)
808 *
809 * For example, if an index has the minimum two attributes (single
810 * user key attribute, plus heap TID attribute), and a page's high key
811 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
812 * will not descend to the page to the left. The search will descend
813 * right instead. The truncated attribute in pivot tuple means that
814 * all non-pivot tuples on the page to the left are strictly < 'foo',
815 * so it isn't necessary to descend left. In other words, search
816 * doesn't have to descend left because it isn't interested in a match
817 * that has a heap TID value of -inf.
818 *
819 * Note: the heap TID part of the test ensures that scankey is being
820 * compared to a pivot tuple with one or more truncated -inf key
821 * attributes. The heap TID attribute is the last key attribute in
822 * every index, of course, but other than that it isn't special.
823 */
824 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
825 key->heapkeyspace)
826 return 1;
827
828 /* All provided scankey arguments found to be equal */
829 return 0;
830 }
831
832 /*
833 * Treat truncated heap TID as minus infinity, since scankey has a key
834 * attribute value (scantid) that would otherwise be compared directly
835 */
837 if (heapTid == NULL)
838 return 1;
839
840 /*
841 * Scankey must be treated as equal to a posting list tuple if its scantid
842 * value falls within the range of the posting list. In all other cases
843 * there can only be a single heap TID value, which is compared directly
844 * with scantid.
845 */
847 result = ItemPointerCompare(key->scantid, heapTid);
848 if (result <= 0 || !BTreeTupleIsPosting(itup))
849 return result;
850 else
851 {
852 result = ItemPointerCompare(key->scantid,
854 if (result > 0)
855 return 1;
856 }
857
858 return 0;
859}
#define Min(x, y)
Definition: c.h:975
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1077
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int i
Definition: isn.c:77
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1147
#define SK_BT_DESC
Definition: nbtree.h:1146
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4056
uintptr_t Datum
Definition: postgres.h:69
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define RelationGetDescr(relation)
Definition: rel.h:542
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:535
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_drop_lock_and_maybe_pin()

static void _bt_drop_lock_and_maybe_pin ( IndexScanDesc  scan,
BTScanPos  sp 
)
static

Definition at line 67 of file nbtsearch.c.

68{
70
71 if (IsMVCCSnapshot(scan->xs_snapshot) &&
73 !scan->xs_want_itup)
74 {
75 ReleaseBuffer(sp->buf);
76 sp->buf = InvalidBuffer;
77 }
78}
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5373
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define RelationNeedsWAL(relation)
Definition: rel.h:639
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55
Buffer buf
Definition: nbtree.h:964
Relation indexRelation
Definition: relscan.h:137
struct SnapshotData * xs_snapshot
Definition: relscan.h:138

References _bt_unlockbuf(), BTScanPosData::buf, IndexScanDescData::indexRelation, InvalidBuffer, IsMVCCSnapshot, RelationNeedsWAL, ReleaseBuffer(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().

◆ _bt_endpoint()

static bool _bt_endpoint ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2636 of file nbtsearch.c.

2637{
2638 Relation rel = scan->indexRelation;
2639 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2640 Page page;
2641 BTPageOpaque opaque;
2643
2645 Assert(!so->needPrimScan);
2646
2647 /*
2648 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2649 * version of _bt_search().
2650 */
2652
2653 if (!BufferIsValid(so->currPos.buf))
2654 {
2655 /*
2656 * Empty index. Lock the whole relation, as nothing finer to lock
2657 * exists.
2658 */
2660 _bt_parallel_done(scan);
2661 return false;
2662 }
2663
2664 page = BufferGetPage(so->currPos.buf);
2665 opaque = BTPageGetOpaque(page);
2666 Assert(P_ISLEAF(opaque));
2667
2668 if (ScanDirectionIsForward(dir))
2669 {
2670 /* There could be dead pages to the left, so not this: */
2671 /* Assert(P_LEFTMOST(opaque)); */
2672
2673 start = P_FIRSTDATAKEY(opaque);
2674 }
2675 else if (ScanDirectionIsBackward(dir))
2676 {
2677 Assert(P_RIGHTMOST(opaque));
2678
2680 }
2681 else
2682 {
2683 elog(ERROR, "invalid scan direction: %d", (int) dir);
2684 start = 0; /* keep compiler quiet */
2685 }
2686
2687 /*
2688 * Now load data from the first page of the scan.
2689 */
2690 if (!_bt_readfirstpage(scan, start, dir))
2691 return false;
2692
2693 _bt_returnitem(scan, so);
2694 return true;
2695}
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
#define elog(elevel,...)
Definition: elog.h:225
return str start
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:949
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1096
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2553
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2213
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:2084
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2576
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
bool needPrimScan
Definition: nbtree.h:1063
BTScanPosData currPos
Definition: nbtree.h:1092

References _bt_get_endpoint(), _bt_parallel_done(), _bt_readfirstpage(), _bt_returnitem(), Assert(), BTPageGetOpaque, BTScanPosIsValid, BTScanPosData::buf, BufferGetPage(), BufferIsValid(), BTScanOpaqueData::currPos, elog, ERROR, IndexScanDescData::indexRelation, BTScanOpaqueData::needPrimScan, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, start, and IndexScanDescData::xs_snapshot.

Referenced by _bt_first().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 882 of file nbtsearch.c.

883{
884 Relation rel = scan->indexRelation;
885 BTScanOpaque so = (BTScanOpaque) scan->opaque;
886 BTStack stack;
887 OffsetNumber offnum;
888 BTScanInsertData inskey;
889 ScanKey startKeys[INDEX_MAX_KEYS];
890 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891 int keysz = 0;
892 StrategyNumber strat_total;
894 lastcurrblkno;
895
897
898 /*
899 * Examine the scan keys and eliminate any redundant keys; also mark the
900 * keys that must be matched to continue the scan.
901 */
903
904 /*
905 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 * never be satisfied (eg, x == 1 AND x > 2).
907 */
908 if (!so->qual_ok)
909 {
910 Assert(!so->needPrimScan);
911 _bt_parallel_done(scan);
912 return false;
913 }
914
915 /*
916 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 * will likely release the parallel scan later on.
918 */
919 if (scan->parallel_scan != NULL &&
920 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 return false;
922
923 /*
924 * Initialize the scan's arrays (if any) for the current scan direction
925 * (except when they were already set to later values as part of
926 * scheduling the primitive index scan that is now underway)
927 */
928 if (so->numArrayKeys && !so->needPrimScan)
929 _bt_start_array_keys(scan, dir);
930
931 if (blkno != InvalidBlockNumber)
932 {
933 /*
934 * We anticipated calling _bt_search, but another worker bet us to it.
935 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 */
937 Assert(scan->parallel_scan != NULL);
938 Assert(!so->needPrimScan);
939 Assert(blkno != P_NONE);
940
941 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 return false;
943
944 _bt_returnitem(scan, so);
945 return true;
946 }
947
948 /*
949 * Count an indexscan for stats, now that we know that we'll call
950 * _bt_search/_bt_endpoint below
951 */
953 if (scan->instrument)
954 scan->instrument->nsearches++;
955
956 /*----------
957 * Examine the scan keys to discover where we need to start the scan.
958 *
959 * We want to identify the keys that can be used as starting boundaries;
960 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
961 * a backwards scan. We can use keys for multiple attributes so long as
962 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
963 * a > or < boundary or find an attribute with no boundary (which can be
964 * thought of as the same as "> -infinity"), we can't use keys for any
965 * attributes to its right, because it would break our simplistic notion
966 * of what initial positioning strategy to use.
967 *
968 * When the scan keys include cross-type operators, _bt_preprocess_keys
969 * may not be able to eliminate redundant keys; in such cases we will
970 * arbitrarily pick a usable one for each attribute. This is correct
971 * but possibly not optimal behavior. (For example, with keys like
972 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
973 * x=5 would be more efficient.) Since the situation only arises given
974 * a poorly-worded query plus an incomplete opfamily, live with it.
975 *
976 * When both equality and inequality keys appear for a single attribute
977 * (again, only possible when cross-type operators appear), we *must*
978 * select one of the equality keys for the starting point, because
979 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
980 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
981 * start at x=4, we will fail and stop before reaching x=10. If multiple
982 * equality quals survive preprocessing, however, it doesn't matter which
983 * one we use --- by definition, they are either redundant or
984 * contradictory.
985 *
986 * In practice we rarely see any "attribute boundary key gaps" here.
987 * Preprocessing can usually backfill skip array keys for any attributes
988 * that were omitted from the original scan->keyData[] input keys. All
989 * array keys are always considered = keys, but we'll sometimes need to
990 * treat the current key value as if we were using an inequality strategy.
991 * This happens with range skip arrays, which store inequality keys in the
992 * array's low_compare/high_compare fields (used to find the first/last
993 * set of matches, when = key will lack a usable sk_argument value).
994 * These are always preferred over any redundant "standard" inequality
995 * keys on the same column (per the usual rule about preferring = keys).
996 * Note also that any column with an = skip array key can never have an
997 * additional, contradictory = key.
998 *
999 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1000 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1001 * If the index stores nulls at the end of the index we'll be starting
1002 * from, and we have no boundary key for the column (which means the key
1003 * we deduced NOT NULL from is an inequality key that constrains the other
1004 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1005 * use as a boundary key. If we didn't do this, we might find ourselves
1006 * traversing a lot of null entries at the start of the scan.
1007 *
1008 * In this loop, row-comparison keys are treated the same as keys on their
1009 * first (leftmost) columns. We'll add on lower-order columns of the row
1010 * comparison below, if possible.
1011 *
1012 * The selected scan keys (at most one per index column) are remembered by
1013 * storing their addresses into the local startKeys[] array.
1014 *
1015 * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1016 * the next primitive index scan (for scans with array keys) based in part
1017 * on an understanding of how it'll enable us to reposition the scan.
1018 * They're directly aware of how we'll sometimes cons up an explicit
1019 * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1020 * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1021 * scans to skip large groups of NULLs through repeated deductions about
1022 * key strictness (for a required inequality key) and whether NULLs in the
1023 * key's index column are stored last or first (relative to non-NULLs).
1024 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1025 * need to be kept in sync.
1026 *----------
1027 */
1028 strat_total = BTEqualStrategyNumber;
1029 if (so->numberOfKeys > 0)
1030 {
1031 AttrNumber curattr;
1032 ScanKey chosen;
1033 ScanKey impliesNN;
1034 ScanKey cur;
1035
1036 /*
1037 * chosen is the so-far-chosen key for the current attribute, if any.
1038 * We don't cast the decision in stone until we reach keys for the
1039 * next attribute.
1040 */
1041 cur = so->keyData;
1042 curattr = 1;
1043 chosen = NULL;
1044 /* Also remember any scankey that implies a NOT NULL constraint */
1045 impliesNN = NULL;
1046
1047 /*
1048 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1049 * pass to handle after-last-key processing. Actual exit from the
1050 * loop is at one of the "break" statements below.
1051 */
1052 for (int i = 0;; cur++, i++)
1053 {
1054 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1055 {
1056 /*
1057 * Done looking at keys for curattr.
1058 *
1059 * If this is a scan key for a skip array whose current
1060 * element is MINVAL, choose low_compare (when scanning
1061 * backwards it'll be MAXVAL, and we'll choose high_compare).
1062 *
1063 * Note: if the array's low_compare key makes 'chosen' NULL,
1064 * then we behave as if the array's first element is -inf,
1065 * except when !array->null_elem implies a usable NOT NULL
1066 * constraint.
1067 */
1068 if (chosen != NULL &&
1069 (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1070 {
1071 int ikey = chosen - so->keyData;
1072 ScanKey skipequalitykey = chosen;
1073 BTArrayKeyInfo *array = NULL;
1074
1075 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1076 {
1077 array = &so->arrayKeys[arridx];
1078 if (array->scan_key == ikey)
1079 break;
1080 }
1081
1082 if (ScanDirectionIsForward(dir))
1083 {
1084 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1085 chosen = array->low_compare;
1086 }
1087 else
1088 {
1089 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1090 chosen = array->high_compare;
1091 }
1092
1093 Assert(chosen == NULL ||
1094 chosen->sk_attno == skipequalitykey->sk_attno);
1095
1096 if (!array->null_elem)
1097 impliesNN = skipequalitykey;
1098 else
1099 Assert(chosen == NULL && impliesNN == NULL);
1100 }
1101
1102 /*
1103 * If we didn't find a usable boundary key, see if we can
1104 * deduce a NOT NULL key
1105 */
1106 if (chosen == NULL && impliesNN != NULL &&
1107 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1110 {
1111 /* Yes, so build the key in notnullkeys[keysz] */
1112 chosen = &notnullkeys[keysz];
1115 (impliesNN->sk_flags &
1117 curattr,
1118 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1121 InvalidOid,
1122 InvalidOid,
1123 InvalidOid,
1124 (Datum) 0);
1125 }
1126
1127 /*
1128 * If we still didn't find a usable boundary key, quit; else
1129 * save the boundary key pointer in startKeys.
1130 */
1131 if (chosen == NULL)
1132 break;
1133 startKeys[keysz++] = chosen;
1134
1135 /*
1136 * We can only consider adding more boundary keys when the one
1137 * that we just chose to add uses either the = or >= strategy
1138 * (during backwards scans we can only do so when the key that
1139 * we just added to startKeys[] uses the = or <= strategy)
1140 */
1141 strat_total = chosen->sk_strategy;
1142 if (strat_total == BTGreaterStrategyNumber ||
1143 strat_total == BTLessStrategyNumber)
1144 break;
1145
1146 /*
1147 * If the key that we just added to startKeys[] is a skip
1148 * array = key whose current element is marked NEXT or PRIOR,
1149 * make strat_total > or < (and stop adding boundary keys).
1150 * This can only happen with opclasses that lack skip support.
1151 */
1152 if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1153 {
1154 Assert(chosen->sk_flags & SK_BT_SKIP);
1155 Assert(strat_total == BTEqualStrategyNumber);
1156
1157 if (ScanDirectionIsForward(dir))
1158 {
1159 Assert(!(chosen->sk_flags & SK_BT_PRIOR));
1160 strat_total = BTGreaterStrategyNumber;
1161 }
1162 else
1163 {
1164 Assert(!(chosen->sk_flags & SK_BT_NEXT));
1165 strat_total = BTLessStrategyNumber;
1166 }
1167
1168 /*
1169 * We're done. We'll never find an exact = match for a
1170 * NEXT or PRIOR sentinel sk_argument value. There's no
1171 * sense in trying to add more keys to startKeys[].
1172 */
1173 break;
1174 }
1175
1176 /*
1177 * Done if that was the last scan key output by preprocessing.
1178 * Also done if there is a gap index attribute that lacks a
1179 * usable key (only possible when preprocessing was unable to
1180 * generate a skip array key to "fill in the gap").
1181 */
1182 if (i >= so->numberOfKeys ||
1183 cur->sk_attno != curattr + 1)
1184 break;
1185
1186 /*
1187 * Reset for next attr.
1188 */
1189 curattr = cur->sk_attno;
1190 chosen = NULL;
1191 impliesNN = NULL;
1192 }
1193
1194 /*
1195 * Can we use this key as a starting boundary for this attr?
1196 *
1197 * If not, does it imply a NOT NULL constraint? (Because
1198 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1199 * *any* inequality key works for that; we need not test.)
1200 */
1201 switch (cur->sk_strategy)
1202 {
1205 if (chosen == NULL)
1206 {
1207 if (ScanDirectionIsBackward(dir))
1208 chosen = cur;
1209 else
1210 impliesNN = cur;
1211 }
1212 break;
1214 /* override any non-equality choice */
1215 chosen = cur;
1216 break;
1219 if (chosen == NULL)
1220 {
1221 if (ScanDirectionIsForward(dir))
1222 chosen = cur;
1223 else
1224 impliesNN = cur;
1225 }
1226 break;
1227 }
1228 }
1229 }
1230
1231 /*
1232 * If we found no usable boundary keys, we have to start from one end of
1233 * the tree. Walk down that edge to the first or last key, and scan from
1234 * there.
1235 *
1236 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1237 */
1238 if (keysz == 0)
1239 return _bt_endpoint(scan, dir);
1240
1241 /*
1242 * We want to start the scan somewhere within the index. Set up an
1243 * insertion scankey we can use to search for the boundary point we
1244 * identified above. The insertion scankey is built using the keys
1245 * identified by startKeys[]. (Remaining insertion scankey fields are
1246 * initialized after initial-positioning scan keys are finalized.)
1247 */
1248 Assert(keysz <= INDEX_MAX_KEYS);
1249 for (int i = 0; i < keysz; i++)
1250 {
1251 ScanKey cur = startKeys[i];
1252
1253 Assert(cur->sk_attno == i + 1);
1254
1255 if (cur->sk_flags & SK_ROW_HEADER)
1256 {
1257 /*
1258 * Row comparison header: look to the first row member instead
1259 */
1260 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1261
1262 /*
1263 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1264 * would've marked the qual as unsatisfiable, preventing us from
1265 * ever getting this far
1266 */
1267 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1268 Assert(subkey->sk_attno == cur->sk_attno);
1269 Assert(!(subkey->sk_flags & SK_ISNULL));
1270
1271 /*
1272 * The member scankeys are already in insertion format (ie, they
1273 * have sk_func = 3-way-comparison function)
1274 */
1275 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1276
1277 /*
1278 * If the row comparison is the last positioning key we accepted,
1279 * try to add additional keys from the lower-order row members.
1280 * (If we accepted independent conditions on additional index
1281 * columns, we use those instead --- doesn't seem worth trying to
1282 * determine which is more restrictive.) Note that this is OK
1283 * even if the row comparison is of ">" or "<" type, because the
1284 * condition applied to all but the last row member is effectively
1285 * ">=" or "<=", and so the extra keys don't break the positioning
1286 * scheme. But, by the same token, if we aren't able to use all
1287 * the row members, then the part of the row comparison that we
1288 * did use has to be treated as just a ">=" or "<=" condition, and
1289 * so we'd better adjust strat_total accordingly.
1290 */
1291 if (i == keysz - 1)
1292 {
1293 bool used_all_subkeys = false;
1294
1295 Assert(!(subkey->sk_flags & SK_ROW_END));
1296 for (;;)
1297 {
1298 subkey++;
1299 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1300 if (subkey->sk_attno != keysz + 1)
1301 break; /* out-of-sequence, can't use it */
1302 if (subkey->sk_strategy != cur->sk_strategy)
1303 break; /* wrong direction, can't use it */
1304 if (subkey->sk_flags & SK_ISNULL)
1305 break; /* can't use null keys */
1306 Assert(keysz < INDEX_MAX_KEYS);
1307 memcpy(inskey.scankeys + keysz, subkey,
1308 sizeof(ScanKeyData));
1309 keysz++;
1310 if (subkey->sk_flags & SK_ROW_END)
1311 {
1312 used_all_subkeys = true;
1313 break;
1314 }
1315 }
1316 if (!used_all_subkeys)
1317 {
1318 switch (strat_total)
1319 {
1321 strat_total = BTLessEqualStrategyNumber;
1322 break;
1324 strat_total = BTGreaterEqualStrategyNumber;
1325 break;
1326 }
1327 }
1328 break; /* done with outer loop */
1329 }
1330 }
1331 else
1332 {
1333 /*
1334 * Ordinary comparison key. Transform the search-style scan key
1335 * to an insertion scan key by replacing the sk_func with the
1336 * appropriate btree comparison function.
1337 *
1338 * If scankey operator is not a cross-type comparison, we can use
1339 * the cached comparison function; otherwise gotta look it up in
1340 * the catalogs. (That can't lead to infinite recursion, since no
1341 * indexscan initiated by syscache lookup will use cross-data-type
1342 * operators.)
1343 *
1344 * We support the convention that sk_subtype == InvalidOid means
1345 * the opclass input type; this is a hack to simplify life for
1346 * ScanKeyInit().
1347 */
1348 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1349 cur->sk_subtype == InvalidOid)
1350 {
1351 FmgrInfo *procinfo;
1352
1353 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1354 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1355 cur->sk_flags,
1356 cur->sk_attno,
1358 cur->sk_subtype,
1359 cur->sk_collation,
1360 procinfo,
1361 cur->sk_argument);
1362 }
1363 else
1364 {
1365 RegProcedure cmp_proc;
1366
1367 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1368 rel->rd_opcintype[i],
1369 cur->sk_subtype,
1370 BTORDER_PROC);
1371 if (!RegProcedureIsValid(cmp_proc))
1372 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1373 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1374 cur->sk_attno, RelationGetRelationName(rel));
1375 ScanKeyEntryInitialize(inskey.scankeys + i,
1376 cur->sk_flags,
1377 cur->sk_attno,
1379 cur->sk_subtype,
1380 cur->sk_collation,
1381 cmp_proc,
1382 cur->sk_argument);
1383 }
1384 }
1385 }
1386
1387 /*----------
1388 * Examine the selected initial-positioning strategy to determine exactly
1389 * where we need to start the scan, and set flag variables to control the
1390 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1391 * page _bt_search returns).
1392 *----------
1393 */
1394 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1395 inskey.anynullkeys = false; /* unused */
1396 inskey.scantid = NULL;
1397 inskey.keysz = keysz;
1398 switch (strat_total)
1399 {
1401
1402 inskey.nextkey = false;
1403 inskey.backward = true;
1404 break;
1405
1407
1408 inskey.nextkey = true;
1409 inskey.backward = true;
1410 break;
1411
1413
1414 /*
1415 * If a backward scan was specified, need to start with last equal
1416 * item not first one.
1417 */
1418 if (ScanDirectionIsBackward(dir))
1419 {
1420 /*
1421 * This is the same as the <= strategy
1422 */
1423 inskey.nextkey = true;
1424 inskey.backward = true;
1425 }
1426 else
1427 {
1428 /*
1429 * This is the same as the >= strategy
1430 */
1431 inskey.nextkey = false;
1432 inskey.backward = false;
1433 }
1434 break;
1435
1437
1438 /*
1439 * Find first item >= scankey
1440 */
1441 inskey.nextkey = false;
1442 inskey.backward = false;
1443 break;
1444
1446
1447 /*
1448 * Find first item > scankey
1449 */
1450 inskey.nextkey = true;
1451 inskey.backward = false;
1452 break;
1453
1454 default:
1455 /* can't get here, but keep compiler quiet */
1456 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1457 return false;
1458 }
1459
1460 /*
1461 * Use the manufactured insertion scan key to descend the tree and
1462 * position ourselves on the target leaf page.
1463 */
1464 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1465 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1466
1467 /* don't need to keep the stack around... */
1468 _bt_freestack(stack);
1469
1470 if (!BufferIsValid(so->currPos.buf))
1471 {
1472 /*
1473 * We only get here if the index is completely empty. Lock relation
1474 * because nothing finer to lock exists. Without a buffer lock, it's
1475 * possible for another transaction to insert data between
1476 * _bt_search() and PredicateLockRelation(). We have to try again
1477 * after taking the relation-level predicate lock, to close a narrow
1478 * window where we wouldn't scan concurrently inserted tuples, but the
1479 * writer wouldn't see our predicate lock.
1480 */
1482 {
1484 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1485 _bt_freestack(stack);
1486 }
1487
1488 if (!BufferIsValid(so->currPos.buf))
1489 {
1490 Assert(!so->needPrimScan);
1491 _bt_parallel_done(scan);
1492 return false;
1493 }
1494 }
1495
1496 /* position to the precise item on the page */
1497 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1498
1499 /*
1500 * Now load data from the first page of the scan (usually the page
1501 * currently in so->currPos.buf).
1502 *
1503 * If inskey.nextkey = false and inskey.backward = false, offnum is
1504 * positioned at the first non-pivot tuple >= inskey.scankeys.
1505 *
1506 * If inskey.nextkey = false and inskey.backward = true, offnum is
1507 * positioned at the last non-pivot tuple < inskey.scankeys.
1508 *
1509 * If inskey.nextkey = true and inskey.backward = false, offnum is
1510 * positioned at the first non-pivot tuple > inskey.scankeys.
1511 *
1512 * If inskey.nextkey = true and inskey.backward = true, offnum is
1513 * positioned at the last non-pivot tuple <= inskey.scankeys.
1514 *
1515 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1516 * for the page. For example, when inskey is both < the leaf page's high
1517 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1518 */
1519 if (!_bt_readfirstpage(scan, offnum, dir))
1520 return false;
1521
1522 _bt_returnitem(scan, so);
1523 return true;
1524}
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define RegProcedureIsValid(p)
Definition: c.h:748
regproc RegProcedure
Definition: c.h:621
struct cursor * cur
Definition: ecpg.c:29
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:907
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:888
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_preprocess_keys(IndexScanDesc scan)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:784
#define SK_BT_SKIP
Definition: nbtree.h:1136
#define BTORDER_PROC
Definition: nbtree.h:717
#define SK_BT_PRIOR
Definition: nbtree.h:1142
#define SK_BT_NEXT
Definition: nbtree.h:1141
#define P_NONE
Definition: nbtree.h:213
#define SK_BT_MAXVAL
Definition: nbtree.h:1140
#define BT_READ
Definition: nbtree.h:730
#define SK_BT_MINVAL
Definition: nbtree.h:1139
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2301
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2636
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:187
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:611
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:694
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
#define InvalidOid
Definition: postgres_ext.h:35
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
ScanKey high_compare
Definition: nbtree.h:1050
ScanKey low_compare
Definition: nbtree.h:1049
bool null_elem
Definition: nbtree.h:1047
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
ScanKey keyData
Definition: nbtree.h:1058
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:191
struct IndexScanInstrumentation * instrument
Definition: relscan.h:159
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), BTScanOpaqueData::arrayKeys, Assert(), BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BTScanPosData::buf, BufferIsValid(), cur, BTScanOpaqueData::currPos, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), BTArrayKeyInfo::high_compare, i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, IndexScanDescData::instrument, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanOpaqueData::keyData, BTArrayKeyInfo::low_compare, BTScanOpaqueData::needPrimScan, IndexScanInstrumentation::nsearches, BTArrayKeyInfo::null_elem, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_NEXT, SK_BT_NULLS_FIRST, SK_BT_PRIOR, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2553 of file nbtsearch.c.

2554{
2555 Buffer buf;
2556 Page page;
2557 BTPageOpaque opaque;
2558 OffsetNumber offnum;
2559 BlockNumber blkno;
2560 IndexTuple itup;
2561
2562 /*
2563 * If we are looking for a leaf page, okay to descend from fast root;
2564 * otherwise better descend from true root. (There is no point in being
2565 * smarter about intermediate levels.)
2566 */
2567 if (level == 0)
2568 buf = _bt_getroot(rel, NULL, BT_READ);
2569 else
2570 buf = _bt_gettrueroot(rel);
2571
2572 if (!BufferIsValid(buf))
2573 return InvalidBuffer;
2574
2575 page = BufferGetPage(buf);
2576 opaque = BTPageGetOpaque(page);
2577
2578 for (;;)
2579 {
2580 /*
2581 * If we landed on a deleted page, step right to find a live page
2582 * (there must be one). Also, if we want the rightmost page, step
2583 * right if needed to get to it (this could happen if the page split
2584 * since we obtained a pointer to it).
2585 */
2586 while (P_IGNORE(opaque) ||
2587 (rightmost && !P_RIGHTMOST(opaque)))
2588 {
2589 blkno = opaque->btpo_next;
2590 if (blkno == P_NONE)
2591 elog(ERROR, "fell off the end of index \"%s\"",
2593 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2594 page = BufferGetPage(buf);
2595 opaque = BTPageGetOpaque(page);
2596 }
2597
2598 /* Done? */
2599 if (opaque->btpo_level == level)
2600 break;
2601 if (opaque->btpo_level < level)
2602 ereport(ERROR,
2603 (errcode(ERRCODE_INDEX_CORRUPTED),
2604 errmsg_internal("btree level %u not found in index \"%s\"",
2605 level, RelationGetRelationName(rel))));
2606
2607 /* Descend to leftmost or rightmost child page */
2608 if (rightmost)
2609 offnum = PageGetMaxOffsetNumber(page);
2610 else
2611 offnum = P_FIRSTDATAKEY(opaque);
2612
2613 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2614 blkno = BTreeTupleGetDownLink(itup);
2615
2616 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2617 page = BufferGetPage(buf);
2618 opaque = BTPageGetOpaque(page);
2619 }
2620
2621 return buf;
2622}
int Buffer
Definition: buf.h:23
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define P_IGNORE(opaque)
Definition: nbtree.h:226
BlockNumber btpo_next
Definition: nbtree.h:66
uint32 btpo_level
Definition: nbtree.h:67

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_lock_and_validate_left()

static Buffer _bt_lock_and_validate_left ( Relation  rel,
BlockNumber blkno,
BlockNumber  lastcurrblkno 
)
static

Definition at line 2436 of file nbtsearch.c.

2438{
2439 BlockNumber origblkno = *blkno; /* detects circular links */
2440
2441 for (;;)
2442 {
2443 Buffer buf;
2444 Page page;
2445 BTPageOpaque opaque;
2446 int tries;
2447
2448 /* check for interrupts while we're not holding any buffer lock */
2450 buf = _bt_getbuf(rel, *blkno, BT_READ);
2451 page = BufferGetPage(buf);
2452 opaque = BTPageGetOpaque(page);
2453
2454 /*
2455 * If this isn't the page we want, walk right till we find what we
2456 * want --- but go no more than four hops (an arbitrary limit). If we
2457 * don't find the correct page by then, the most likely bet is that
2458 * lastcurrblkno got deleted and isn't in the sibling chain at all
2459 * anymore, not that its left sibling got split more than four times.
2460 *
2461 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2462 * because half-dead pages are still in the sibling chain.
2463 */
2464 tries = 0;
2465 for (;;)
2466 {
2467 if (likely(!P_ISDELETED(opaque) &&
2468 opaque->btpo_next == lastcurrblkno))
2469 {
2470 /* Found desired page, return it */
2471 return buf;
2472 }
2473 if (P_RIGHTMOST(opaque) || ++tries > 4)
2474 break;
2475 /* step right */
2476 *blkno = opaque->btpo_next;
2477 buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2478 page = BufferGetPage(buf);
2479 opaque = BTPageGetOpaque(page);
2480 }
2481
2482 /*
2483 * Return to the original page (usually the page most recently read by
2484 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2485 * what's up with its prev sibling link
2486 */
2487 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2488 page = BufferGetPage(buf);
2489 opaque = BTPageGetOpaque(page);
2490 if (P_ISDELETED(opaque))
2491 {
2492 /*
2493 * It was deleted. Move right to first nondeleted page (there
2494 * must be one); that is the page that has acquired the deleted
2495 * one's keyspace, so stepping left from it will take us where we
2496 * want to be.
2497 */
2498 for (;;)
2499 {
2500 if (P_RIGHTMOST(opaque))
2501 elog(ERROR, "fell off the end of index \"%s\"",
2503 lastcurrblkno = opaque->btpo_next;
2504 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2505 page = BufferGetPage(buf);
2506 opaque = BTPageGetOpaque(page);
2507 if (!P_ISDELETED(opaque))
2508 break;
2509 }
2510 }
2511 else
2512 {
2513 /*
2514 * Original lastcurrblkno wasn't deleted; the explanation had
2515 * better be that the page to the left got split or deleted.
2516 * Without this check, we risk going into an infinite loop.
2517 */
2518 if (opaque->btpo_prev == origblkno)
2519 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2520 lastcurrblkno, RelationGetRelationName(rel));
2521 /* Okay to try again, since left sibling link changed */
2522 }
2523
2524 /*
2525 * Original lastcurrblkno from caller was concurrently deleted (could
2526 * also have been a great many concurrent left sibling page splits).
2527 * Found a non-deleted page that should now act as our lastcurrblkno.
2528 */
2529 if (P_LEFTMOST(opaque))
2530 {
2531 /* New lastcurrblkno has no left sibling (concurrently deleted) */
2532 _bt_relbuf(rel, buf);
2533 break;
2534 }
2535
2536 /* Start from scratch with new lastcurrblkno's blkno/prev link */
2537 *blkno = origblkno = opaque->btpo_prev;
2538 _bt_relbuf(rel, buf);
2539 }
2540
2541 return InvalidBuffer;
2542}
#define likely(x)
Definition: c.h:346
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
BlockNumber btpo_prev
Definition: nbtree.h:65

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BufferGetPage(), CHECK_FOR_INTERRUPTS, elog, ERROR, InvalidBuffer, likely, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_readnextpage().

◆ _bt_moveright()

static Buffer _bt_moveright ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer  buf,
bool  forupdate,
BTStack  stack,
int  access 
)
static

Definition at line 241 of file nbtsearch.c.

248{
249 Page page;
250 BTPageOpaque opaque;
251 int32 cmpval;
252
253 Assert(!forupdate || heaprel != NULL);
254
255 /*
256 * When nextkey = false (normal case): if the scan key that brought us to
257 * this page is > the high key stored on the page, then the page has split
258 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
259 * have some duplicates to the right as well as the left, but that's
260 * something that's only ever dealt with on the leaf level, after
261 * _bt_search has found an initial leaf page.)
262 *
263 * When nextkey = true: move right if the scan key is >= page's high key.
264 * (Note that key.scantid cannot be set in this case.)
265 *
266 * The page could even have split more than once, so scan as far as
267 * needed.
268 *
269 * We also have to move right if we followed a link that brought us to a
270 * dead page.
271 */
272 cmpval = key->nextkey ? 0 : 1;
273
274 for (;;)
275 {
276 page = BufferGetPage(buf);
277 opaque = BTPageGetOpaque(page);
278
279 if (P_RIGHTMOST(opaque))
280 break;
281
282 /*
283 * Finish any incomplete splits we encounter along the way.
284 */
285 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
286 {
288
289 /* upgrade our lock if necessary */
290 if (access == BT_READ)
291 {
292 _bt_unlockbuf(rel, buf);
293 _bt_lockbuf(rel, buf, BT_WRITE);
294 }
295
296 if (P_INCOMPLETE_SPLIT(opaque))
297 _bt_finish_split(rel, heaprel, buf, stack);
298 else
299 _bt_relbuf(rel, buf);
300
301 /* re-acquire the lock in the right mode, and re-check */
302 buf = _bt_getbuf(rel, blkno, access);
303 continue;
304 }
305
306 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
307 {
308 /* step right one page */
309 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
310 continue;
311 }
312 else
313 break;
314 }
315
316 if (P_IGNORE(opaque))
317 elog(ERROR, "fell off the end of index \"%s\"",
319
320 return buf;
321}
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
#define P_HIKEY
Definition: nbtree.h:368
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BT_WRITE
Definition: nbtree.h:731
short access
Definition: preproc-type.c:36

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, buf, BufferGetBlockNumber(), BufferGetPage(), elog, ERROR, sort-test::key, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_search().

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1541 of file nbtsearch.c.

1542{
1543 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1544
1546
1547 /*
1548 * Advance to next tuple on current page; or if there's no more, try to
1549 * step to the next page with data.
1550 */
1551 if (ScanDirectionIsForward(dir))
1552 {
1553 if (++so->currPos.itemIndex > so->currPos.lastItem)
1554 {
1555 if (!_bt_steppage(scan, dir))
1556 return false;
1557 }
1558 }
1559 else
1560 {
1561 if (--so->currPos.itemIndex < so->currPos.firstItem)
1562 {
1563 if (!_bt_steppage(scan, dir))
1564 return false;
1565 }
1566 }
1567
1568 _bt_returnitem(scan, so);
1569 return true;
1570}
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2110
int firstItem
Definition: nbtree.h:995
int lastItem
Definition: nbtree.h:996
int itemIndex
Definition: nbtree.h:997

References _bt_returnitem(), _bt_steppage(), Assert(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_readfirstpage()

static bool _bt_readfirstpage ( IndexScanDesc  scan,
OffsetNumber  offnum,
ScanDirection  dir 
)
static

Definition at line 2213 of file nbtsearch.c.

2214{
2215 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2216
2217 so->numKilled = 0; /* just paranoia */
2218 so->markItemIndex = -1; /* ditto */
2219
2220 /* Initialize so->currPos for the first page (page in so->currPos.buf) */
2221 if (so->needPrimScan)
2222 {
2223 Assert(so->numArrayKeys);
2224
2225 so->currPos.moreLeft = true;
2226 so->currPos.moreRight = true;
2227 so->needPrimScan = false;
2228 }
2229 else if (ScanDirectionIsForward(dir))
2230 {
2231 so->currPos.moreLeft = false;
2232 so->currPos.moreRight = true;
2233 }
2234 else
2235 {
2236 so->currPos.moreLeft = true;
2237 so->currPos.moreRight = false;
2238 }
2239
2240 /*
2241 * Attempt to load matching tuples from the first page.
2242 *
2243 * Note that _bt_readpage will finish initializing the so->currPos fields.
2244 * _bt_readpage also releases parallel scan (even when it returns false).
2245 */
2246 if (_bt_readpage(scan, dir, offnum, true))
2247 {
2248 /*
2249 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2250 * so->currPos.buf in preparation for btgettuple returning tuples.
2251 */
2254 return true;
2255 }
2256
2257 /* There's no actually-matching data on the page in so->currPos.buf */
2259
2260 /* Call _bt_readnextpage using its _bt_steppage wrapper function */
2261 if (!_bt_steppage(scan, dir))
2262 return false;
2263
2264 /* _bt_readpage for a later page (now in so->currPos) succeeded */
2265 return true;
2266}
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:67
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
Definition: nbtsearch.c:1593
bool moreRight
Definition: nbtree.h:986
bool moreLeft
Definition: nbtree.h:985

References _bt_drop_lock_and_maybe_pin(), _bt_readpage(), _bt_steppage(), _bt_unlockbuf(), Assert(), BTScanPosIsPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, if(), IndexScanDescData::indexRelation, BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), and _bt_first().

◆ _bt_readnextpage()

static bool _bt_readnextpage ( IndexScanDesc  scan,
BlockNumber  blkno,
BlockNumber  lastcurrblkno,
ScanDirection  dir,
bool  seized 
)
static

Definition at line 2301 of file nbtsearch.c.

2303{
2304 Relation rel = scan->indexRelation;
2305 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2306
2307 Assert(so->currPos.currPage == lastcurrblkno || seized);
2308 Assert(!(blkno == P_NONE && seized));
2310
2311 /*
2312 * Remember that the scan already read lastcurrblkno, a page to the left
2313 * of blkno (or remember reading a page to the right, for backwards scans)
2314 */
2315 if (ScanDirectionIsForward(dir))
2316 so->currPos.moreLeft = true;
2317 else
2318 so->currPos.moreRight = true;
2319
2320 for (;;)
2321 {
2322 Page page;
2323 BTPageOpaque opaque;
2324
2325 if (blkno == P_NONE ||
2327 !so->currPos.moreRight : !so->currPos.moreLeft))
2328 {
2329 /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
2330 Assert(so->currPos.currPage == lastcurrblkno && !seized);
2332 _bt_parallel_done(scan); /* iff !so->needPrimScan */
2333 return false;
2334 }
2335
2336 Assert(!so->needPrimScan);
2337
2338 /* parallel scan must never actually visit so->currPos blkno */
2339 if (!seized && scan->parallel_scan != NULL &&
2340 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
2341 {
2342 /* whole scan is now done (or another primitive scan required) */
2344 return false;
2345 }
2346
2347 if (ScanDirectionIsForward(dir))
2348 {
2349 /* read blkno, but check for interrupts first */
2351 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2352 }
2353 else
2354 {
2355 /* read blkno, avoiding race (also checks for interrupts) */
2356 so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
2357 lastcurrblkno);
2358 if (so->currPos.buf == InvalidBuffer)
2359 {
2360 /* must have been a concurrent deletion of leftmost page */
2362 _bt_parallel_done(scan);
2363 return false;
2364 }
2365 }
2366
2367 page = BufferGetPage(so->currPos.buf);
2368 opaque = BTPageGetOpaque(page);
2369 lastcurrblkno = blkno;
2370 if (likely(!P_IGNORE(opaque)))
2371 {
2372 /* see if there are any matches on this page */
2373 if (ScanDirectionIsForward(dir))
2374 {
2375 /* note that this will clear moreRight if we can stop */
2376 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
2377 break;
2378 blkno = so->currPos.nextPage;
2379 }
2380 else
2381 {
2382 /* note that this will clear moreLeft if we can stop */
2383 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
2384 break;
2385 blkno = so->currPos.prevPage;
2386 }
2387 }
2388 else
2389 {
2390 /* _bt_readpage not called, so do all this for ourselves */
2391 if (ScanDirectionIsForward(dir))
2392 blkno = opaque->btpo_next;
2393 else
2394 blkno = opaque->btpo_prev;
2395 if (scan->parallel_scan != NULL)
2396 _bt_parallel_release(scan, blkno, lastcurrblkno);
2397 }
2398
2399 /* no matching tuples on this page */
2400 _bt_relbuf(rel, so->currPos.buf);
2401 seized = false; /* released by _bt_readpage (or by us) */
2402 }
2403
2404 /*
2405 * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2406 * so->currPos.buf in preparation for btgettuple returning tuples.
2407 */
2408 Assert(so->currPos.currPage == blkno);
2411
2412 return true;
2413}
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:922
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition: nbtsearch.c:2436
BlockNumber currPage
Definition: nbtree.h:967
BlockNumber prevPage
Definition: nbtree.h:968
BlockNumber nextPage
Definition: nbtree.h:969

References _bt_drop_lock_and_maybe_pin(), _bt_getbuf(), _bt_lock_and_validate_left(), _bt_parallel_done(), _bt_parallel_release(), _bt_parallel_seize(), _bt_readpage(), _bt_relbuf(), Assert(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosData::buf, BufferGetPage(), CHECK_FOR_INTERRUPTS, BTScanPosData::currPage, BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBuffer, likely, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, BTScanPosData::prevPage, and ScanDirectionIsForward.

Referenced by _bt_first(), and _bt_steppage().

◆ _bt_readpage()

static bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum,
bool  firstpage 
)
static

Definition at line 1593 of file nbtsearch.c.

1595{
1596 Relation rel = scan->indexRelation;
1597 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1598 Page page;
1599 BTPageOpaque opaque;
1600 OffsetNumber minoff;
1601 OffsetNumber maxoff;
1602 BTReadPageState pstate;
1603 bool arrayKeys;
1604 int itemIndex,
1605 indnatts;
1606
1607 /* save the page/buffer block number, along with its sibling links */
1608 page = BufferGetPage(so->currPos.buf);
1609 opaque = BTPageGetOpaque(page);
1611 so->currPos.prevPage = opaque->btpo_prev;
1612 so->currPos.nextPage = opaque->btpo_next;
1613
1614 Assert(!P_IGNORE(opaque));
1616 Assert(!so->needPrimScan);
1617
1618 if (scan->parallel_scan)
1619 {
1620 /* allow next/prev page to be read by other worker without delay */
1621 if (ScanDirectionIsForward(dir))
1623 so->currPos.currPage);
1624 else
1626 so->currPos.currPage);
1627 }
1628
1629 /* initialize remaining currPos fields related to current page */
1631 so->currPos.dir = dir;
1632 so->currPos.nextTupleOffset = 0;
1633 /* either moreLeft or moreRight should be set now (may be unset later) */
1635 so->currPos.moreLeft);
1636
1638
1639 /* initialize local variables */
1640 indnatts = IndexRelationGetNumberOfAttributes(rel);
1641 arrayKeys = so->numArrayKeys != 0;
1642 minoff = P_FIRSTDATAKEY(opaque);
1643 maxoff = PageGetMaxOffsetNumber(page);
1644
1645 /* initialize page-level state that we'll pass to _bt_checkkeys */
1646 pstate.minoff = minoff;
1647 pstate.maxoff = maxoff;
1648 pstate.finaltup = NULL;
1649 pstate.page = page;
1650 pstate.firstpage = firstpage;
1651 pstate.forcenonrequired = false;
1652 pstate.startikey = 0;
1653 pstate.offnum = InvalidOffsetNumber;
1654 pstate.skip = InvalidOffsetNumber;
1655 pstate.continuescan = true; /* default assumption */
1656 pstate.rechecks = 0;
1657 pstate.targetdistance = 0;
1658 pstate.nskipadvances = 0;
1659
1660 if (ScanDirectionIsForward(dir))
1661 {
1662 /* SK_SEARCHARRAY forward scans must provide high key up front */
1663 if (arrayKeys)
1664 {
1665 if (!P_RIGHTMOST(opaque))
1666 {
1667 ItemId iid = PageGetItemId(page, P_HIKEY);
1668
1669 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1670
1671 if (so->scanBehind &&
1672 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1673 {
1674 /* Schedule another primitive index scan after all */
1675 so->currPos.moreRight = false;
1676 so->needPrimScan = true;
1677 if (scan->parallel_scan)
1679 so->currPos.currPage);
1680 return false;
1681 }
1682 }
1683
1684 so->scanBehind = so->oppositeDirCheck = false; /* reset */
1685 }
1686
1687 /*
1688 * Consider pstate.startikey optimization once the ongoing primitive
1689 * index scan has already read at least one page
1690 */
1691 if (!pstate.firstpage && minoff < maxoff)
1692 _bt_set_startikey(scan, &pstate);
1693
1694 /* load items[] in ascending order */
1695 itemIndex = 0;
1696
1697 offnum = Max(offnum, minoff);
1698
1699 while (offnum <= maxoff)
1700 {
1701 ItemId iid = PageGetItemId(page, offnum);
1702 IndexTuple itup;
1703 bool passes_quals;
1704
1705 /*
1706 * If the scan specifies not to return killed tuples, then we
1707 * treat a killed tuple as not passing the qual
1708 */
1709 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1710 {
1711 offnum = OffsetNumberNext(offnum);
1712 continue;
1713 }
1714
1715 itup = (IndexTuple) PageGetItem(page, iid);
1716 Assert(!BTreeTupleIsPivot(itup));
1717
1718 pstate.offnum = offnum;
1719 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1720 itup, indnatts);
1721
1722 /*
1723 * Check if we need to skip ahead to a later tuple (only possible
1724 * when the scan uses array keys)
1725 */
1726 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1727 {
1728 Assert(!passes_quals && pstate.continuescan);
1729 Assert(offnum < pstate.skip);
1730 Assert(!pstate.forcenonrequired);
1731
1732 offnum = pstate.skip;
1733 pstate.skip = InvalidOffsetNumber;
1734 continue;
1735 }
1736
1737 if (passes_quals)
1738 {
1739 /* tuple passes all scan key conditions */
1740 if (!BTreeTupleIsPosting(itup))
1741 {
1742 /* Remember it */
1743 _bt_saveitem(so, itemIndex, offnum, itup);
1744 itemIndex++;
1745 }
1746 else
1747 {
1748 int tupleOffset;
1749
1750 /*
1751 * Set up state to return posting list, and remember first
1752 * TID
1753 */
1754 tupleOffset =
1755 _bt_setuppostingitems(so, itemIndex, offnum,
1756 BTreeTupleGetPostingN(itup, 0),
1757 itup);
1758 itemIndex++;
1759 /* Remember additional TIDs */
1760 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1761 {
1762 _bt_savepostingitem(so, itemIndex, offnum,
1763 BTreeTupleGetPostingN(itup, i),
1764 tupleOffset);
1765 itemIndex++;
1766 }
1767 }
1768 }
1769 /* When !continuescan, there can't be any more matches, so stop */
1770 if (!pstate.continuescan)
1771 break;
1772
1773 offnum = OffsetNumberNext(offnum);
1774 }
1775
1776 /*
1777 * We don't need to visit page to the right when the high key
1778 * indicates that no more matches will be found there.
1779 *
1780 * Checking the high key like this works out more often than you might
1781 * think. Leaf page splits pick a split point between the two most
1782 * dissimilar tuples (this is weighed against the need to evenly share
1783 * free space). Leaf pages with high key attribute values that can
1784 * only appear on non-pivot tuples on the right sibling page are
1785 * common.
1786 */
1787 if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
1788 {
1789 ItemId iid = PageGetItemId(page, P_HIKEY);
1790 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1791 int truncatt;
1792
1793 truncatt = BTreeTupleGetNAtts(itup, rel);
1794 pstate.forcenonrequired = false;
1795 pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
1796 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1797 }
1798
1799 if (!pstate.continuescan)
1800 so->currPos.moreRight = false;
1801
1802 Assert(itemIndex <= MaxTIDsPerBTreePage);
1803 so->currPos.firstItem = 0;
1804 so->currPos.lastItem = itemIndex - 1;
1805 so->currPos.itemIndex = 0;
1806 }
1807 else
1808 {
1809 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1810 if (arrayKeys)
1811 {
1812 if (minoff <= maxoff && !P_LEFTMOST(opaque))
1813 {
1814 ItemId iid = PageGetItemId(page, minoff);
1815
1816 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1817
1818 if (so->scanBehind &&
1819 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1820 {
1821 /* Schedule another primitive index scan after all */
1822 so->currPos.moreLeft = false;
1823 so->needPrimScan = true;
1824 if (scan->parallel_scan)
1826 so->currPos.currPage);
1827 return false;
1828 }
1829 }
1830
1831 so->scanBehind = so->oppositeDirCheck = false; /* reset */
1832 }
1833
1834 /*
1835 * Consider pstate.startikey optimization once the ongoing primitive
1836 * index scan has already read at least one page
1837 */
1838 if (!pstate.firstpage && minoff < maxoff)
1839 _bt_set_startikey(scan, &pstate);
1840
1841 /* load items[] in descending order */
1842 itemIndex = MaxTIDsPerBTreePage;
1843
1844 offnum = Min(offnum, maxoff);
1845
1846 while (offnum >= minoff)
1847 {
1848 ItemId iid = PageGetItemId(page, offnum);
1849 IndexTuple itup;
1850 bool tuple_alive;
1851 bool passes_quals;
1852
1853 /*
1854 * If the scan specifies not to return killed tuples, then we
1855 * treat a killed tuple as not passing the qual. Most of the
1856 * time, it's a win to not bother examining the tuple's index
1857 * keys, but just skip to the next tuple (previous, actually,
1858 * since we're scanning backwards). However, if this is the first
1859 * tuple on the page, we do check the index keys, to prevent
1860 * uselessly advancing to the page to the left. This is similar
1861 * to the high key optimization used by forward scans.
1862 */
1863 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1864 {
1865 if (offnum > minoff)
1866 {
1867 offnum = OffsetNumberPrev(offnum);
1868 continue;
1869 }
1870
1871 tuple_alive = false;
1872 }
1873 else
1874 tuple_alive = true;
1875
1876 itup = (IndexTuple) PageGetItem(page, iid);
1877 Assert(!BTreeTupleIsPivot(itup));
1878
1879 pstate.offnum = offnum;
1880 if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
1881 {
1882 pstate.forcenonrequired = false;
1883 pstate.startikey = 0;
1884 }
1885 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1886 itup, indnatts);
1887
1888 if (arrayKeys && so->scanBehind)
1889 {
1890 /*
1891 * Done scanning this page, but not done with the current
1892 * primscan.
1893 *
1894 * Note: Forward scans don't check this explicitly, since they
1895 * prefer to reuse pstate.skip for this instead.
1896 */
1897 Assert(!passes_quals && pstate.continuescan);
1898 Assert(!pstate.forcenonrequired);
1899
1900 break;
1901 }
1902
1903 /*
1904 * Check if we need to skip ahead to a later tuple (only possible
1905 * when the scan uses array keys)
1906 */
1907 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1908 {
1909 Assert(!passes_quals && pstate.continuescan);
1910 Assert(offnum > pstate.skip);
1911 Assert(!pstate.forcenonrequired);
1912
1913 offnum = pstate.skip;
1914 pstate.skip = InvalidOffsetNumber;
1915 continue;
1916 }
1917
1918 if (passes_quals && tuple_alive)
1919 {
1920 /* tuple passes all scan key conditions */
1921 if (!BTreeTupleIsPosting(itup))
1922 {
1923 /* Remember it */
1924 itemIndex--;
1925 _bt_saveitem(so, itemIndex, offnum, itup);
1926 }
1927 else
1928 {
1929 int tupleOffset;
1930
1931 /*
1932 * Set up state to return posting list, and remember first
1933 * TID.
1934 *
1935 * Note that we deliberately save/return items from
1936 * posting lists in ascending heap TID order for backwards
1937 * scans. This allows _bt_killitems() to make a
1938 * consistent assumption about the order of items
1939 * associated with the same posting list tuple.
1940 */
1941 itemIndex--;
1942 tupleOffset =
1943 _bt_setuppostingitems(so, itemIndex, offnum,
1944 BTreeTupleGetPostingN(itup, 0),
1945 itup);
1946 /* Remember additional TIDs */
1947 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1948 {
1949 itemIndex--;
1950 _bt_savepostingitem(so, itemIndex, offnum,
1951 BTreeTupleGetPostingN(itup, i),
1952 tupleOffset);
1953 }
1954 }
1955 }
1956 /* When !continuescan, there can't be any more matches, so stop */
1957 if (!pstate.continuescan)
1958 break;
1959
1960 offnum = OffsetNumberPrev(offnum);
1961 }
1962
1963 /*
1964 * We don't need to visit page to the left when no more matches will
1965 * be found there
1966 */
1967 if (!pstate.continuescan)
1968 so->currPos.moreLeft = false;
1969
1970 Assert(itemIndex >= 0);
1971 so->currPos.firstItem = itemIndex;
1974 }
1975
1976 /*
1977 * If _bt_set_startikey told us to temporarily treat the scan's keys as
1978 * nonrequired (possible only during scans with array keys), there must be
1979 * no lasting consequences for the scan's array keys. The scan's arrays
1980 * should now have exactly the same elements as they would have had if the
1981 * nonrequired behavior had never been used. (In general, a scan's arrays
1982 * are expected to track its progress through the index's key space.)
1983 *
1984 * We are required (by _bt_set_startikey) to call _bt_checkkeys against
1985 * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
1986 * arrays to recover. Assert that that step hasn't been missed.
1987 */
1988 Assert(!pstate.forcenonrequired);
1989
1990 return (so->currPos.firstItem <= so->currPos.lastItem);
1991}
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4493
#define Max(x, y)
Definition: c.h:969
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition: nbtree.c:999
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:1995
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:2025
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:2063
bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
Definition: nbtutils.c:2389
bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtutils.c:2261
void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
Definition: nbtutils.c:2485
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2599
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:528
bool oppositeDirCheck
Definition: nbtree.h:1065
int nextTupleOffset
Definition: nbtree.h:979
ScanDirection dir
Definition: nbtree.h:973
XLogRecPtr lsn
Definition: nbtree.h:970
bool ignore_killed_tuples
Definition: relscan.h:148

References _bt_checkkeys(), _bt_parallel_primscan_schedule(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_scanbehind_checkkeys(), _bt_set_startikey(), _bt_setuppostingitems(), Assert(), BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::dir, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, InvalidOffsetNumber, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanPosData::lsn, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, OffsetNumberIsValid, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_LEFTMOST, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), BTScanPosData::prevPage, BTScanOpaqueData::scanBehind, ScanDirectionIsForward, and IndexScanDescData::xs_snapshot.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().

◆ _bt_returnitem()

static void _bt_returnitem ( IndexScanDesc  scan,
BTScanOpaque  so 
)
inlinestatic

Definition at line 2084 of file nbtsearch.c.

2085{
2086 BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
2087
2088 /* Most recent _bt_readpage must have succeeded */
2092
2093 /* Return next item, per amgettuple contract */
2094 scan->xs_heaptid = currItem->heapTid;
2095 if (so->currTuples)
2096 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2097}
char * currTuples
Definition: nbtree.h:1079
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
ItemPointerData heapTid
Definition: nbtree.h:957
LocationIndex tupleOffset
Definition: nbtree.h:959
IndexTuple xs_itup
Definition: relscan.h:167
ItemPointerData xs_heaptid
Definition: relscan.h:172

References Assert(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanPosItem::tupleOffset, IndexScanDescData::xs_heaptid, and IndexScanDescData::xs_itup.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

◆ _bt_saveitem()

static void _bt_saveitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
IndexTuple  itup 
)
static

Definition at line 1995 of file nbtsearch.c.

1997{
1998 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1999
2001
2002 currItem->heapTid = itup->t_tid;
2003 currItem->indexOffset = offnum;
2004 if (so->currTuples)
2005 {
2006 Size itupsz = IndexTupleSize(itup);
2007
2008 currItem->tupleOffset = so->currPos.nextTupleOffset;
2009 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
2010 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
2011 }
2012}
#define MAXALIGN(LEN)
Definition: c.h:782
size_t Size
Definition: c.h:576
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
OffsetNumber indexOffset
Definition: nbtree.h:958
ItemPointerData t_tid
Definition: itup.h:37

References Assert(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize(), BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_savepostingitem()

static void _bt_savepostingitem ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
int  tupleOffset 
)
inlinestatic

Definition at line 2063 of file nbtsearch.c.

2065{
2066 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2067
2068 currItem->heapTid = *heapTid;
2069 currItem->indexOffset = offnum;
2070
2071 /*
2072 * Have index-only scans return the same base IndexTuple for every TID
2073 * that originates from the same posting list
2074 */
2075 if (so->currTuples)
2076 currItem->tupleOffset = tupleOffset;
2077}

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 102 of file nbtsearch.c.

104{
105 BTStack stack_in = NULL;
106 int page_access = BT_READ;
107
108 /* heaprel must be set whenever _bt_allocbuf is reachable */
110 Assert(access == BT_READ || heaprel != NULL);
111
112 /* Get the root page to start with */
113 *bufP = _bt_getroot(rel, heaprel, access);
114
115 /* If index is empty and access = BT_READ, no root page is created. */
116 if (!BufferIsValid(*bufP))
117 return (BTStack) NULL;
118
119 /* Loop iterates once per level descended in the tree */
120 for (;;)
121 {
122 Page page;
123 BTPageOpaque opaque;
124 OffsetNumber offnum;
125 ItemId itemid;
126 IndexTuple itup;
127 BlockNumber child;
128 BTStack new_stack;
129
130 /*
131 * Race -- the page we just grabbed may have split since we read its
132 * downlink in its parent page (or the metapage). If it has, we may
133 * need to move right to its new sibling. Do that.
134 *
135 * In write-mode, allow _bt_moveright to finish any incomplete splits
136 * along the way. Strictly speaking, we'd only need to finish an
137 * incomplete split on the leaf page we're about to insert to, not on
138 * any of the upper levels (internal pages with incomplete splits are
139 * also taken care of in _bt_getstackbuf). But this is a good
140 * opportunity to finish splits of internal pages too.
141 */
142 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
143 stack_in, page_access);
144
145 /* if this is a leaf page, we're done */
146 page = BufferGetPage(*bufP);
147 opaque = BTPageGetOpaque(page);
148 if (P_ISLEAF(opaque))
149 break;
150
151 /*
152 * Find the appropriate pivot tuple on this page. Its downlink points
153 * to the child page that we're about to descend to.
154 */
155 offnum = _bt_binsrch(rel, key, *bufP);
156 itemid = PageGetItemId(page, offnum);
157 itup = (IndexTuple) PageGetItem(page, itemid);
158 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
159 child = BTreeTupleGetDownLink(itup);
160
161 /*
162 * We need to save the location of the pivot tuple we chose in a new
163 * stack entry for this page/level. If caller ends up splitting a
164 * page one level down, it usually ends up inserting a new pivot
165 * tuple/downlink immediately after the location recorded here.
166 */
167 new_stack = (BTStack) palloc(sizeof(BTStackData));
168 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169 new_stack->bts_offset = offnum;
170 new_stack->bts_parent = stack_in;
171
172 /*
173 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
174 * we're on the level 1 and asked to lock leaf page in write mode,
175 * then lock next page in write mode, because it must be a leaf.
176 */
177 if (opaque->btpo_level == 1 && access == BT_WRITE)
178 page_access = BT_WRITE;
179
180 /* drop the read lock on the page, then acquire one on its child */
181 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
182
183 /* okay, all set to move down a level */
184 stack_in = new_stack;
185 }
186
187 /*
188 * If we're asked to lock leaf in write mode, but didn't manage to, then
189 * relock. This should only happen when the root page is a leaf page (and
190 * the only page in the index other than the metapage).
191 */
192 if (access == BT_WRITE && page_access == BT_READ)
193 {
194 /* trade in our read lock for a write lock */
195 _bt_unlockbuf(rel, *bufP);
196 _bt_lockbuf(rel, *bufP, BT_WRITE);
197
198 /*
199 * Race -- the leaf page may have split after we dropped the read lock
200 * but before we acquired a write lock. If it has, we may need to
201 * move right to its new sibling. Do that.
202 */
203 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
204 }
205
206 return stack_in;
207}
void * palloc(Size size)
Definition: mcxt.c:1943
BTStackData * BTStack
Definition: nbtree.h:750
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:241
BlockNumber bts_blkno
Definition: nbtree.h:745
struct BTStackData * bts_parent
Definition: nbtree.h:747
OffsetNumber bts_offset
Definition: nbtree.h:746

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_setuppostingitems()

static int _bt_setuppostingitems ( BTScanOpaque  so,
int  itemIndex,
OffsetNumber  offnum,
ItemPointer  heapTid,
IndexTuple  itup 
)
static

Definition at line 2025 of file nbtsearch.c.

2027{
2028 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2029
2031
2032 currItem->heapTid = *heapTid;
2033 currItem->indexOffset = offnum;
2034 if (so->currTuples)
2035 {
2036 /* Save base IndexTuple (truncate posting list) */
2037 IndexTuple base;
2038 Size itupsz = BTreeTupleGetPostingOffset(itup);
2039
2040 itupsz = MAXALIGN(itupsz);
2041 currItem->tupleOffset = so->currPos.nextTupleOffset;
2042 base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2043 memcpy(base, itup, itupsz);
2044 /* Defensively reduce work area index tuple header size */
2045 base->t_info &= ~INDEX_SIZE_MASK;
2046 base->t_info |= itupsz;
2047 so->currPos.nextTupleOffset += itupsz;
2048
2049 return currItem->tupleOffset;
2050 }
2051
2052 return 0;
2053}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:530
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_info, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

◆ _bt_steppage()

static bool _bt_steppage ( IndexScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 2110 of file nbtsearch.c.

2111{
2112 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2113 BlockNumber blkno,
2114 lastcurrblkno;
2115
2117
2118 /* Before leaving current page, deal with any killed items */
2119 if (so->numKilled > 0)
2120 _bt_killitems(scan);
2121
2122 /*
2123 * Before we modify currPos, make a copy of the page data if there was a
2124 * mark position that needs it.
2125 */
2126 if (so->markItemIndex >= 0)
2127 {
2128 /* bump pin on current buffer for assignment to mark buffer */
2129 if (BTScanPosIsPinned(so->currPos))
2131 memcpy(&so->markPos, &so->currPos,
2132 offsetof(BTScanPosData, items[1]) +
2133 so->currPos.lastItem * sizeof(BTScanPosItem));
2134 if (so->markTuples)
2135 memcpy(so->markTuples, so->currTuples,
2138 so->markItemIndex = -1;
2139
2140 /*
2141 * If we're just about to start the next primitive index scan
2142 * (possible with a scan that has arrays keys, and needs to skip to
2143 * continue in the current scan direction), moreLeft/moreRight only
2144 * indicate the end of the current primitive index scan. They must
2145 * never be taken to indicate that the top-level index scan has ended
2146 * (that would be wrong).
2147 *
2148 * We could handle this case by treating the current array keys as
2149 * markPos state. But depending on the current array state like this
2150 * would add complexity. Instead, we just unset markPos's copy of
2151 * moreRight or moreLeft (whichever might be affected), while making
2152 * btrestrpos reset the scan's arrays to their initial scan positions.
2153 * In effect, btrestrpos leaves advancing the arrays up to the first
2154 * _bt_readpage call (that takes place after it has restored markPos).
2155 */
2156 if (so->needPrimScan)
2157 {
2159 so->markPos.moreRight = true;
2160 else
2161 so->markPos.moreLeft = true;
2162 }
2163
2164 /* mark/restore not supported by parallel scans */
2165 Assert(!scan->parallel_scan);
2166 }
2167
2169
2170 /* Walk to the next page with data */
2171 if (ScanDirectionIsForward(dir))
2172 blkno = so->currPos.nextPage;
2173 else
2174 blkno = so->currPos.prevPage;
2175 lastcurrblkno = so->currPos.currPage;
2176
2177 /*
2178 * Cancel primitive index scans that were scheduled when the call to
2179 * _bt_readpage for currPos happened to use the opposite direction to the
2180 * one that we're stepping in now. (It's okay to leave the scan's array
2181 * keys as-is, since the next _bt_readpage will advance them.)
2182 */
2183 if (so->currPos.dir != dir)
2184 so->needPrimScan = false;
2185
2186 return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
2187}
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5405
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:3310
char * markTuples
Definition: nbtree.h:1080
BTScanPosData markPos
Definition: nbtree.h:1093
static ItemArray items
Definition: test_tidstore.c:48

References _bt_killitems(), _bt_readnextpage(), Assert(), BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTScanPosData::prevPage, and ScanDirectionIsForward.

Referenced by _bt_next(), and _bt_readfirstpage().