nbtsearch_8c_source.html

/*-------------------------------------------------------------------------

 *

 * nbtsearch.c

 *    Search code for postgres btrees.

 *

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 * IDENTIFICATION

 *    src/backend/access/nbtree/nbtsearch.c

 *

 *-------------------------------------------------------------------------

 */


#include "postgres.h"


#include "access/nbtree.h"

#include "access/relscan.h"

#include "access/xact.h"

#include "miscadmin.h"

#include "pgstat.h"

#include "storage/predicate.h"

#include "utils/lsyscache.h"

#include "utils/rel.h"


static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);

static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,

                            Buffer buf, bool forupdate, BTStack stack,

                            int access);

static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);

static int  _bt_binsrch_posting(BTScanInsert key, Page page,

                                OffsetNumber offnum);

static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,

                         OffsetNumber offnum, bool firstpage);

static void _bt_saveitem(BTScanOpaque so, int itemIndex,

                         OffsetNumber offnum, IndexTuple itup);

static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,

                                  OffsetNumber offnum, ItemPointer heapTid,

                                  IndexTuple itup);

static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,

                                       OffsetNumber offnum,

                                       ItemPointer heapTid, int tupleOffset);

static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);

static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);

static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,

                              ScanDirection dir);

static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,

                             BlockNumber lastcurrblkno, ScanDirection dir,

                             bool seized);

static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,

                                         BlockNumber lastcurrblkno);

static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);


/*

 *  _bt_drop_lock_and_maybe_pin()

 *

 * Unlock the buffer; and if it is safe to release the pin, do that, too.

 * This will prevent vacuum from stalling in a blocked state trying to read a

 * page when a cursor is sitting on it.

 *

 * See nbtree/README section on making concurrent TID recycling safe.

 */

static void

_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)

{

    _bt_unlockbuf(scan->indexRelation, sp->buf);


    if (IsMVCCSnapshot(scan->xs_snapshot) &&

        RelationNeedsWAL(scan->indexRelation) &&

        !scan->xs_want_itup)

    {

        ReleaseBuffer(sp->buf);

        sp->buf = InvalidBuffer;

    }

}


/*

 *  _bt_search() -- Search the tree for a particular scankey,

 *      or more precisely for the first leaf page it could be on.

 *

 * The passed scankey is an insertion-type scankey (see nbtree/README),

 * but it can omit the rightmost column(s) of the index.

 *

 * Return value is a stack of parent-page pointers (i.e. there is no entry for

 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,

 * which is locked and pinned.  No locks are held on the parent pages,

 * however!

 *

 * The returned buffer is locked according to access parameter.  Additionally,

 * access = BT_WRITE will allow an empty root page to be created and returned.

 * When access = BT_READ, an empty index will result in *bufP being set to

 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered

 * during the search will be finished.

 *

 * heaprel must be provided by callers that pass access = BT_WRITE, since we

 * might need to allocate a new root page for caller -- see _bt_allocbuf.

 */

BTStack

_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,

           int access)

{

    BTStack     stack_in = NULL;

    int         page_access = BT_READ;


    /* heaprel must be set whenever _bt_allocbuf is reachable */

    Assert(access == BT_READ || access == BT_WRITE);

    Assert(access == BT_READ || heaprel != NULL);


    /* Get the root page to start with */

    *bufP = _bt_getroot(rel, heaprel, access);


    /* If index is empty and access = BT_READ, no root page is created. */

    if (!BufferIsValid(*bufP))

        return (BTStack) NULL;


    /* Loop iterates once per level descended in the tree */

    for (;;)

    {

        Page        page;

        BTPageOpaque opaque;

        OffsetNumber offnum;

        ItemId      itemid;

        IndexTuple  itup;

        BlockNumber child;

        BTStack     new_stack;


        /*

         * Race -- the page we just grabbed may have split since we read its

         * downlink in its parent page (or the metapage).  If it has, we may

         * need to move right to its new sibling.  Do that.

         *

         * In write-mode, allow _bt_moveright to finish any incomplete splits

         * along the way.  Strictly speaking, we'd only need to finish an

         * incomplete split on the leaf page we're about to insert to, not on

         * any of the upper levels (internal pages with incomplete splits are

         * also taken care of in _bt_getstackbuf).  But this is a good

         * opportunity to finish splits of internal pages too.

         */

        *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),

                              stack_in, page_access);


        /* if this is a leaf page, we're done */

        page = BufferGetPage(*bufP);

        opaque = BTPageGetOpaque(page);

        if (P_ISLEAF(opaque))

            break;


        /*

         * Find the appropriate pivot tuple on this page.  Its downlink points

         * to the child page that we're about to descend to.

         */

        offnum = _bt_binsrch(rel, key, *bufP);

        itemid = PageGetItemId(page, offnum);

        itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);

        child = BTreeTupleGetDownLink(itup);


        /*

         * We need to save the location of the pivot tuple we chose in a new

         * stack entry for this page/level.  If caller ends up splitting a

         * page one level down, it usually ends up inserting a new pivot

         * tuple/downlink immediately after the location recorded here.

         */

        new_stack = (BTStack) palloc(sizeof(BTStackData));

        new_stack->bts_blkno = BufferGetBlockNumber(*bufP);

        new_stack->bts_offset = offnum;

        new_stack->bts_parent = stack_in;


        /*

         * Page level 1 is lowest non-leaf page level prior to leaves.  So, if

         * we're on the level 1 and asked to lock leaf page in write mode,

         * then lock next page in write mode, because it must be a leaf.

         */

        if (opaque->btpo_level == 1 && access == BT_WRITE)

            page_access = BT_WRITE;


        /* drop the read lock on the page, then acquire one on its child */

        *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);


        /* okay, all set to move down a level */

        stack_in = new_stack;

    }


    /*

     * If we're asked to lock leaf in write mode, but didn't manage to, then

     * relock.  This should only happen when the root page is a leaf page (and

     * the only page in the index other than the metapage).

     */

    if (access == BT_WRITE && page_access == BT_READ)

    {

        /* trade in our read lock for a write lock */

        _bt_unlockbuf(rel, *bufP);

        _bt_lockbuf(rel, *bufP, BT_WRITE);


        /*

         * Race -- the leaf page may have split after we dropped the read lock

         * but before we acquired a write lock.  If it has, we may need to

         * move right to its new sibling.  Do that.

         */

        *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);

    }


    return stack_in;

}


/*

 *  _bt_moveright() -- move right in the btree if necessary.

 *

 * When we follow a pointer to reach a page, it is possible that

 * the page has changed in the meanwhile.  If this happens, we're

 * guaranteed that the page has "split right" -- that is, that any

 * data that appeared on the page originally is either on the page

 * or strictly to the right of it.

 *

 * This routine decides whether or not we need to move right in the

 * tree by examining the high key entry on the page.  If that entry is

 * strictly less than the scankey, or <= the scankey in the

 * key.nextkey=true case, then we followed the wrong link and we need

 * to move right.

 *

 * The passed insertion-type scankey can omit the rightmost column(s) of the

 * index. (see nbtree/README)

 *

 * When key.nextkey is false (the usual case), we are looking for the first

 * item >= key.  When key.nextkey is true, we are looking for the first item

 * strictly greater than key.

 *

 * If forupdate is true, we will attempt to finish any incomplete splits

 * that we encounter.  This is required when locking a target page for an

 * insertion, because we don't allow inserting on a page before the split is

 * completed.  'heaprel' and 'stack' are only used if forupdate is true.

 *

 * On entry, we have the buffer pinned and a lock of the type specified by

 * 'access'.  If we move right, we release the buffer and lock and acquire

 * the same on the right sibling.  Return value is the buffer we stop at.

 */

static Buffer

_bt_moveright(Relation rel,

              Relation heaprel,

              BTScanInsert key,

              Buffer buf,

              bool forupdate,

              BTStack stack,

              int access)

{

    Page        page;

    BTPageOpaque opaque;

    int32       cmpval;


    Assert(!forupdate || heaprel != NULL);


    /*

     * When nextkey = false (normal case): if the scan key that brought us to

     * this page is > the high key stored on the page, then the page has split

     * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could

     * have some duplicates to the right as well as the left, but that's

     * something that's only ever dealt with on the leaf level, after

     * _bt_search has found an initial leaf page.)

     *

     * When nextkey = true: move right if the scan key is >= page's high key.

     * (Note that key.scantid cannot be set in this case.)

     *

     * The page could even have split more than once, so scan as far as

     * needed.

     *

     * We also have to move right if we followed a link that brought us to a

     * dead page.

     */

    cmpval = key->nextkey ? 0 : 1;


    for (;;)

    {

        page = BufferGetPage(buf);

        opaque = BTPageGetOpaque(page);


        if (P_RIGHTMOST(opaque))

            break;


        /*

         * Finish any incomplete splits we encounter along the way.

         */

        if (forupdate && P_INCOMPLETE_SPLIT(opaque))

        {

            BlockNumber blkno = BufferGetBlockNumber(buf);


            /* upgrade our lock if necessary */

            if (access == BT_READ)

            {

                _bt_unlockbuf(rel, buf);

                _bt_lockbuf(rel, buf, BT_WRITE);

            }


            if (P_INCOMPLETE_SPLIT(opaque))

                _bt_finish_split(rel, heaprel, buf, stack);

            else

                _bt_relbuf(rel, buf);


            /* re-acquire the lock in the right mode, and re-check */

            buf = _bt_getbuf(rel, blkno, access);

            continue;

        }


        if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)

        {

            /* step right one page */

            buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);

            continue;

        }

        else

            break;

    }


    if (P_IGNORE(opaque))

        elog(ERROR, "fell off the end of index \"%s\"",

             RelationGetRelationName(rel));


    return buf;

}


/*

 *  _bt_binsrch() -- Do a binary search for a key on a particular page.

 *

 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber

 * of the last key < given scankey, or last key <= given scankey if nextkey

 * is true.  (Since _bt_compare treats the first data key of such a page as

 * minus infinity, there will be at least one key < scankey, so the result

 * always points at one of the keys on the page.)

 *

 * On a leaf page, _bt_binsrch() returns the final result of the initial

 * positioning process that started with _bt_first's call to _bt_search.

 * We're returning a non-pivot tuple offset, so things are a little different.

 * It is possible that we'll return an offset that's either past the last

 * non-pivot slot, or (in the case of a backward scan) before the first slot.

 *

 * This procedure is not responsible for walking right, it just examines

 * the given page.  _bt_binsrch() has no lock or refcount side effects

 * on the buffer.

 */

static OffsetNumber

_bt_binsrch(Relation rel,

            BTScanInsert key,

            Buffer buf)

{

    Page        page;

    BTPageOpaque opaque;

    OffsetNumber low,

                high;

    int32       result,

                cmpval;


    page = BufferGetPage(buf);

    opaque = BTPageGetOpaque(page);


    /* Requesting nextkey semantics while using scantid seems nonsensical */

    Assert(!key->nextkey || key->scantid == NULL);

    /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */

    Assert(!P_ISLEAF(opaque) || key->scantid == NULL);


    low = P_FIRSTDATAKEY(opaque);

    high = PageGetMaxOffsetNumber(page);


    /*

     * If there are no keys on the page, return the first available slot. Note

     * this covers two cases: the page is really empty (no keys), or it

     * contains only a high key.  The latter case is possible after vacuuming.

     * This can never happen on an internal page, however, since they are

     * never empty (an internal page must have at least one child).

     */

    if (unlikely(high < low))

        return low;


    /*

     * Binary search to find the first key on the page >= scan key, or first

     * key > scankey when nextkey is true.

     *

     * For nextkey=false (cmpval=1), the loop invariant is: all slots before

     * 'low' are < scan key, all slots at or after 'high' are >= scan key.

     *

     * For nextkey=true (cmpval=0), the loop invariant is: all slots before

     * 'low' are <= scan key, all slots at or after 'high' are > scan key.

     *

     * We can fall out when high == low.

     */

    high++;                     /* establish the loop invariant for high */


    cmpval = key->nextkey ? 0 : 1;  /* select comparison value */


    while (high > low)

    {

        OffsetNumber mid = low + ((high - low) / 2);


        /* We have low <= mid < high, so mid points at a real slot */


        result = _bt_compare(rel, key, page, mid);


        if (result >= cmpval)

            low = mid + 1;

        else

            high = mid;

    }


    /*

     * At this point we have high == low.

     *

     * On a leaf page we always return the first non-pivot tuple >= scan key

     * (resp. > scan key) for forward scan callers.  For backward scans, it's

     * always the _last_ non-pivot tuple < scan key (resp. <= scan key).

     */

    if (P_ISLEAF(opaque))

    {

        /*

         * In the backward scan case we're supposed to locate the last

         * matching tuple on the leaf level -- not the first matching tuple

         * (the last tuple will be the first one returned by the scan).

         *

         * At this point we've located the first non-pivot tuple immediately

         * after the last matching tuple (which might just be maxoff + 1).

         * Compensate by stepping back.

         */

        if (key->backward)

            return OffsetNumberPrev(low);


        return low;

    }


    /*

     * On a non-leaf page, return the last key < scan key (resp. <= scan key).

     * There must be one if _bt_compare() is playing by the rules.

     *

     * _bt_compare() will seldom see any exactly-matching pivot tuples, since

     * a truncated -inf heap TID is usually enough to prevent it altogether.

     * Even omitted scan key entries are treated as > truncated attributes.

     *

     * However, during backward scans _bt_compare() interprets omitted scan

     * key attributes as == corresponding truncated -inf attributes instead.

     * This works just like < would work here.  Under this scheme, < strategy

     * backward scans will always directly descend to the correct leaf page.

     * In particular, they will never incur an "extra" leaf page access with a

     * scan key that happens to contain the same prefix of values as some

     * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when

     * it uses a leaf page high key to "re-find" a page undergoing deletion.

     */

    Assert(low > P_FIRSTDATAKEY(opaque));


    return OffsetNumberPrev(low);

}


/*

 *

 *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.

 *

 * Like _bt_binsrch(), but with support for caching the binary search

 * bounds.  Only used during insertion, and only on the leaf page that it

 * looks like caller will insert tuple on.  Exclusive-locked and pinned

 * leaf page is contained within insertstate.

 *

 * Caches the bounds fields in insertstate so that a subsequent call can

 * reuse the low and strict high bounds of original binary search.  Callers

 * that use these fields directly must be prepared for the case where low

 * and/or stricthigh are not on the same page (one or both exceed maxoff

 * for the page).  The case where there are no items on the page (high <

 * low) makes bounds invalid.

 *

 * Caller is responsible for invalidating bounds when it modifies the page

 * before calling here a second time, and for dealing with posting list

 * tuple matches (callers can use insertstate's postingoff field to

 * determine which existing heap TID will need to be replaced by a posting

 * list split).

 */

OffsetNumber

_bt_binsrch_insert(Relation rel, BTInsertState insertstate)

{

    BTScanInsert key = insertstate->itup_key;

    Page        page;

    BTPageOpaque opaque;

    OffsetNumber low,

                high,

                stricthigh;

    int32       result,

                cmpval;


    page = BufferGetPage(insertstate->buf);

    opaque = BTPageGetOpaque(page);


    Assert(P_ISLEAF(opaque));

    Assert(!key->nextkey);

    Assert(insertstate->postingoff == 0);


    if (!insertstate->bounds_valid)

    {

        /* Start new binary search */

        low = P_FIRSTDATAKEY(opaque);

        high = PageGetMaxOffsetNumber(page);

    }

    else

    {

        /* Restore result of previous binary search against same page */

        low = insertstate->low;

        high = insertstate->stricthigh;

    }


    /* If there are no keys on the page, return the first available slot */

    if (unlikely(high < low))

    {

        /* Caller can't reuse bounds */

        insertstate->low = InvalidOffsetNumber;

        insertstate->stricthigh = InvalidOffsetNumber;

        insertstate->bounds_valid = false;

        return low;

    }


    /*

     * Binary search to find the first key on the page >= scan key. (nextkey

     * is always false when inserting).

     *

     * The loop invariant is: all slots before 'low' are < scan key, all slots

     * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is

     * maintained to save additional search effort for caller.

     *

     * We can fall out when high == low.

     */

    if (!insertstate->bounds_valid)

        high++;                 /* establish the loop invariant for high */

    stricthigh = high;          /* high initially strictly higher */


    cmpval = 1;                 /* !nextkey comparison value */


    while (high > low)

    {

        OffsetNumber mid = low + ((high - low) / 2);


        /* We have low <= mid < high, so mid points at a real slot */


        result = _bt_compare(rel, key, page, mid);


        if (result >= cmpval)

            low = mid + 1;

        else

        {

            high = mid;

            if (result != 0)

                stricthigh = high;

        }


        /*

         * If tuple at offset located by binary search is a posting list whose

         * TID range overlaps with caller's scantid, perform posting list

         * binary search to set postingoff for caller.  Caller must split the

         * posting list when postingoff is set.  This should happen

         * infrequently.

         */

        if (unlikely(result == 0 && key->scantid != NULL))

        {

            /*

             * postingoff should never be set more than once per leaf page

             * binary search.  That would mean that there are duplicate table

             * TIDs in the index, which is never okay.  Check for that here.

             */

            if (insertstate->postingoff != 0)

                ereport(ERROR,

                        (errcode(ERRCODE_INDEX_CORRUPTED),

                         errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",

                                         ItemPointerGetBlockNumber(key->scantid),

                                         ItemPointerGetOffsetNumber(key->scantid),

                                         low, stricthigh,

                                         BufferGetBlockNumber(insertstate->buf),

                                         RelationGetRelationName(rel))));


            insertstate->postingoff = _bt_binsrch_posting(key, page, mid);

        }

    }


    /*

     * On a leaf page, a binary search always returns the first key >= scan

     * key (at least in !nextkey case), which could be the last slot + 1. This

     * is also the lower bound of cached search.

     *

     * stricthigh may also be the last slot + 1, which prevents caller from

     * using bounds directly, but is still useful to us if we're called a

     * second time with cached bounds (cached low will be < stricthigh when

     * that happens).

     */

    insertstate->low = low;

    insertstate->stricthigh = stricthigh;

    insertstate->bounds_valid = true;


    return low;

}


/*----------

 *  _bt_binsrch_posting() -- posting list binary search.

 *

 * Helper routine for _bt_binsrch_insert().

 *

 * Returns offset into posting list where caller's scantid belongs.

 *----------

 */

static int

_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)

{

    IndexTuple  itup;

    ItemId      itemid;

    int         low,

                high,

                mid,

                res;


    /*

     * If this isn't a posting tuple, then the index must be corrupt (if it is

     * an ordinary non-pivot tuple then there must be an existing tuple with a

     * heap TID that equals inserter's new heap TID/scantid).  Defensively

     * check that tuple is a posting list tuple whose posting list range

     * includes caller's scantid.

     *

     * (This is also needed because contrib/amcheck's rootdescend option needs

     * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)

     */

    itemid = PageGetItemId(page, offnum);

    itup = (IndexTuple) PageGetItem(page, itemid);

    if (!BTreeTupleIsPosting(itup))

        return 0;


    Assert(key->heapkeyspace && key->allequalimage);


    /*

     * In the event that posting list tuple has LP_DEAD bit set, indicate this

     * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A

     * second call to _bt_binsrch_insert() can take place when its caller has

     * removed the dead item.

     */

    if (ItemIdIsDead(itemid))

        return -1;


    /* "high" is past end of posting list for loop invariant */

    low = 0;

    high = BTreeTupleGetNPosting(itup);

    Assert(high >= 2);


    while (high > low)

    {

        mid = low + ((high - low) / 2);

        res = ItemPointerCompare(key->scantid,

                                 BTreeTupleGetPostingN(itup, mid));


        if (res > 0)

            low = mid + 1;

        else if (res < 0)

            high = mid;

        else

            return mid;

    }


    /* Exact match not found */

    return low;

}


/*----------

 *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.

 *

 *  page/offnum: location of btree item to be compared to.

 *

 *      This routine returns:

 *          <0 if scankey < tuple at offnum;

 *           0 if scankey == tuple at offnum;

 *          >0 if scankey > tuple at offnum.

 *

 * NULLs in the keys are treated as sortable values.  Therefore

 * "equality" does not necessarily mean that the item should be returned

 * to the caller as a matching key.  Similarly, an insertion scankey

 * with its scantid set is treated as equal to a posting tuple whose TID

 * range overlaps with their scantid.  There generally won't be a

 * matching TID in the posting tuple, which caller must handle

 * themselves (e.g., by splitting the posting list tuple).

 *

 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be

 * "minus infinity": this routine will always claim it is less than the

 * scankey.  The actual key value stored is explicitly truncated to 0

 * attributes (explicitly minus infinity) with version 3+ indexes, but

 * that isn't relied upon.  This allows us to implement the Lehman and

 * Yao convention that the first down-link pointer is before the first

 * key.  See backend/access/nbtree/README for details.

 *----------

 */

int32

_bt_compare(Relation rel,

            BTScanInsert key,

            Page page,

            OffsetNumber offnum)

{

    TupleDesc   itupdesc = RelationGetDescr(rel);

    BTPageOpaque opaque = BTPageGetOpaque(page);

    IndexTuple  itup;

    ItemPointer heapTid;

    ScanKey     scankey;

    int         ncmpkey;

    int         ntupatts;

    int32       result;


    Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));

    Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));

    Assert(key->heapkeyspace || key->scantid == NULL);


    /*

     * Force result ">" if target item is first data item on an internal page

     * --- see NOTE above.

     */

    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))

        return 1;


    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

    ntupatts = BTreeTupleGetNAtts(itup, rel);


    /*

     * The scan key is set up with the attribute number associated with each

     * term in the key.  It is important that, if the index is multi-key, the

     * scan contain the first k key attributes, and that they be in order.  If

     * you think about how multi-key ordering works, you'll understand why

     * this is.

     *

     * We don't test for violation of this condition here, however.  The

     * initial setup for the index scan had better have gotten it right (see

     * _bt_first).

     */


    ncmpkey = Min(ntupatts, key->keysz);

    Assert(key->heapkeyspace || ncmpkey == key->keysz);

    Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);

    scankey = key->scankeys;

    for (int i = 1; i <= ncmpkey; i++)

    {

        Datum       datum;

        bool        isNull;


        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);


        if (scankey->sk_flags & SK_ISNULL)  /* key is NULL */

        {

            if (isNull)

                result = 0;     /* NULL "=" NULL */

            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)

                result = -1;    /* NULL "<" NOT_NULL */

            else

                result = 1;     /* NULL ">" NOT_NULL */

        }

        else if (isNull)        /* key is NOT_NULL and item is NULL */

        {

            if (scankey->sk_flags & SK_BT_NULLS_FIRST)

                result = 1;     /* NOT_NULL ">" NULL */

            else

                result = -1;    /* NOT_NULL "<" NULL */

        }

        else

        {

            /*

             * The sk_func needs to be passed the index value as left arg and

             * the sk_argument as right arg (they might be of different

             * types).  Since it is convenient for callers to think of

             * _bt_compare as comparing the scankey to the index item, we have

             * to flip the sign of the comparison result.  (Unless it's a DESC

             * column, in which case we *don't* flip the sign.)

             */

            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,

                                                     scankey->sk_collation,

                                                     datum,

                                                     scankey->sk_argument));


            if (!(scankey->sk_flags & SK_BT_DESC))

                INVERT_COMPARE_RESULT(result);

        }


        /* if the keys are unequal, return the difference */

        if (result != 0)

            return result;


        scankey++;

    }


    /*

     * All non-truncated attributes (other than heap TID) were found to be

     * equal.  Treat truncated attributes as minus infinity when scankey has a

     * key attribute value that would otherwise be compared directly.

     *

     * Note: it doesn't matter if ntupatts includes non-key attributes;

     * scankey won't, so explicitly excluding non-key attributes isn't

     * necessary.

     */

    if (key->keysz > ntupatts)

        return 1;


    /*

     * Use the heap TID attribute and scantid to try to break the tie.  The

     * rules are the same as any other key attribute -- only the

     * representation differs.

     */

    heapTid = BTreeTupleGetHeapTID(itup);

    if (key->scantid == NULL)

    {

        /*

         * Forward scans have a scankey that is considered greater than a

         * truncated pivot tuple if and when the scankey has equal values for

         * attributes up to and including the least significant untruncated

         * attribute in tuple.  Even attributes that were omitted from the

         * scan key are considered greater than -inf truncated attributes.

         * (See _bt_binsrch for an explanation of our backward scan behavior.)

         *

         * For example, if an index has the minimum two attributes (single

         * user key attribute, plus heap TID attribute), and a page's high key

         * is ('foo', -inf), and scankey is ('foo', <omitted>), the search

         * will not descend to the page to the left.  The search will descend

         * right instead.  The truncated attribute in pivot tuple means that

         * all non-pivot tuples on the page to the left are strictly < 'foo',

         * so it isn't necessary to descend left.  In other words, search

         * doesn't have to descend left because it isn't interested in a match

         * that has a heap TID value of -inf.

         *

         * Note: the heap TID part of the test ensures that scankey is being

         * compared to a pivot tuple with one or more truncated -inf key

         * attributes.  The heap TID attribute is the last key attribute in

         * every index, of course, but other than that it isn't special.

         */

        if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&

            key->heapkeyspace)

            return 1;


        /* All provided scankey arguments found to be equal */

        return 0;

    }


    /*

     * Treat truncated heap TID as minus infinity, since scankey has a key

     * attribute value (scantid) that would otherwise be compared directly

     */

    Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));

    if (heapTid == NULL)

        return 1;


    /*

     * Scankey must be treated as equal to a posting list tuple if its scantid

     * value falls within the range of the posting list.  In all other cases

     * there can only be a single heap TID value, which is compared directly

     * with scantid.

     */

    Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));

    result = ItemPointerCompare(key->scantid, heapTid);

    if (result <= 0 || !BTreeTupleIsPosting(itup))

        return result;

    else

    {

        result = ItemPointerCompare(key->scantid,

                                    BTreeTupleGetMaxHeapTID(itup));

        if (result > 0)

            return 1;

    }


    return 0;

}


/*

 *  _bt_first() -- Find the first item in a scan.

 *

 *      We need to be clever about the direction of scan, the search

 *      conditions, and the tree ordering.  We find the first item (or,

 *      if backwards scan, the last item) in the tree that satisfies the

 *      qualifications in the scan key.  On success exit, data about the

 *      matching tuple(s) on the page has been loaded into so->currPos.  We'll

 *      drop all locks and hold onto a pin on page's buffer, except when

 *      _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.

 *      _bt_returnitem sets the next item to return to scan on success exit.

 *

 * If there are no matching items in the index, we return false, with no

 * pins or locks held.  so->currPos will remain invalid.

 *

 * Note that scan->keyData[], and the so->keyData[] scankey built from it,

 * are both search-type scankeys (see nbtree/README for more about this).

 * Within this routine, we build a temporary insertion-type scankey to use

 * in locating the scan start position.

 */

bool

_bt_first(IndexScanDesc scan, ScanDirection dir)

{

    Relation    rel = scan->indexRelation;

    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    BTStack     stack;

    OffsetNumber offnum;

    BTScanInsertData inskey;

    ScanKey     startKeys[INDEX_MAX_KEYS];

    ScanKeyData notnullkeys[INDEX_MAX_KEYS];

    int         keysz = 0;

    StrategyNumber strat_total;

    BlockNumber blkno = InvalidBlockNumber,

                lastcurrblkno;


    Assert(!BTScanPosIsValid(so->currPos));


    /*

     * Examine the scan keys and eliminate any redundant keys; also mark the

     * keys that must be matched to continue the scan.

     */

    _bt_preprocess_keys(scan);


    /*

     * Quit now if _bt_preprocess_keys() discovered that the scan keys can

     * never be satisfied (eg, x == 1 AND x > 2).

     */

    if (!so->qual_ok)

    {

        Assert(!so->needPrimScan);

        _bt_parallel_done(scan);

        return false;

    }


    /*

     * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage

     * will likely release the parallel scan later on.

     */

    if (scan->parallel_scan != NULL &&

        !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))

        return false;


    /*

     * Initialize the scan's arrays (if any) for the current scan direction

     * (except when they were already set to later values as part of

     * scheduling the primitive index scan that is now underway)

     */

    if (so->numArrayKeys && !so->needPrimScan)

        _bt_start_array_keys(scan, dir);


    if (blkno != InvalidBlockNumber)

    {

        /*

         * We anticipated calling _bt_search, but another worker bet us to it.

         * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).

         */

        Assert(scan->parallel_scan != NULL);

        Assert(!so->needPrimScan);

        Assert(blkno != P_NONE);


        if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))

            return false;


        _bt_returnitem(scan, so);

        return true;

    }


    /*

     * Count an indexscan for stats, now that we know that we'll call

     * _bt_search/_bt_endpoint below

     */

    pgstat_count_index_scan(rel);

    if (scan->instrument)

        scan->instrument->nsearches++;


    /*----------

     * Examine the scan keys to discover where we need to start the scan.

     *

     * We want to identify the keys that can be used as starting boundaries;

     * these are =, >, or >= keys for a forward scan or =, <, <= keys for

     * a backwards scan.  We can use keys for multiple attributes so long as

     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept

     * a > or < boundary or find an attribute with no boundary (which can be

     * thought of as the same as "> -infinity"), we can't use keys for any

     * attributes to its right, because it would break our simplistic notion

     * of what initial positioning strategy to use.

     *

     * When the scan keys include cross-type operators, _bt_preprocess_keys

     * may not be able to eliminate redundant keys; in such cases we will

     * arbitrarily pick a usable one for each attribute.  This is correct

     * but possibly not optimal behavior.  (For example, with keys like

     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when

     * x=5 would be more efficient.)  Since the situation only arises given

     * a poorly-worded query plus an incomplete opfamily, live with it.

     *

     * When both equality and inequality keys appear for a single attribute

     * (again, only possible when cross-type operators appear), we *must*

     * select one of the equality keys for the starting point, because

     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.

     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to

     * start at x=4, we will fail and stop before reaching x=10.  If multiple

     * equality quals survive preprocessing, however, it doesn't matter which

     * one we use --- by definition, they are either redundant or

     * contradictory.

     *

     * In practice we rarely see any "attribute boundary key gaps" here.

     * Preprocessing can usually backfill skip array keys for any attributes

     * that were omitted from the original scan->keyData[] input keys.  All

     * array keys are always considered = keys, but we'll sometimes need to

     * treat the current key value as if we were using an inequality strategy.

     * This happens with range skip arrays, which store inequality keys in the

     * array's low_compare/high_compare fields (used to find the first/last

     * set of matches, when = key will lack a usable sk_argument value).

     * These are always preferred over any redundant "standard" inequality

     * keys on the same column (per the usual rule about preferring = keys).

     * Note also that any column with an = skip array key can never have an

     * additional, contradictory = key.

     *

     * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP

     * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.

     * If the index stores nulls at the end of the index we'll be starting

     * from, and we have no boundary key for the column (which means the key

     * we deduced NOT NULL from is an inequality key that constrains the other

     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to

     * use as a boundary key.  If we didn't do this, we might find ourselves

     * traversing a lot of null entries at the start of the scan.

     *

     * In this loop, row-comparison keys are treated the same as keys on their

     * first (leftmost) columns.  We'll add on lower-order columns of the row

     * comparison below, if possible.

     *

     * The selected scan keys (at most one per index column) are remembered by

     * storing their addresses into the local startKeys[] array.

     *

     * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start

     * the next primitive index scan (for scans with array keys) based in part

     * on an understanding of how it'll enable us to reposition the scan.

     * They're directly aware of how we'll sometimes cons up an explicit

     * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a

     * symmetric "deduce NOT NULL" rule of their own.  This allows top-level

     * scans to skip large groups of NULLs through repeated deductions about

     * key strictness (for a required inequality key) and whether NULLs in the

     * key's index column are stored last or first (relative to non-NULLs).

     * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might

     * need to be kept in sync.

     *----------

     */

    strat_total = BTEqualStrategyNumber;

    if (so->numberOfKeys > 0)

    {

        AttrNumber  curattr;

        ScanKey     chosen;

        ScanKey     impliesNN;

        ScanKey     cur;


        /*

         * chosen is the so-far-chosen key for the current attribute, if any.

         * We don't cast the decision in stone until we reach keys for the

         * next attribute.

         */

        cur = so->keyData;

        curattr = 1;

        chosen = NULL;

        /* Also remember any scankey that implies a NOT NULL constraint */

        impliesNN = NULL;


        /*

         * Loop iterates from 0 to numberOfKeys inclusive; we use the last

         * pass to handle after-last-key processing.  Actual exit from the

         * loop is at one of the "break" statements below.

         */

        for (int i = 0;; cur++, i++)

        {

            if (i >= so->numberOfKeys || cur->sk_attno != curattr)

            {

                /*

                 * Done looking at keys for curattr.

                 *

                 * If this is a scan key for a skip array whose current

                 * element is MINVAL, choose low_compare (when scanning

                 * backwards it'll be MAXVAL, and we'll choose high_compare).

                 *

                 * Note: if the array's low_compare key makes 'chosen' NULL,

                 * then we behave as if the array's first element is -inf,

                 * except when !array->null_elem implies a usable NOT NULL

                 * constraint.

                 */

                if (chosen != NULL &&

                    (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))

                {

                    int         ikey = chosen - so->keyData;

                    ScanKey     skipequalitykey = chosen;

                    BTArrayKeyInfo *array = NULL;


                    for (int arridx = 0; arridx < so->numArrayKeys; arridx++)

                    {

                        array = &so->arrayKeys[arridx];

                        if (array->scan_key == ikey)

                            break;

                    }


                    if (ScanDirectionIsForward(dir))

                    {

                        Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));

                        chosen = array->low_compare;

                    }

                    else

                    {

                        Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));

                        chosen = array->high_compare;

                    }


                    Assert(chosen == NULL ||

                           chosen->sk_attno == skipequalitykey->sk_attno);


                    if (!array->null_elem)

                        impliesNN = skipequalitykey;

                    else

                        Assert(chosen == NULL && impliesNN == NULL);

                }


                /*

                 * If we didn't find a usable boundary key, see if we can

                 * deduce a NOT NULL key

                 */

                if (chosen == NULL && impliesNN != NULL &&

                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?

                     ScanDirectionIsForward(dir) :

                     ScanDirectionIsBackward(dir)))

                {

                    /* Yes, so build the key in notnullkeys[keysz] */

                    chosen = &notnullkeys[keysz];

                    ScanKeyEntryInitialize(chosen,

                                           (SK_SEARCHNOTNULL | SK_ISNULL |

                                            (impliesNN->sk_flags &

                                             (SK_BT_DESC | SK_BT_NULLS_FIRST))),

                                           curattr,

                                           ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?

                                            BTGreaterStrategyNumber :

                                            BTLessStrategyNumber),

                                           InvalidOid,

                                           InvalidOid,

                                           InvalidOid,

                                           (Datum) 0);

                }


                /*

                 * If we still didn't find a usable boundary key, quit; else

                 * save the boundary key pointer in startKeys.

                 */

                if (chosen == NULL)

                    break;

                startKeys[keysz++] = chosen;


                /*

                 * We can only consider adding more boundary keys when the one

                 * that we just chose to add uses either the = or >= strategy

                 * (during backwards scans we can only do so when the key that

                 * we just added to startKeys[] uses the = or <= strategy)

                 */

                strat_total = chosen->sk_strategy;

                if (strat_total == BTGreaterStrategyNumber ||

                    strat_total == BTLessStrategyNumber)

                    break;


                /*

                 * If the key that we just added to startKeys[] is a skip

                 * array = key whose current element is marked NEXT or PRIOR,

                 * make strat_total > or < (and stop adding boundary keys).

                 * This can only happen with opclasses that lack skip support.

                 */

                if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))

                {

                    Assert(chosen->sk_flags & SK_BT_SKIP);

                    Assert(strat_total == BTEqualStrategyNumber);


                    if (ScanDirectionIsForward(dir))

                    {

                        Assert(!(chosen->sk_flags & SK_BT_PRIOR));

                        strat_total = BTGreaterStrategyNumber;

                    }

                    else

                    {

                        Assert(!(chosen->sk_flags & SK_BT_NEXT));

                        strat_total = BTLessStrategyNumber;

                    }


                    /*

                     * We're done.  We'll never find an exact = match for a

                     * NEXT or PRIOR sentinel sk_argument value.  There's no

                     * sense in trying to add more keys to startKeys[].

                     */

                    break;

                }


                /*

                 * Done if that was the last scan key output by preprocessing.

                 * Also done if there is a gap index attribute that lacks a

                 * usable key (only possible when preprocessing was unable to

                 * generate a skip array key to "fill in the gap").

                 */

                if (i >= so->numberOfKeys ||

                    cur->sk_attno != curattr + 1)

                    break;


                /*

                 * Reset for next attr.

                 */

                curattr = cur->sk_attno;

                chosen = NULL;

                impliesNN = NULL;

            }


            /*

             * Can we use this key as a starting boundary for this attr?

             *

             * If not, does it imply a NOT NULL constraint?  (Because

             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,

             * *any* inequality key works for that; we need not test.)

             */

            switch (cur->sk_strategy)

            {

                case BTLessStrategyNumber:

                case BTLessEqualStrategyNumber:

                    if (chosen == NULL)

                    {

                        if (ScanDirectionIsBackward(dir))

                            chosen = cur;

                        else

                            impliesNN = cur;

                    }

                    break;

                case BTEqualStrategyNumber:

                    /* override any non-equality choice */

                    chosen = cur;

                    break;

                case BTGreaterEqualStrategyNumber:

                case BTGreaterStrategyNumber:

                    if (chosen == NULL)

                    {

                        if (ScanDirectionIsForward(dir))

                            chosen = cur;

                        else

                            impliesNN = cur;

                    }

                    break;

            }

        }

    }


    /*

     * If we found no usable boundary keys, we have to start from one end of

     * the tree.  Walk down that edge to the first or last key, and scan from

     * there.

     *

     * Note: calls _bt_readfirstpage for us, which releases the parallel scan.

     */

    if (keysz == 0)

        return _bt_endpoint(scan, dir);


    /*

     * We want to start the scan somewhere within the index.  Set up an

     * insertion scankey we can use to search for the boundary point we

     * identified above.  The insertion scankey is built using the keys

     * identified by startKeys[].  (Remaining insertion scankey fields are

     * initialized after initial-positioning scan keys are finalized.)

     */

    Assert(keysz <= INDEX_MAX_KEYS);

    for (int i = 0; i < keysz; i++)

    {

        ScanKey     cur = startKeys[i];


        Assert(cur->sk_attno == i + 1);


        if (cur->sk_flags & SK_ROW_HEADER)

        {

            /*

             * Row comparison header: look to the first row member instead

             */

            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);


            /*

             * Cannot be a NULL in the first row member: _bt_preprocess_keys

             * would've marked the qual as unsatisfiable, preventing us from

             * ever getting this far

             */

            Assert(subkey->sk_flags & SK_ROW_MEMBER);

            Assert(subkey->sk_attno == cur->sk_attno);

            Assert(!(subkey->sk_flags & SK_ISNULL));


            /*

             * The member scankeys are already in insertion format (ie, they

             * have sk_func = 3-way-comparison function)

             */

            memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));


            /*

             * If the row comparison is the last positioning key we accepted,

             * try to add additional keys from the lower-order row members.

             * (If we accepted independent conditions on additional index

             * columns, we use those instead --- doesn't seem worth trying to

             * determine which is more restrictive.)  Note that this is OK

             * even if the row comparison is of ">" or "<" type, because the

             * condition applied to all but the last row member is effectively

             * ">=" or "<=", and so the extra keys don't break the positioning

             * scheme.  But, by the same token, if we aren't able to use all

             * the row members, then the part of the row comparison that we

             * did use has to be treated as just a ">=" or "<=" condition, and

             * so we'd better adjust strat_total accordingly.

             */

            if (i == keysz - 1)

            {

                bool        used_all_subkeys = false;


                Assert(!(subkey->sk_flags & SK_ROW_END));

                for (;;)

                {

                    subkey++;

                    Assert(subkey->sk_flags & SK_ROW_MEMBER);

                    if (subkey->sk_attno != keysz + 1)

                        break;  /* out-of-sequence, can't use it */

                    if (subkey->sk_strategy != cur->sk_strategy)

                        break;  /* wrong direction, can't use it */

                    if (subkey->sk_flags & SK_ISNULL)

                        break;  /* can't use null keys */

                    Assert(keysz < INDEX_MAX_KEYS);

                    memcpy(inskey.scankeys + keysz, subkey,

                           sizeof(ScanKeyData));

                    keysz++;

                    if (subkey->sk_flags & SK_ROW_END)

                    {

                        used_all_subkeys = true;

                        break;

                    }

                }

                if (!used_all_subkeys)

                {

                    switch (strat_total)

                    {

                        case BTLessStrategyNumber:

                            strat_total = BTLessEqualStrategyNumber;

                            break;

                        case BTGreaterStrategyNumber:

                            strat_total = BTGreaterEqualStrategyNumber;

                            break;

                    }

                }

                break;          /* done with outer loop */

            }

        }

        else

        {

            /*

             * Ordinary comparison key.  Transform the search-style scan key

             * to an insertion scan key by replacing the sk_func with the

             * appropriate btree comparison function.

             *

             * If scankey operator is not a cross-type comparison, we can use

             * the cached comparison function; otherwise gotta look it up in

             * the catalogs.  (That can't lead to infinite recursion, since no

             * indexscan initiated by syscache lookup will use cross-data-type

             * operators.)

             *

             * We support the convention that sk_subtype == InvalidOid means

             * the opclass input type; this is a hack to simplify life for

             * ScanKeyInit().

             */

            if (cur->sk_subtype == rel->rd_opcintype[i] ||

                cur->sk_subtype == InvalidOid)

            {

                FmgrInfo   *procinfo;


                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);

                ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,

                                               cur->sk_flags,

                                               cur->sk_attno,

                                               InvalidStrategy,

                                               cur->sk_subtype,

                                               cur->sk_collation,

                                               procinfo,

                                               cur->sk_argument);

            }

            else

            {

                RegProcedure cmp_proc;


                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],

                                             rel->rd_opcintype[i],

                                             cur->sk_subtype,

                                             BTORDER_PROC);

                if (!RegProcedureIsValid(cmp_proc))

                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",

                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,

                         cur->sk_attno, RelationGetRelationName(rel));

                ScanKeyEntryInitialize(inskey.scankeys + i,

                                       cur->sk_flags,

                                       cur->sk_attno,

                                       InvalidStrategy,

                                       cur->sk_subtype,

                                       cur->sk_collation,

                                       cmp_proc,

                                       cur->sk_argument);

            }

        }

    }


    /*----------

     * Examine the selected initial-positioning strategy to determine exactly

     * where we need to start the scan, and set flag variables to control the

     * initial descent by _bt_search (and our _bt_binsrch call for the leaf

     * page _bt_search returns).

     *----------

     */

    _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);

    inskey.anynullkeys = false; /* unused */

    inskey.scantid = NULL;

    inskey.keysz = keysz;

    switch (strat_total)

    {

        case BTLessStrategyNumber:


            inskey.nextkey = false;

            inskey.backward = true;

            break;


        case BTLessEqualStrategyNumber:


            inskey.nextkey = true;

            inskey.backward = true;

            break;


        case BTEqualStrategyNumber:


            /*

             * If a backward scan was specified, need to start with last equal

             * item not first one.

             */

            if (ScanDirectionIsBackward(dir))

            {

                /*

                 * This is the same as the <= strategy

                 */

                inskey.nextkey = true;

                inskey.backward = true;

            }

            else

            {

                /*

                 * This is the same as the >= strategy

                 */

                inskey.nextkey = false;

                inskey.backward = false;

            }

            break;


        case BTGreaterEqualStrategyNumber:


            /*

             * Find first item >= scankey

             */

            inskey.nextkey = false;

            inskey.backward = false;

            break;


        case BTGreaterStrategyNumber:


            /*

             * Find first item > scankey

             */

            inskey.nextkey = true;

            inskey.backward = false;

            break;


        default:

            /* can't get here, but keep compiler quiet */

            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);

            return false;

    }


    /*

     * Use the manufactured insertion scan key to descend the tree and

     * position ourselves on the target leaf page.

     */

    Assert(ScanDirectionIsBackward(dir) == inskey.backward);

    stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);


    /* don't need to keep the stack around... */

    _bt_freestack(stack);


    if (!BufferIsValid(so->currPos.buf))

    {

        /*

         * We only get here if the index is completely empty. Lock relation

         * because nothing finer to lock exists.  Without a buffer lock, it's

         * possible for another transaction to insert data between

         * _bt_search() and PredicateLockRelation().  We have to try again

         * after taking the relation-level predicate lock, to close a narrow

         * window where we wouldn't scan concurrently inserted tuples, but the

         * writer wouldn't see our predicate lock.

         */

        if (IsolationIsSerializable())

        {

            PredicateLockRelation(rel, scan->xs_snapshot);

            stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);

            _bt_freestack(stack);

        }


        if (!BufferIsValid(so->currPos.buf))

        {

            Assert(!so->needPrimScan);

            _bt_parallel_done(scan);

            return false;

        }

    }


    /* position to the precise item on the page */

    offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);


    /*

     * Now load data from the first page of the scan (usually the page

     * currently in so->currPos.buf).

     *

     * If inskey.nextkey = false and inskey.backward = false, offnum is

     * positioned at the first non-pivot tuple >= inskey.scankeys.

     *

     * If inskey.nextkey = false and inskey.backward = true, offnum is

     * positioned at the last non-pivot tuple < inskey.scankeys.

     *

     * If inskey.nextkey = true and inskey.backward = false, offnum is

     * positioned at the first non-pivot tuple > inskey.scankeys.

     *

     * If inskey.nextkey = true and inskey.backward = true, offnum is

     * positioned at the last non-pivot tuple <= inskey.scankeys.

     *

     * It's possible that _bt_binsrch returned an offnum that is out of bounds

     * for the page.  For example, when inskey is both < the leaf page's high

     * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".

     */

    if (!_bt_readfirstpage(scan, offnum, dir))

        return false;


    _bt_returnitem(scan, so);

    return true;

}


/*

 *  _bt_next() -- Get the next item in a scan.

 *

 *      On entry, so->currPos describes the current page, which may be pinned

 *      but is not locked, and so->currPos.itemIndex identifies which item was

 *      previously returned.

 *

 *      On success exit, so->currPos is updated as needed, and _bt_returnitem

 *      sets the next item to return to the scan.  so->currPos remains valid.

 *

 *      On failure exit (no more tuples), we invalidate so->currPos.  It'll

 *      still be possible for the scan to return tuples by changing direction,

 *      though we'll need to call _bt_first anew in that other direction.

 */

bool

_bt_next(IndexScanDesc scan, ScanDirection dir)

{

    BTScanOpaque so = (BTScanOpaque) scan->opaque;


    Assert(BTScanPosIsValid(so->currPos));


    /*

     * Advance to next tuple on current page; or if there's no more, try to

     * step to the next page with data.

     */

    if (ScanDirectionIsForward(dir))

    {

        if (++so->currPos.itemIndex > so->currPos.lastItem)

        {

            if (!_bt_steppage(scan, dir))

                return false;

        }

    }

    else

    {

        if (--so->currPos.itemIndex < so->currPos.firstItem)

        {

            if (!_bt_steppage(scan, dir))

                return false;

        }

    }


    _bt_returnitem(scan, so);

    return true;

}


/*

 *  _bt_readpage() -- Load data from current index page into so->currPos

 *

 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state

 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;

 * they are updated as appropriate.  All other fields of so->currPos are

 * initialized from scratch here.

 *

 * We scan the current page starting at offnum and moving in the indicated

 * direction.  All items matching the scan keys are loaded into currPos.items.

 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports

 * that there can be no more matching tuples in the current scan direction

 * (could just be for the current primitive index scan when scan has arrays).

 *

 * In the case of a parallel scan, caller must have called _bt_parallel_seize

 * prior to calling this function; this function will invoke

 * _bt_parallel_release before returning.

 *

 * Returns true if any matching items found on the page, false if none.

 */

static bool

_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,

             bool firstpage)

{

    Relation    rel = scan->indexRelation;

    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    Page        page;

    BTPageOpaque opaque;

    OffsetNumber minoff;

    OffsetNumber maxoff;

    BTReadPageState pstate;

    bool        arrayKeys;

    int         itemIndex,

                indnatts;


    /* save the page/buffer block number, along with its sibling links */

    page = BufferGetPage(so->currPos.buf);

    opaque = BTPageGetOpaque(page);

    so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);

    so->currPos.prevPage = opaque->btpo_prev;

    so->currPos.nextPage = opaque->btpo_next;


    Assert(!P_IGNORE(opaque));

    Assert(BTScanPosIsPinned(so->currPos));

    Assert(!so->needPrimScan);


    if (scan->parallel_scan)

    {

        /* allow next/prev page to be read by other worker without delay */

        if (ScanDirectionIsForward(dir))

            _bt_parallel_release(scan, so->currPos.nextPage,

                                 so->currPos.currPage);

        else

            _bt_parallel_release(scan, so->currPos.prevPage,

                                 so->currPos.currPage);

    }


    /* initialize remaining currPos fields related to current page */

    so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);

    so->currPos.dir = dir;

    so->currPos.nextTupleOffset = 0;

    /* either moreLeft or moreRight should be set now (may be unset later) */

    Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :

           so->currPos.moreLeft);


    PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);


    /* initialize local variables */

    indnatts = IndexRelationGetNumberOfAttributes(rel);

    arrayKeys = so->numArrayKeys != 0;

    minoff = P_FIRSTDATAKEY(opaque);

    maxoff = PageGetMaxOffsetNumber(page);


    /* initialize page-level state that we'll pass to _bt_checkkeys */

    pstate.minoff = minoff;

    pstate.maxoff = maxoff;

    pstate.finaltup = NULL;

    pstate.page = page;

    pstate.firstpage = firstpage;

    pstate.forcenonrequired = false;

    pstate.startikey = 0;

    pstate.offnum = InvalidOffsetNumber;

    pstate.skip = InvalidOffsetNumber;

    pstate.continuescan = true; /* default assumption */

    pstate.rechecks = 0;

    pstate.targetdistance = 0;

    pstate.nskipadvances = 0;


    if (ScanDirectionIsForward(dir))

    {

        /* SK_SEARCHARRAY forward scans must provide high key up front */

        if (arrayKeys)

        {

            if (!P_RIGHTMOST(opaque))

            {

                ItemId      iid = PageGetItemId(page, P_HIKEY);


                pstate.finaltup = (IndexTuple) PageGetItem(page, iid);


                if (so->scanBehind &&

                    !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))

                {

                    /* Schedule another primitive index scan after all */

                    so->currPos.moreRight = false;

                    so->needPrimScan = true;

                    if (scan->parallel_scan)

                        _bt_parallel_primscan_schedule(scan,

                                                       so->currPos.currPage);

                    return false;

                }

            }


            so->scanBehind = so->oppositeDirCheck = false;  /* reset */

        }


        /*

         * Consider pstate.startikey optimization once the ongoing primitive

         * index scan has already read at least one page

         */

        if (!pstate.firstpage && minoff < maxoff)

            _bt_set_startikey(scan, &pstate);


        /* load items[] in ascending order */

        itemIndex = 0;


        offnum = Max(offnum, minoff);


        while (offnum <= maxoff)

        {

            ItemId      iid = PageGetItemId(page, offnum);

            IndexTuple  itup;

            bool        passes_quals;


            /*

             * If the scan specifies not to return killed tuples, then we

             * treat a killed tuple as not passing the qual

             */

            if (scan->ignore_killed_tuples && ItemIdIsDead(iid))

            {

                offnum = OffsetNumberNext(offnum);

                continue;

            }


            itup = (IndexTuple) PageGetItem(page, iid);

            Assert(!BTreeTupleIsPivot(itup));


            pstate.offnum = offnum;

            passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,

                                         itup, indnatts);


            /*

             * Check if we need to skip ahead to a later tuple (only possible

             * when the scan uses array keys)

             */

            if (arrayKeys && OffsetNumberIsValid(pstate.skip))

            {

                Assert(!passes_quals && pstate.continuescan);

                Assert(offnum < pstate.skip);

                Assert(!pstate.forcenonrequired);


                offnum = pstate.skip;

                pstate.skip = InvalidOffsetNumber;

                continue;

            }


            if (passes_quals)

            {

                /* tuple passes all scan key conditions */

                if (!BTreeTupleIsPosting(itup))

                {

                    /* Remember it */

                    _bt_saveitem(so, itemIndex, offnum, itup);

                    itemIndex++;

                }

                else

                {

                    int         tupleOffset;


                    /*

                     * Set up state to return posting list, and remember first

                     * TID

                     */

                    tupleOffset =

                        _bt_setuppostingitems(so, itemIndex, offnum,

                                              BTreeTupleGetPostingN(itup, 0),

                                              itup);

                    itemIndex++;

                    /* Remember additional TIDs */

                    for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)

                    {

                        _bt_savepostingitem(so, itemIndex, offnum,

                                            BTreeTupleGetPostingN(itup, i),

                                            tupleOffset);

                        itemIndex++;

                    }

                }

            }

            /* When !continuescan, there can't be any more matches, so stop */

            if (!pstate.continuescan)

                break;


            offnum = OffsetNumberNext(offnum);

        }


        /*

         * We don't need to visit page to the right when the high key

         * indicates that no more matches will be found there.

         *

         * Checking the high key like this works out more often than you might

         * think.  Leaf page splits pick a split point between the two most

         * dissimilar tuples (this is weighed against the need to evenly share

         * free space).  Leaf pages with high key attribute values that can

         * only appear on non-pivot tuples on the right sibling page are

         * common.

         */

        if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))

        {

            ItemId      iid = PageGetItemId(page, P_HIKEY);

            IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);

            int         truncatt;


            /* Reset arrays, per _bt_set_startikey contract */

            if (pstate.forcenonrequired)

                _bt_start_array_keys(scan, dir);

            pstate.forcenonrequired = false;

            pstate.startikey = 0;   /* _bt_set_startikey ignores P_HIKEY */


            truncatt = BTreeTupleGetNAtts(itup, rel);

            _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);

        }


        if (!pstate.continuescan)

            so->currPos.moreRight = false;


        Assert(itemIndex <= MaxTIDsPerBTreePage);

        so->currPos.firstItem = 0;

        so->currPos.lastItem = itemIndex - 1;

        so->currPos.itemIndex = 0;

    }

    else

    {

        /* SK_SEARCHARRAY backward scans must provide final tuple up front */

        if (arrayKeys)

        {

            if (minoff <= maxoff && !P_LEFTMOST(opaque))

            {

                ItemId      iid = PageGetItemId(page, minoff);


                pstate.finaltup = (IndexTuple) PageGetItem(page, iid);


                if (so->scanBehind &&

                    !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))

                {

                    /* Schedule another primitive index scan after all */

                    so->currPos.moreLeft = false;

                    so->needPrimScan = true;

                    if (scan->parallel_scan)

                        _bt_parallel_primscan_schedule(scan,

                                                       so->currPos.currPage);

                    return false;

                }

            }


            so->scanBehind = so->oppositeDirCheck = false;  /* reset */

        }


        /*

         * Consider pstate.startikey optimization once the ongoing primitive

         * index scan has already read at least one page

         */

        if (!pstate.firstpage && minoff < maxoff)

            _bt_set_startikey(scan, &pstate);


        /* load items[] in descending order */

        itemIndex = MaxTIDsPerBTreePage;


        offnum = Min(offnum, maxoff);


        while (offnum >= minoff)

        {

            ItemId      iid = PageGetItemId(page, offnum);

            IndexTuple  itup;

            bool        tuple_alive;

            bool        passes_quals;


            /*

             * If the scan specifies not to return killed tuples, then we

             * treat a killed tuple as not passing the qual.  Most of the

             * time, it's a win to not bother examining the tuple's index

             * keys, but just skip to the next tuple (previous, actually,

             * since we're scanning backwards).  However, if this is the first

             * tuple on the page, we do check the index keys, to prevent

             * uselessly advancing to the page to the left.  This is similar

             * to the high key optimization used by forward scans.

             */

            if (scan->ignore_killed_tuples && ItemIdIsDead(iid))

            {

                if (offnum > minoff)

                {

                    offnum = OffsetNumberPrev(offnum);

                    continue;

                }


                tuple_alive = false;

            }

            else

                tuple_alive = true;


            itup = (IndexTuple) PageGetItem(page, iid);

            Assert(!BTreeTupleIsPivot(itup));


            pstate.offnum = offnum;

            if (arrayKeys && offnum == minoff && pstate.forcenonrequired)

            {

                /* Reset arrays, per _bt_set_startikey contract */

                pstate.forcenonrequired = false;

                pstate.startikey = 0;

                _bt_start_array_keys(scan, dir);

            }

            passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,

                                         itup, indnatts);


            if (arrayKeys && so->scanBehind)

            {

                /*

                 * Done scanning this page, but not done with the current

                 * primscan.

                 *

                 * Note: Forward scans don't check this explicitly, since they

                 * prefer to reuse pstate.skip for this instead.

                 */

                Assert(!passes_quals && pstate.continuescan);

                Assert(!pstate.forcenonrequired);


                break;

            }


            /*

             * Check if we need to skip ahead to a later tuple (only possible

             * when the scan uses array keys)

             */

            if (arrayKeys && OffsetNumberIsValid(pstate.skip))

            {

                Assert(!passes_quals && pstate.continuescan);

                Assert(offnum > pstate.skip);

                Assert(!pstate.forcenonrequired);


                offnum = pstate.skip;

                pstate.skip = InvalidOffsetNumber;

                continue;

            }


            if (passes_quals && tuple_alive)

            {

                /* tuple passes all scan key conditions */

                if (!BTreeTupleIsPosting(itup))

                {

                    /* Remember it */

                    itemIndex--;

                    _bt_saveitem(so, itemIndex, offnum, itup);

                }

                else

                {

                    int         tupleOffset;


                    /*

                     * Set up state to return posting list, and remember first

                     * TID.

                     *

                     * Note that we deliberately save/return items from

                     * posting lists in ascending heap TID order for backwards

                     * scans.  This allows _bt_killitems() to make a

                     * consistent assumption about the order of items

                     * associated with the same posting list tuple.

                     */

                    itemIndex--;

                    tupleOffset =

                        _bt_setuppostingitems(so, itemIndex, offnum,

                                              BTreeTupleGetPostingN(itup, 0),

                                              itup);

                    /* Remember additional TIDs */

                    for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)

                    {

                        itemIndex--;

                        _bt_savepostingitem(so, itemIndex, offnum,

                                            BTreeTupleGetPostingN(itup, i),

                                            tupleOffset);

                    }

                }

            }

            /* When !continuescan, there can't be any more matches, so stop */

            if (!pstate.continuescan)

                break;


            offnum = OffsetNumberPrev(offnum);

        }


        /*

         * We don't need to visit page to the left when no more matches will

         * be found there

         */

        if (!pstate.continuescan)

            so->currPos.moreLeft = false;


        Assert(itemIndex >= 0);

        so->currPos.firstItem = itemIndex;

        so->currPos.lastItem = MaxTIDsPerBTreePage - 1;

        so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;

    }


    /*

     * If _bt_set_startikey told us to temporarily treat the scan's keys as

     * nonrequired (possible only during scans with array keys), there must be

     * no lasting consequences for the scan's array keys.  The scan's arrays

     * should now have exactly the same elements as they would have had if the

     * nonrequired behavior had never been used.  (In general, a scan's arrays

     * are expected to track its progress through the index's key space.)

     *

     * We are required (by _bt_set_startikey) to call _bt_checkkeys against

     * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's

     * arrays to recover.  Assert that that step hasn't been missed.

     */

    Assert(!pstate.forcenonrequired);


    return (so->currPos.firstItem <= so->currPos.lastItem);

}


/* Save an index item into so->currPos.items[itemIndex] */

static void

_bt_saveitem(BTScanOpaque so, int itemIndex,

             OffsetNumber offnum, IndexTuple itup)

{

    BTScanPosItem *currItem = &so->currPos.items[itemIndex];


    Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));


    currItem->heapTid = itup->t_tid;

    currItem->indexOffset = offnum;

    if (so->currTuples)

    {

        Size        itupsz = IndexTupleSize(itup);


        currItem->tupleOffset = so->currPos.nextTupleOffset;

        memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);

        so->currPos.nextTupleOffset += MAXALIGN(itupsz);

    }

}


/*

 * Setup state to save TIDs/items from a single posting list tuple.

 *

 * Saves an index item into so->currPos.items[itemIndex] for TID that is

 * returned to scan first.  Second or subsequent TIDs for posting list should

 * be saved by calling _bt_savepostingitem().

 *

 * Returns an offset into tuple storage space that main tuple is stored at if

 * needed.

 */

static int

_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,

                      ItemPointer heapTid, IndexTuple itup)

{

    BTScanPosItem *currItem = &so->currPos.items[itemIndex];


    Assert(BTreeTupleIsPosting(itup));


    currItem->heapTid = *heapTid;

    currItem->indexOffset = offnum;

    if (so->currTuples)

    {

        /* Save base IndexTuple (truncate posting list) */

        IndexTuple  base;

        Size        itupsz = BTreeTupleGetPostingOffset(itup);


        itupsz = MAXALIGN(itupsz);

        currItem->tupleOffset = so->currPos.nextTupleOffset;

        base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);

        memcpy(base, itup, itupsz);

        /* Defensively reduce work area index tuple header size */

        base->t_info &= ~INDEX_SIZE_MASK;

        base->t_info |= itupsz;

        so->currPos.nextTupleOffset += itupsz;


        return currItem->tupleOffset;

    }


    return 0;

}


/*

 * Save an index item into so->currPos.items[itemIndex] for current posting

 * tuple.

 *

 * Assumes that _bt_setuppostingitems() has already been called for current

 * posting list tuple.  Caller passes its return value as tupleOffset.

 */

static inline void

_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,

                    ItemPointer heapTid, int tupleOffset)

{

    BTScanPosItem *currItem = &so->currPos.items[itemIndex];


    currItem->heapTid = *heapTid;

    currItem->indexOffset = offnum;


    /*

     * Have index-only scans return the same base IndexTuple for every TID

     * that originates from the same posting list

     */

    if (so->currTuples)

        currItem->tupleOffset = tupleOffset;

}


/*

 * Return the index item from so->currPos.items[so->currPos.itemIndex] to the

 * index scan by setting the relevant fields in caller's index scan descriptor

 */

static inline void

_bt_returnitem(IndexScanDesc scan, BTScanOpaque so)

{

    BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];


    /* Most recent _bt_readpage must have succeeded */

    Assert(BTScanPosIsValid(so->currPos));

    Assert(so->currPos.itemIndex >= so->currPos.firstItem);

    Assert(so->currPos.itemIndex <= so->currPos.lastItem);


    /* Return next item, per amgettuple contract */

    scan->xs_heaptid = currItem->heapTid;

    if (so->currTuples)

        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

}


/*

 *  _bt_steppage() -- Step to next page containing valid data for scan

 *

 * Wrapper on _bt_readnextpage that performs final steps for the current page.

 *

 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.

 * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped

 * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to

 * a valid block, in any case.

 */

static bool

_bt_steppage(IndexScanDesc scan, ScanDirection dir)

{

    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    BlockNumber blkno,

                lastcurrblkno;


    Assert(BTScanPosIsValid(so->currPos));


    /* Before leaving current page, deal with any killed items */

    if (so->numKilled > 0)

        _bt_killitems(scan);


    /*

     * Before we modify currPos, make a copy of the page data if there was a

     * mark position that needs it.

     */

    if (so->markItemIndex >= 0)

    {

        /* bump pin on current buffer for assignment to mark buffer */

        if (BTScanPosIsPinned(so->currPos))

            IncrBufferRefCount(so->currPos.buf);

        memcpy(&so->markPos, &so->currPos,

               offsetof(BTScanPosData, items[1]) +

               so->currPos.lastItem * sizeof(BTScanPosItem));

        if (so->markTuples)

            memcpy(so->markTuples, so->currTuples,

                   so->currPos.nextTupleOffset);

        so->markPos.itemIndex = so->markItemIndex;

        so->markItemIndex = -1;


        /*

         * If we're just about to start the next primitive index scan

         * (possible with a scan that has arrays keys, and needs to skip to

         * continue in the current scan direction), moreLeft/moreRight only

         * indicate the end of the current primitive index scan.  They must

         * never be taken to indicate that the top-level index scan has ended

         * (that would be wrong).

         *

         * We could handle this case by treating the current array keys as

         * markPos state.  But depending on the current array state like this

         * would add complexity.  Instead, we just unset markPos's copy of

         * moreRight or moreLeft (whichever might be affected), while making

         * btrestrpos reset the scan's arrays to their initial scan positions.

         * In effect, btrestrpos leaves advancing the arrays up to the first

         * _bt_readpage call (that takes place after it has restored markPos).

         */

        if (so->needPrimScan)

        {

            if (ScanDirectionIsForward(so->currPos.dir))

                so->markPos.moreRight = true;

            else

                so->markPos.moreLeft = true;

        }


        /* mark/restore not supported by parallel scans */

        Assert(!scan->parallel_scan);

    }


    BTScanPosUnpinIfPinned(so->currPos);


    /* Walk to the next page with data */

    if (ScanDirectionIsForward(dir))

        blkno = so->currPos.nextPage;

    else

        blkno = so->currPos.prevPage;

    lastcurrblkno = so->currPos.currPage;


    /*

     * Cancel primitive index scans that were scheduled when the call to

     * _bt_readpage for currPos happened to use the opposite direction to the

     * one that we're stepping in now.  (It's okay to leave the scan's array

     * keys as-is, since the next _bt_readpage will advance them.)

     */

    if (so->currPos.dir != dir)

        so->needPrimScan = false;


    return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);

}


/*

 *  _bt_readfirstpage() -- Read first page containing valid data for _bt_first

 *

 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might

 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.

 * _bt_checkkeys will stop the scan as soon as an equality qual fails (when

 * its scan key was marked required), so _bt_first _must_ pass us an offnum

 * exactly at the beginning of where equal tuples are to be found.  When we're

 * passed an offnum past the end of the page, we might still manage to stop

 * the scan on this page by calling _bt_checkkeys against the high key.  See

 * _bt_readpage for full details.

 *

 * On entry, so->currPos must be pinned and locked (so offnum stays valid).

 * Parallel scan callers must have seized the scan before calling here.

 *

 * On exit, we'll have updated so->currPos and retained locks and pins

 * according to the same rules as those laid out for _bt_readnextpage exit.

 * Like _bt_readnextpage, our return value indicates if there are any matching

 * records in the given direction.

 *

 * We always release the scan for a parallel scan caller, regardless of

 * success or failure; we'll call _bt_parallel_release as soon as possible.

 */

static bool

_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)

{

    BTScanOpaque so = (BTScanOpaque) scan->opaque;


    so->numKilled = 0;          /* just paranoia */

    so->markItemIndex = -1;     /* ditto */


    /* Initialize so->currPos for the first page (page in so->currPos.buf) */

    if (so->needPrimScan)

    {

        Assert(so->numArrayKeys);


        so->currPos.moreLeft = true;

        so->currPos.moreRight = true;

        so->needPrimScan = false;

    }

    else if (ScanDirectionIsForward(dir))

    {

        so->currPos.moreLeft = false;

        so->currPos.moreRight = true;

    }

    else

    {

        so->currPos.moreLeft = true;

        so->currPos.moreRight = false;

    }


    /*

     * Attempt to load matching tuples from the first page.

     *

     * Note that _bt_readpage will finish initializing the so->currPos fields.

     * _bt_readpage also releases parallel scan (even when it returns false).

     */

    if (_bt_readpage(scan, dir, offnum, true))

    {

        /*

         * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on

         * so->currPos.buf in preparation for btgettuple returning tuples.

         */

        Assert(BTScanPosIsPinned(so->currPos));

        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);

        return true;

    }


    /* There's no actually-matching data on the page in so->currPos.buf */

    _bt_unlockbuf(scan->indexRelation, so->currPos.buf);


    /* Call _bt_readnextpage using its _bt_steppage wrapper function */

    if (!_bt_steppage(scan, dir))

        return false;


    /* _bt_readpage for a later page (now in so->currPos) succeeded */

    return true;

}


/*

 *  _bt_readnextpage() -- Read next page containing valid data for _bt_next

 *

 * Caller's blkno is the next interesting page's link, taken from either the

 * previously-saved right link or left link.  lastcurrblkno is the page that

 * was current at the point where the blkno link was saved, which we use to

 * reason about concurrent page splits/page deletions during backwards scans.

 *

 * On entry, caller shouldn't hold any locks or pins on any page (we work

 * directly off of blkno and lastcurrblkno instead).  Parallel scan callers

 * that seized the scan before calling here should pass seized=true; such a

 * caller's blkno and lastcurrblkno arguments come from the seized scan.

 * seized=false callers just pass us the blkno/lastcurrblkno taken from their

 * so->currPos, which (along with so->currPos itself) can be used to end the

 * scan.  A seized=false caller's blkno can never be assumed to be the page

 * that must be read next during a parallel scan, though.  We must figure that

 * part out for ourselves by seizing the scan (the correct page to read might

 * already be beyond the seized=false caller's blkno during a parallel scan,

 * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,

 * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).

 *

 * On success exit, so->currPos is updated to contain data from the next

 * interesting page, and we return true.  We hold a pin on the buffer on

 * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe

 * to eagerly drop the pin (to avoid blocking VACUUM).

 *

 * If there are no more matching records in the given direction, we drop all

 * locks and pins, invalidate so->currPos, and return false.

 *

 * We always release the scan for a parallel scan caller, regardless of

 * success or failure; we'll call _bt_parallel_release as soon as possible.

 */

static bool

_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,

                 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)

{

    Relation    rel = scan->indexRelation;

    BTScanOpaque so = (BTScanOpaque) scan->opaque;


    Assert(so->currPos.currPage == lastcurrblkno || seized);

    Assert(!(blkno == P_NONE && seized));

    Assert(!BTScanPosIsPinned(so->currPos));


    /*

     * Remember that the scan already read lastcurrblkno, a page to the left

     * of blkno (or remember reading a page to the right, for backwards scans)

     */

    if (ScanDirectionIsForward(dir))

        so->currPos.moreLeft = true;

    else

        so->currPos.moreRight = true;


    for (;;)

    {

        Page        page;

        BTPageOpaque opaque;


        if (blkno == P_NONE ||

            (ScanDirectionIsForward(dir) ?

             !so->currPos.moreRight : !so->currPos.moreLeft))

        {

            /* most recent _bt_readpage call (for lastcurrblkno) ended scan */

            Assert(so->currPos.currPage == lastcurrblkno && !seized);

            BTScanPosInvalidate(so->currPos);

            _bt_parallel_done(scan);    /* iff !so->needPrimScan */

            return false;

        }


        Assert(!so->needPrimScan);


        /* parallel scan must never actually visit so->currPos blkno */

        if (!seized && scan->parallel_scan != NULL &&

            !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))

        {

            /* whole scan is now done (or another primitive scan required) */

            BTScanPosInvalidate(so->currPos);

            return false;

        }


        if (ScanDirectionIsForward(dir))

        {

            /* read blkno, but check for interrupts first */

            CHECK_FOR_INTERRUPTS();

            so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);

        }

        else

        {

            /* read blkno, avoiding race (also checks for interrupts) */

            so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,

                                                         lastcurrblkno);

            if (so->currPos.buf == InvalidBuffer)

            {

                /* must have been a concurrent deletion of leftmost page */

                BTScanPosInvalidate(so->currPos);

                _bt_parallel_done(scan);

                return false;

            }

        }


        page = BufferGetPage(so->currPos.buf);

        opaque = BTPageGetOpaque(page);

        lastcurrblkno = blkno;

        if (likely(!P_IGNORE(opaque)))

        {

            /* see if there are any matches on this page */

            if (ScanDirectionIsForward(dir))

            {

                /* note that this will clear moreRight if we can stop */

                if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))

                    break;

                blkno = so->currPos.nextPage;

            }

            else

            {

                /* note that this will clear moreLeft if we can stop */

                if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))

                    break;

                blkno = so->currPos.prevPage;

            }

        }

        else

        {

            /* _bt_readpage not called, so do all this for ourselves */

            if (ScanDirectionIsForward(dir))

                blkno = opaque->btpo_next;

            else

                blkno = opaque->btpo_prev;

            if (scan->parallel_scan != NULL)

                _bt_parallel_release(scan, blkno, lastcurrblkno);

        }


        /* no matching tuples on this page */

        _bt_relbuf(rel, so->currPos.buf);

        seized = false;         /* released by _bt_readpage (or by us) */

    }


    /*

     * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on

     * so->currPos.buf in preparation for btgettuple returning tuples.

     */

    Assert(so->currPos.currPage == blkno);

    Assert(BTScanPosIsPinned(so->currPos));

    _bt_drop_lock_and_maybe_pin(scan, &so->currPos);


    return true;

}


/*

 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,

 * recovering from concurrent page splits/page deletions when necessary

 *

 * Called during backwards scans, to deal with their unique concurrency rules.

 *

 * blkno points to the block number of the page that we expect to move the

 * scan to.  We'll successfully move the scan there when we find that its

 * right sibling link still points to lastcurrblkno (the page we just read).

 * Otherwise, we have to figure out which page is the correct one for the scan

 * to now read the hard way, reasoning about concurrent splits and deletions.

 * See nbtree/README.

 *

 * On return, we have both a pin and a read lock on the returned page, whose

 * block number will be set in *blkno.  Returns InvalidBuffer if there is no

 * page to the left (no lock or pin is held in that case).

 *

 * It is possible for the returned leaf page to be half-dead; caller must

 * check that condition and step left again when required.

 */

static Buffer

_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,

                           BlockNumber lastcurrblkno)

{

    BlockNumber origblkno = *blkno; /* detects circular links */


    for (;;)

    {

        Buffer      buf;

        Page        page;

        BTPageOpaque opaque;

        int         tries;


        /* check for interrupts while we're not holding any buffer lock */

        CHECK_FOR_INTERRUPTS();

        buf = _bt_getbuf(rel, *blkno, BT_READ);

        page = BufferGetPage(buf);

        opaque = BTPageGetOpaque(page);


        /*

         * If this isn't the page we want, walk right till we find what we

         * want --- but go no more than four hops (an arbitrary limit). If we

         * don't find the correct page by then, the most likely bet is that

         * lastcurrblkno got deleted and isn't in the sibling chain at all

         * anymore, not that its left sibling got split more than four times.

         *

         * Note that it is correct to test P_ISDELETED not P_IGNORE here,

         * because half-dead pages are still in the sibling chain.

         */

        tries = 0;

        for (;;)

        {

            if (likely(!P_ISDELETED(opaque) &&

                       opaque->btpo_next == lastcurrblkno))

            {

                /* Found desired page, return it */

                return buf;

            }

            if (P_RIGHTMOST(opaque) || ++tries > 4)

                break;

            /* step right */

            *blkno = opaque->btpo_next;

            buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);

            page = BufferGetPage(buf);

            opaque = BTPageGetOpaque(page);

        }


        /*

         * Return to the original page (usually the page most recently read by

         * _bt_readpage, which is passed by caller as lastcurrblkno) to see

         * what's up with its prev sibling link

         */

        buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);

        page = BufferGetPage(buf);

        opaque = BTPageGetOpaque(page);

        if (P_ISDELETED(opaque))

        {

            /*

             * It was deleted.  Move right to first nondeleted page (there

             * must be one); that is the page that has acquired the deleted

             * one's keyspace, so stepping left from it will take us where we

             * want to be.

             */

            for (;;)

            {

                if (P_RIGHTMOST(opaque))

                    elog(ERROR, "fell off the end of index \"%s\"",

                         RelationGetRelationName(rel));

                lastcurrblkno = opaque->btpo_next;

                buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);

                page = BufferGetPage(buf);

                opaque = BTPageGetOpaque(page);

                if (!P_ISDELETED(opaque))

                    break;

            }

        }

        else

        {

            /*

             * Original lastcurrblkno wasn't deleted; the explanation had

             * better be that the page to the left got split or deleted.

             * Without this check, we risk going into an infinite loop.

             */

            if (opaque->btpo_prev == origblkno)

                elog(ERROR, "could not find left sibling of block %u in index \"%s\"",

                     lastcurrblkno, RelationGetRelationName(rel));

            /* Okay to try again, since left sibling link changed */

        }


        /*

         * Original lastcurrblkno from caller was concurrently deleted (could

         * also have been a great many concurrent left sibling page splits).

         * Found a non-deleted page that should now act as our lastcurrblkno.

         */

        if (P_LEFTMOST(opaque))

        {

            /* New lastcurrblkno has no left sibling (concurrently deleted) */

            _bt_relbuf(rel, buf);

            break;

        }


        /* Start from scratch with new lastcurrblkno's blkno/prev link */

        *blkno = origblkno = opaque->btpo_prev;

        _bt_relbuf(rel, buf);

    }


    return InvalidBuffer;

}


/*

 * _bt_get_endpoint() -- Find the first or last page on a given tree level

 *

 * If the index is empty, we will return InvalidBuffer; any other failure

 * condition causes ereport().  We will not return a dead page.

 *

 * The returned buffer is pinned and read-locked.

 */

Buffer

_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)

{

    Buffer      buf;

    Page        page;

    BTPageOpaque opaque;

    OffsetNumber offnum;

    BlockNumber blkno;

    IndexTuple  itup;


    /*

     * If we are looking for a leaf page, okay to descend from fast root;

     * otherwise better descend from true root.  (There is no point in being

     * smarter about intermediate levels.)

     */

    if (level == 0)

        buf = _bt_getroot(rel, NULL, BT_READ);

    else

        buf = _bt_gettrueroot(rel);


    if (!BufferIsValid(buf))

        return InvalidBuffer;


    page = BufferGetPage(buf);

    opaque = BTPageGetOpaque(page);


    for (;;)

    {

        /*

         * If we landed on a deleted page, step right to find a live page

         * (there must be one).  Also, if we want the rightmost page, step

         * right if needed to get to it (this could happen if the page split

         * since we obtained a pointer to it).

         */

        while (P_IGNORE(opaque) ||

               (rightmost && !P_RIGHTMOST(opaque)))

        {

            blkno = opaque->btpo_next;

            if (blkno == P_NONE)

                elog(ERROR, "fell off the end of index \"%s\"",

                     RelationGetRelationName(rel));

            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);

            page = BufferGetPage(buf);

            opaque = BTPageGetOpaque(page);

        }


        /* Done? */

        if (opaque->btpo_level == level)

            break;

        if (opaque->btpo_level < level)

            ereport(ERROR,

                    (errcode(ERRCODE_INDEX_CORRUPTED),

                     errmsg_internal("btree level %u not found in index \"%s\"",

                                     level, RelationGetRelationName(rel))));


        /* Descend to leftmost or rightmost child page */

        if (rightmost)

            offnum = PageGetMaxOffsetNumber(page);

        else

            offnum = P_FIRSTDATAKEY(opaque);


        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

        blkno = BTreeTupleGetDownLink(itup);


        buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);

        page = BufferGetPage(buf);

        opaque = BTPageGetOpaque(page);

    }


    return buf;

}


/*

 *  _bt_endpoint() -- Find the first or last page in the index, and scan

 * from there to the first key satisfying all the quals.

 *

 * This is used by _bt_first() to set up a scan when we've determined

 * that the scan must start at the beginning or end of the index (for

 * a forward or backward scan respectively).

 *

 * Parallel scan callers must have seized the scan before calling here.

 * Exit conditions are the same as for _bt_first().

 */

static bool

_bt_endpoint(IndexScanDesc scan, ScanDirection dir)

{

    Relation    rel = scan->indexRelation;

    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    Page        page;

    BTPageOpaque opaque;

    OffsetNumber start;


    Assert(!BTScanPosIsValid(so->currPos));

    Assert(!so->needPrimScan);


    /*

     * Scan down to the leftmost or rightmost leaf page.  This is a simplified

     * version of _bt_search().

     */

    so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));


    if (!BufferIsValid(so->currPos.buf))

    {

        /*

         * Empty index. Lock the whole relation, as nothing finer to lock

         * exists.

         */

        PredicateLockRelation(rel, scan->xs_snapshot);

        _bt_parallel_done(scan);

        return false;

    }


    page = BufferGetPage(so->currPos.buf);

    opaque = BTPageGetOpaque(page);

    Assert(P_ISLEAF(opaque));


    if (ScanDirectionIsForward(dir))

    {

        /* There could be dead pages to the left, so not this: */

        /* Assert(P_LEFTMOST(opaque)); */


        start = P_FIRSTDATAKEY(opaque);

    }

    else if (ScanDirectionIsBackward(dir))

    {

        Assert(P_RIGHTMOST(opaque));


        start = PageGetMaxOffsetNumber(page);

    }

    else

    {

        elog(ERROR, "invalid scan direction: %d", (int) dir);

        start = 0;              /* keep compiler quiet */

    }


    /*

     * Now load data from the first page of the scan.

     */

    if (!_bt_readfirstpage(scan, start, dir))

        return false;


    _bt_returnitem(scan, so);

    return true;

}

AttrNumber
int16 AttrNumber
Definition: attnum.h:21

BlockNumber
uint32 BlockNumber
Definition: block.h:31

InvalidBlockNumber
#define InvalidBlockNumber
Definition: block.h:33

Buffer
int Buffer
Definition: buf.h:23

InvalidBuffer
#define InvalidBuffer
Definition: buf.h:25

IncrBufferRefCount
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5403

BufferGetBlockNumber
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4229

ReleaseBuffer
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5371

BufferGetLSNAtomic
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4491

BufferGetPage
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417

BufferIsValid
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368

PageGetItem
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354

PageGetItemId
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244

Page
PageData * Page
Definition: bufpage.h:82

PageGetMaxOffsetNumber
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372

RegProcedureIsValid
#define RegProcedureIsValid(p)
Definition: c.h:748

Min
#define Min(x, y)
Definition: c.h:975

INVERT_COMPARE_RESULT
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1077

likely
#define likely(x)
Definition: c.h:346

MAXALIGN
#define MAXALIGN(LEN)
Definition: c.h:782

Max
#define Max(x, y)
Definition: c.h:969

RegProcedure
regproc RegProcedure
Definition: c.h:621

int32
int32_t int32
Definition: c.h:498

unlikely
#define unlikely(x)
Definition: c.h:347

uint32
uint32_t uint32
Definition: c.h:502

Size
size_t Size
Definition: c.h:576

cur
struct cursor * cur
Definition: ecpg.c:29

errmsg_internal
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158

errcode
int errcode(int sqlerrcode)
Definition: elog.c:854

ERROR
#define ERROR
Definition: elog.h:39

elog
#define elog(elevel,...)
Definition: elog.h:225

ereport
#define ereport(elevel,...)
Definition: elog.h:149

FunctionCall2Coll
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149

Assert
Assert(PointerIsAligned(start, uint64))

start
return str start
Definition: hashfn_unstable.h:282

index_getprocinfo
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:907

i
int i
Definition: isn.c:77

if
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81

ItemIdIsDead
#define ItemIdIsDead(itemId)
Definition: itemid.h:113

ItemPointerCompare
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51

ItemPointerGetOffsetNumber
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124

ItemPointerGetBlockNumber
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103

IndexTuple
IndexTupleData * IndexTuple
Definition: itup.h:53

index_getattr
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131

IndexTupleSize
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71

get_opfamily_proc
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:888

lsyscache.h

palloc
void * palloc(Size size)
Definition: mcxt.c:1321

miscadmin.h

CHECK_FOR_INTERRUPTS
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122

sort-test.key
key
Definition: sort-test.py:19

_bt_finish_split
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241

_bt_relandgetbuf
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003

_bt_relbuf
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023

_bt_gettrueroot
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580

_bt_metaversion
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739

_bt_getbuf
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845

_bt_unlockbuf
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070

_bt_lockbuf
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039

_bt_getroot
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344

_bt_preprocess_keys
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtpreprocesskeys.c:182

_bt_parallel_primscan_schedule
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition: nbtree.c:999

_bt_parallel_seize
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:784

_bt_parallel_done
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:949

_bt_parallel_release
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:922

nbtree.h

BTScanPosIsPinned
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004

BTreeTupleGetNPosting
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519

BTreeTupleIsPivot
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481

BTStack
BTStackData * BTStack
Definition: nbtree.h:750

P_ISLEAF
#define P_ISLEAF(opaque)
Definition: nbtree.h:221

SK_BT_SKIP
#define SK_BT_SKIP
Definition: nbtree.h:1136

P_HIKEY
#define P_HIKEY
Definition: nbtree.h:368

BTORDER_PROC
#define BTORDER_PROC
Definition: nbtree.h:717

P_LEFTMOST
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219

BTPageGetOpaque
#define BTPageGetOpaque(page)
Definition: nbtree.h:74

SK_BT_PRIOR
#define SK_BT_PRIOR
Definition: nbtree.h:1142

P_ISDELETED
#define P_ISDELETED(opaque)
Definition: nbtree.h:223

SK_BT_NEXT
#define SK_BT_NEXT
Definition: nbtree.h:1141

MaxTIDsPerBTreePage
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186

BTScanPosIsValid
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021

P_FIRSTDATAKEY
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370

BTreeTupleGetPostingOffset
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:530

P_NONE
#define P_NONE
Definition: nbtree.h:213

P_RIGHTMOST
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220

SK_BT_NULLS_FIRST
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1147

P_INCOMPLETE_SPLIT
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228

BTreeTupleGetPostingN
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545

SK_BT_MAXVAL
#define SK_BT_MAXVAL
Definition: nbtree.h:1140

BT_READ
#define BT_READ
Definition: nbtree.h:730

BTreeTupleGetDownLink
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557

SK_BT_DESC
#define SK_BT_DESC
Definition: nbtree.h:1146

P_IGNORE
#define P_IGNORE(opaque)
Definition: nbtree.h:226

BTreeTupleGetMaxHeapTID
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665

BTreeTupleIsPosting
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493

BTScanPosInvalidate
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027

BTScanPosUnpinIfPinned
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015

BTreeTupleGetHeapTID
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639

BT_WRITE
#define BT_WRITE
Definition: nbtree.h:731

BTreeTupleGetNAtts
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578

SK_BT_MINVAL
#define SK_BT_MINVAL
Definition: nbtree.h:1139

BTScanOpaque
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1096

_bt_get_endpoint
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2559

_bt_moveright
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:241

_bt_binsrch_posting
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:602

_bt_first
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882

_bt_saveitem
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtsearch.c:2001

_bt_setuppostingitems
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup)
Definition: nbtsearch.c:2031

_bt_readnextpage
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2307

_bt_drop_lock_and_maybe_pin
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:67

_bt_readpage
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
Definition: nbtsearch.c:1593

_bt_binsrch
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343

_bt_endpoint
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2642

_bt_steppage
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2116

_bt_readfirstpage
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2219

_bt_lock_and_validate_left
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno)
Definition: nbtsearch.c:2442

_bt_savepostingitem
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtsearch.c:2069

_bt_search
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102

_bt_binsrch_insert
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:474

_bt_next
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1541

_bt_returnitem
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:2090

_bt_compare
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688

_bt_scanbehind_checkkeys
bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
Definition: nbtutils.c:2389

_bt_checkkeys
bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtutils.c:2261

_bt_freestack
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:187

_bt_killitems
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:3357

_bt_check_natts
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4103

_bt_set_startikey
void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
Definition: nbtutils.c:2502

_bt_start_array_keys
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:611

InvalidOffsetNumber
#define InvalidOffsetNumber
Definition: off.h:26

OffsetNumberIsValid
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39

OffsetNumberNext
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52

OffsetNumber
uint16 OffsetNumber
Definition: off.h:24

OffsetNumberPrev
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54

INDEX_MAX_KEYS
#define INDEX_MAX_KEYS
Definition: pg_config_manual.h:69

buf
static char * buf
Definition: pg_test_fsync.c:72

pgstat.h

pgstat_count_index_scan
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:694

postgres.h

Datum
uintptr_t Datum
Definition: postgres.h:69

DatumGetPointer
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317

DatumGetInt32
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207

InvalidOid
#define InvalidOid
Definition: postgres_ext.h:35

PredicateLockPage
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2599

PredicateLockRelation
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2576

predicate.h

access
short access
Definition: preproc-type.c:36

rel.h

RelationGetDescr
#define RelationGetDescr(relation)
Definition: rel.h:542

RelationGetRelationName
#define RelationGetRelationName(relation)
Definition: rel.h:550

RelationNeedsWAL
#define RelationNeedsWAL(relation)
Definition: rel.h:639

IndexRelationGetNumberOfAttributes
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:528

IndexRelationGetNumberOfKeyAttributes
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:535

relscan.h

ScanKeyEntryInitialize
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32

ScanKeyEntryInitializeWithInfo
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101

ScanDirectionIsForward
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64

ScanDirectionIsBackward
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50

ScanDirection
ScanDirection
Definition: sdir.h:25

SK_ROW_HEADER
#define SK_ROW_HEADER
Definition: skey.h:117

SK_ROW_MEMBER
#define SK_ROW_MEMBER
Definition: skey.h:118

SK_SEARCHNOTNULL
#define SK_SEARCHNOTNULL
Definition: skey.h:122

SK_ROW_END
#define SK_ROW_END
Definition: skey.h:119

ScanKey
ScanKeyData * ScanKey
Definition: skey.h:75

SK_ISNULL
#define SK_ISNULL
Definition: skey.h:115

IsMVCCSnapshot
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55

StrategyNumber
uint16 StrategyNumber
Definition: stratnum.h:22

BTGreaterStrategyNumber
#define BTGreaterStrategyNumber
Definition: stratnum.h:33

InvalidStrategy
#define InvalidStrategy
Definition: stratnum.h:24

BTLessStrategyNumber
#define BTLessStrategyNumber
Definition: stratnum.h:29

BTEqualStrategyNumber
#define BTEqualStrategyNumber
Definition: stratnum.h:31

BTLessEqualStrategyNumber
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30

BTGreaterEqualStrategyNumber
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32

BTArrayKeyInfo
Definition: nbtree.h:1035

BTArrayKeyInfo::high_compare
ScanKey high_compare
Definition: nbtree.h:1050

BTArrayKeyInfo::low_compare
ScanKey low_compare
Definition: nbtree.h:1049

BTArrayKeyInfo::scan_key
int scan_key
Definition: nbtree.h:1037

BTArrayKeyInfo::null_elem
bool null_elem
Definition: nbtree.h:1047

BTInsertStateData
Definition: nbtree.h:821

BTInsertStateData::stricthigh
OffsetNumber stricthigh
Definition: nbtree.h:836

BTInsertStateData::bounds_valid
bool bounds_valid
Definition: nbtree.h:834

BTInsertStateData::buf
Buffer buf
Definition: nbtree.h:827

BTInsertStateData::low
OffsetNumber low
Definition: nbtree.h:835

BTInsertStateData::postingoff
int postingoff
Definition: nbtree.h:843

BTInsertStateData::itup_key
BTScanInsert itup_key
Definition: nbtree.h:824

BTPageOpaqueData
Definition: nbtree.h:64

BTPageOpaqueData::btpo_next
BlockNumber btpo_next
Definition: nbtree.h:66

BTPageOpaqueData::btpo_prev
BlockNumber btpo_prev
Definition: nbtree.h:65

BTPageOpaqueData::btpo_level
uint32 btpo_level
Definition: nbtree.h:67

BTReadPageState
Definition: nbtree.h:1102

BTScanInsertData
Definition: nbtree.h:796

BTScanOpaqueData
Definition: nbtree.h:1054

BTScanOpaqueData::markItemIndex
int markItemIndex
Definition: nbtree.h:1089

BTScanOpaqueData::numberOfKeys
int numberOfKeys
Definition: nbtree.h:1057

BTScanOpaqueData::needPrimScan
bool needPrimScan
Definition: nbtree.h:1063

BTScanOpaqueData::numKilled
int numKilled
Definition: nbtree.h:1072

BTScanOpaqueData::qual_ok
bool qual_ok
Definition: nbtree.h:1056

BTScanOpaqueData::arrayKeys
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066

BTScanOpaqueData::numArrayKeys
int numArrayKeys
Definition: nbtree.h:1061

BTScanOpaqueData::markTuples
char * markTuples
Definition: nbtree.h:1080

BTScanOpaqueData::currPos
BTScanPosData currPos
Definition: nbtree.h:1092

BTScanOpaqueData::currTuples
char * currTuples
Definition: nbtree.h:1079

BTScanOpaqueData::oppositeDirCheck
bool oppositeDirCheck
Definition: nbtree.h:1065

BTScanOpaqueData::scanBehind
bool scanBehind
Definition: nbtree.h:1064

BTScanOpaqueData::markPos
BTScanPosData markPos
Definition: nbtree.h:1093

BTScanOpaqueData::keyData
ScanKey keyData
Definition: nbtree.h:1058

BTScanPosData
Definition: nbtree.h:963

BTScanPosData::moreRight
bool moreRight
Definition: nbtree.h:986

BTScanPosData::buf
Buffer buf
Definition: nbtree.h:964

BTScanPosData::currPage
BlockNumber currPage
Definition: nbtree.h:967

BTScanPosData::firstItem
int firstItem
Definition: nbtree.h:995

BTScanPosData::nextTupleOffset
int nextTupleOffset
Definition: nbtree.h:979

BTScanPosData::prevPage
BlockNumber prevPage
Definition: nbtree.h:968

BTScanPosData::nextPage
BlockNumber nextPage
Definition: nbtree.h:969

BTScanPosData::moreLeft
bool moreLeft
Definition: nbtree.h:985

BTScanPosData::lastItem
int lastItem
Definition: nbtree.h:996

BTScanPosData::items
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999

BTScanPosData::itemIndex
int itemIndex
Definition: nbtree.h:997

BTScanPosData::dir
ScanDirection dir
Definition: nbtree.h:973

BTScanPosData::lsn
XLogRecPtr lsn
Definition: nbtree.h:970

BTScanPosItem
Definition: nbtree.h:956

BTScanPosItem::heapTid
ItemPointerData heapTid
Definition: nbtree.h:957

BTScanPosItem::tupleOffset
LocationIndex tupleOffset
Definition: nbtree.h:959

BTScanPosItem::indexOffset
OffsetNumber indexOffset
Definition: nbtree.h:958

BTStackData
Definition: nbtree.h:744

BTStackData::bts_blkno
BlockNumber bts_blkno
Definition: nbtree.h:745

BTStackData::bts_parent
struct BTStackData * bts_parent
Definition: nbtree.h:747

BTStackData::bts_offset
OffsetNumber bts_offset
Definition: nbtree.h:746

FmgrInfo
Definition: fmgr.h:57

IndexScanDescData
Definition: relscan.h:134

IndexScanDescData::parallel_scan
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:191

IndexScanDescData::ignore_killed_tuples
bool ignore_killed_tuples
Definition: relscan.h:148

IndexScanDescData::instrument
struct IndexScanInstrumentation * instrument
Definition: relscan.h:159

IndexScanDescData::xs_want_itup
bool xs_want_itup
Definition: relscan.h:143

IndexScanDescData::xs_itup
IndexTuple xs_itup
Definition: relscan.h:167

IndexScanDescData::indexRelation
Relation indexRelation
Definition: relscan.h:137

IndexScanDescData::xs_heaptid
ItemPointerData xs_heaptid
Definition: relscan.h:172

IndexScanDescData::opaque
void * opaque
Definition: relscan.h:153

IndexScanDescData::xs_snapshot
struct SnapshotData * xs_snapshot
Definition: relscan.h:138

IndexScanInstrumentation::nsearches
uint64 nsearches
Definition: genam.h:38

IndexTupleData
Definition: itup.h:36

IndexTupleData::t_tid
ItemPointerData t_tid
Definition: itup.h:37

IndexTupleData::t_info
unsigned short t_info
Definition: itup.h:49

ItemIdData
Definition: itemid.h:26

ItemPointerData
Definition: itemptr.h:37

RelationData
Definition: rel.h:56

RelationData::rd_opcintype
Oid * rd_opcintype
Definition: rel.h:208

RelationData::rd_opfamily
Oid * rd_opfamily
Definition: rel.h:207

ScanKeyData
Definition: skey.h:65

ScanKeyData::sk_flags
int sk_flags
Definition: skey.h:66

ScanKeyData::sk_argument
Datum sk_argument
Definition: skey.h:72

ScanKeyData::sk_func
FmgrInfo sk_func
Definition: skey.h:71

ScanKeyData::sk_collation
Oid sk_collation
Definition: skey.h:70

ScanKeyData::sk_strategy
StrategyNumber sk_strategy
Definition: skey.h:68

ScanKeyData::sk_attno
AttrNumber sk_attno
Definition: skey.h:67

TupleDescData
Definition: tupdesc.h:136

items
static ItemArray items
Definition: test_tidstore.c:48

xact.h

IsolationIsSerializable
#define IsolationIsSerializable()
Definition: xact.h:52