#include "postgres.h"
#include "access/nbtree.h"
#include "common/int.h"

Include dependency graph for nbtsplitloc.c:

Data Structures
struct	SplitPoint

struct	FindSplitData

Macros
#define	LEAF_SPLIT_DISTANCE 0.050

#define	INTERNAL_SPLIT_DISTANCE 0.075

Enumerations
enum	FindSplitStrat { SPLIT_DEFAULT , SPLIT_MANY_DUPLICATES , SPLIT_SINGLE_VALUE }

Functions
static void	_bt_recsplitloc (FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)

static void	_bt_deltasortsplits (FindSplitData *state, double fillfactormult, bool usemult)

static int	_bt_splitcmp (const void *arg1, const void *arg2)

static bool	_bt_afternewitemoff (FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)

static bool	_bt_adjacenthtid (ItemPointer lowhtid, ItemPointer highhtid)

static OffsetNumber	_bt_bestsplitloc (FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)

static int	_bt_defaultinterval (FindSplitData *state)

static int	_bt_strategy (FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)

static void	_bt_interval_edges (FindSplitData *state, SplitPoint **leftinterval, SplitPoint **rightinterval)

static int	_bt_split_penalty (FindSplitData *state, SplitPoint *split)

static IndexTuple	_bt_split_lastleft (FindSplitData *state, SplitPoint *split)

static IndexTuple	_bt_split_firstright (FindSplitData *state, SplitPoint *split)

OffsetNumber	_bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)

Macro Definition Documentation

◆ INTERNAL_SPLIT_DISTANCE

#define INTERNAL_SPLIT_DISTANCE 0.075

Definition at line 850 of file nbtsplitloc.c.

◆ LEAF_SPLIT_DISTANCE

#define LEAF_SPLIT_DISTANCE 0.050

Definition at line 849 of file nbtsplitloc.c.

Enumeration Type Documentation

◆ FindSplitStrat

enum FindSplitStrat

Enumerator
SPLIT_DEFAULT
SPLIT_MANY_DUPLICATES
SPLIT_SINGLE_VALUE

Definition at line 20 of file nbtsplitloc.c.

{
    /*
     * Strategy for searching through the materialized list of candidate
     * split points.  _bt_findsplitloc() starts out with the default strategy
     * and may switch to one of the other two when the initial split interval
     * is full of duplicates.
     */

    /*
     * Default: favor an even balance of free space, while giving some weight
     * to suffix truncation.
     */
    SPLIT_DEFAULT,              /* give some weight to truncation */

    /*
     * Used when a heap TID would otherwise have to be appended to the new
     * high key, but the page isn't completely full of logical duplicates.
     */
    SPLIT_MANY_DUPLICATES,      /* find minimally distinguishing point */

    /*
     * Used when appending a heap TID to the new high key cannot be avoided.
     */
    SPLIT_SINGLE_VALUE,         /* leave left page almost full */
} FindSplitStrat;

Function Documentation

◆ _bt_adjacenthtid()

static bool _bt_adjacenthtid	(	ItemPointer	lowhtid,
		ItemPointer	highhtid
	)

static

Definition at line 749 of file nbtsplitloc.c.

{
    /*
     * Report whether highhtid plausibly points to the heap tuple inserted
     * right after the one that lowhtid points to.  Returns true when both
     * TIDs are on the same heap block, or when highhtid is the first offset
     * of the block immediately following lowhtid's block; false otherwise.
     */
    const BlockNumber blk_lo = ItemPointerGetBlockNumber(lowhtid);
    const BlockNumber blk_hi = ItemPointerGetBlockNumber(highhtid);

    /* Matching heap blocks: optimistically assume the TIDs are adjacent */
    if (blk_hi == blk_lo)
        return true;

    /* Anything other than the very next heap block is never adjacent */
    if (blk_hi != blk_lo + 1)
        return false;

    /* One block up only counts when high TID is that block's first offset */
    return ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber;
}

References FirstOffsetNumber, ItemPointerGetBlockNumber(), and ItemPointerGetOffsetNumber().

Referenced by _bt_afternewitemoff().

◆ _bt_afternewitemoff()

static bool _bt_afternewitemoff	(	FindSplitData *	state,
		OffsetNumber	maxoff,
		int	leaffillfactor,
		bool *	usemult
	)

static

Definition at line 630 of file nbtsplitloc.c.

{
    /*
     * Decide whether the "split after new item" optimization should be
     * applied to a non-rightmost leaf page.
     *
     * Returns true when the optimization applies, in which case *usemult is
     * set: true tells the caller to apply the leaf fillfactor multiplier,
     * false tells it to split at the exact point just after the new item.
     * Returns false (leaving *usemult untouched) when the optimization does
     * not apply.
     */
    int16       nkeys;
    bool        rightofall;
    OffsetNumber prevoff;
    ItemId      previd;
    IndexTuple  prevtup;
    int         natts;

    Assert(state->is_leaf && !state->is_rightmost);

    nkeys = IndexRelationGetNumberOfKeyAttributes(state->rel);

    /*
     * Single key indexes are not considered here, and an ascending insertion
     * pattern is never inferred when the new item is first on the page.
     */
    if (nkeys == 1 || state->newitemoff == P_FIRSTKEY)
        return false;

    /*
     * Only pages with equisized tuples qualify, since ordinal keys are
     * likely to be fixed-width.  The page is taken to have equisized tuples
     * when the new item is as wide as the smallest firstright tuple seen
     * during the pass over the page, and all the existing data items add up
     * to exactly that width times their number.  (The existing high key is
     * not counted in olddataitemstotal, hence "maxoff - 1".)
     */
    if (state->newitemsz != state->minfirstrightsz ||
        state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
        return false;

    /*
     * Tuples wider than one made of two non-NULL int8/int64 attributes (or
     * four non-NULL int4/int32 attributes) do not qualify.
     */
    if (state->newitemsz >
        MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
        sizeof(ItemIdData))
        return false;

    /*
     * Locate the existing tuple that will immediately precede the new item.
     * When the new item goes to the right of everything on the page, that is
     * simply the last existing item.
     */
    rightofall = (state->newitemoff > maxoff);
    prevoff = rightofall ? maxoff : OffsetNumberPrev(state->newitemoff);
    previd = PageGetItemId(state->origpage, prevoff);
    prevtup = (IndexTuple) PageGetItem(state->origpage, previd);

    /*
     * An insertion to the right of all existing items is suggestive of
     * monotonically increasing insertions in itself.  Otherwise, require
     * heap TID adjacency with the preceding (non-posting-list) tuple: it
     * strongly suggests that the item just to the left was inserted very
     * recently, which limits overapplication of the optimization with "low
     * cardinality leading column, high cardinality suffix column" indexes
     * that have a random insertion pattern.  These tests are cheaper than
     * the attribute comparison below, so they go first.
     */
    if (!rightofall &&
        (BTreeTupleIsPosting(prevtup) ||
         !_bt_adjacenthtid(&prevtup->t_tid, &state->newitem->t_tid)))
        return false;

    /*
     * At least the first attribute must match the preceding tuple, and the
     * new item must not be a full duplicate of it.
     */
    natts = _bt_keep_natts_fast(state->rel, prevtup, state->newitem);
    if (natts <= 1 || natts > nkeys)
        return false;

    if (rightofall)
        *usemult = true;
    else
    {
        double      position = (double) state->newitemoff / ((double) maxoff + 1);
        double      fillfraction = (double) leaffillfactor / 100.0;

        /*
         * Don't let the caller split after the new item when that would put
         * the split point to the right of where a leaf fillfactor split
         * would go -- have the caller apply leaf fillfactor instead.
         */
        *usemult = position > fillfraction;
    }

    return true;
}

References _bt_adjacenthtid(), _bt_keep_natts_fast(), Assert(), BTreeTupleIsPosting(), IndexRelationGetNumberOfKeyAttributes, MAXALIGN, OffsetNumberPrev, P_FIRSTKEY, PageGetItem(), PageGetItemId(), and IndexTupleData::t_tid.

Referenced by _bt_findsplitloc().

◆ _bt_bestsplitloc()

static OffsetNumber _bt_bestsplitloc	(	FindSplitData *	state,
		int	perfectpenalty,
		bool *	newitemonleft,
		FindSplitStrat	strategy
	)

static

Definition at line 788 of file nbtsplitloc.c.

{
    /*
     * Pick the final split point from the first "interval" entries of the
     * sorted state->splits array.  The lowest-penalty entry wins (earliest
     * entry on ties); the search stops early once an entry's penalty is no
     * greater than perfectpenalty.  Sets *newitemonleft and returns the
     * chosen split's firstrightoff.
     */
    SplitPoint *candidates = state->splits;
    SplitPoint *chosen = candidates;
    int         ncandidates = (state->interval < state->nsplits) ?
        state->interval : state->nsplits;
    int         lowest = INT_MAX;
    int         idx = 0;

    while (idx < ncandidates)
    {
        SplitPoint *cur = candidates + idx;
        int         cost = _bt_split_penalty(state, cur);

        if (cost < lowest)
        {
            lowest = cost;
            chosen = cur;
        }

        /* Good enough -- no need to look at remaining candidates */
        if (cost <= perfectpenalty)
            break;

        idx++;
    }

    /*
     * The "many duplicates" strategy risks repeatedly doing the wrong thing
     * when there are monotonically decreasing insertions to the right of a
     * large group of duplicates: repeated splits could leave a succession of
     * right half pages with free space that can never be used.
     *
     * For example, take the leftmost page of a single integer attribute
     * NULLS FIRST index that is almost filled with NULLs.  Monotonically
     * decreasing integer insertions might split that same page at the same
     * point again and again, because each new high key comes from the lowest
     * value just right of the NULLs, which is higher than every future
     * insertion.
     *
     * Avoid this by falling back on the first (delta-optimal) split in the
     * array when the new item lands just to the right of the would-be "many
     * duplicates" split point.  The "just to the right" test is deliberately
     * conservative.
     */
    if (strategy == SPLIT_MANY_DUPLICATES &&
        !state->is_rightmost &&
        !chosen->newitemonleft &&
        chosen->firstrightoff >= state->newitemoff &&
        chosen->firstrightoff < state->newitemoff + 9)
        chosen = candidates;

    *newitemonleft = chosen->newitemonleft;
    return chosen->firstrightoff;
}

References _bt_split_penalty(), final, i, Min, and SPLIT_MANY_DUPLICATES.

Referenced by _bt_findsplitloc().

◆ _bt_defaultinterval()

static int _bt_defaultinterval ( FindSplitData * state )

static

Definition at line 876 of file nbtsplitloc.c.

{
    /*
     * Compute the default strategy's split interval: the number of leading
     * entries in the sorted state->splits array whose division of free space
     * stays within a tolerance of the first (most evenly balanced) entry.
     * Returns the index of the first unacceptable entry, or state->nsplits
     * when every entry is acceptable.
     */
    const double distance = state->is_leaf ?
        LEAF_SPLIT_DISTANCE : INTERNAL_SPLIT_DISTANCE;
    SplitPoint *balanced = state->splits;

    /*
     * Tolerance is a fraction of the space taken up by existing data items.
     * The resulting interval is about 10% of nsplits in the common case
     * where all data items on a leaf page are uniformly sized; internal
     * pages get a somewhat larger fraction.
     */
    int16       slack = state->olddataitemstotal * distance;

    /* Acceptable leftfree/rightfree window around the balanced split */
    int16       minleft = balanced->leftfree - slack;
    int16       minright = balanced->rightfree - slack;
    int16       maxleft = balanced->leftfree + slack;
    int16       maxright = balanced->rightfree + slack;
    int         pos = 1;

    /*
     * Walk the split points that follow the balanced one, stopping at the
     * first whose free space falls outside the window.  (curdelta cannot be
     * used for this, since its value is often weighted.)
     */
    while (pos < state->nsplits)
    {
        const SplitPoint *cand = &state->splits[pos];
        bool        leftok = (cand->leftfree >= minleft &&
                              cand->leftfree <= maxleft);
        bool        rightok = (cand->rightfree >= minright &&
                               cand->rightfree <= maxright);

        if (!leftok || !rightok)
            return pos;

        pos++;
    }

    return state->nsplits;
}

References i, INTERNAL_SPLIT_DISTANCE, LEAF_SPLIT_DISTANCE, SplitPoint::leftfree, and SplitPoint::rightfree.

Referenced by _bt_findsplitloc().

◆ _bt_deltasortsplits()

static void _bt_deltasortsplits	(	FindSplitData *	state,
		double	fillfactormult,
		bool	usemult
	)

static

Definition at line 566 of file nbtsplitloc.c.

{
    /*
     * Assign every recorded split point a delta -- the absolute imbalance
     * between left and right free space, optionally weighted by
     * fillfactormult when usemult is set -- and then sort state->splits
     * using _bt_splitcmp.
     */
    SplitPoint *const stop = state->splits + state->nsplits;

    for (SplitPoint *sp = state->splits; sp < stop; sp++)
    {
        int16       delta;

        if (!usemult)
            delta = sp->leftfree - sp->rightfree;
        else
            delta = fillfactormult * sp->leftfree -
                (1.0 - fillfactormult) * sp->rightfree;

        /* Only the magnitude of the imbalance matters */
        delta = (delta < 0) ? -delta : delta;

        sp->curdelta = delta;
    }

    qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
}

References _bt_splitcmp(), SplitPoint::curdelta, i, if(), SplitPoint::leftfree, qsort, and SplitPoint::rightfree.

Referenced by _bt_findsplitloc().

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc	(	Relation	rel,
		Page	origpage,
		OffsetNumber	newitemoff,
		Size	newitemsz,
		IndexTuple	newitem,
		bool *	newitemonleft
	)

Definition at line 129 of file nbtsplitloc.c.

{
    /*
     * Choose where to split origpage, given that newitem (of size newitemsz)
     * is about to be inserted at offset newitemoff.
     *
     * Returns the offset of the first tuple that should go on the right
     * page, and sets *newitemonleft to indicate which side of the split the
     * new item belongs on.  Raises an ERROR if no legal split point can be
     * found.
     *
     * The array of candidate split points is palloc'd here and pfree'd
     * before returning normally.
     */
    BTPageOpaque opaque;
    int         leftspace,
                rightspace,
                olddataitemstotal,
                olddataitemstoleft,
                perfectpenalty,
                leaffillfactor;
    FindSplitData state;
    FindSplitStrat strategy;
    ItemId      itemid;
    OffsetNumber offnum,
                maxoff,
                firstrightoff;
    double      fillfactormult;
    bool        usemult;
    SplitPoint  leftpage,
                rightpage;

    opaque = BTPageGetOpaque(origpage);
    maxoff = PageGetMaxOffsetNumber(origpage);

    /* Total free space available on a btree page, after fixed overhead */
    leftspace = rightspace =
        PageGetPageSize(origpage) - SizeOfPageHeaderData -
        MAXALIGN(sizeof(BTPageOpaqueData));

    /* The right page will have the same high key as the old page */
    if (!P_RIGHTMOST(opaque))
    {
        itemid = PageGetItemId(origpage, P_HIKEY);
        rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
                             sizeof(ItemIdData));
    }

    /* Count up total space in data items before actually scanning 'em */
    olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
    leaffillfactor = BTGetFillFactor(rel);

    /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
    newitemsz += sizeof(ItemIdData);
    state.rel = rel;
    state.origpage = origpage;
    state.newitem = newitem;
    state.newitemsz = newitemsz;
    state.is_leaf = P_ISLEAF(opaque);
    state.is_rightmost = P_RIGHTMOST(opaque);
    state.leftspace = leftspace;
    state.rightspace = rightspace;
    state.olddataitemstotal = olddataitemstotal;
    /* Start at the maximum; _bt_recsplitloc() lowers it via Min() */
    state.minfirstrightsz = SIZE_MAX;
    state.newitemoff = newitemoff;

    /* newitem cannot be a posting list item */
    Assert(!BTreeTupleIsPosting(newitem));

    /*
     * nsplits should never exceed maxoff because there will be at most as
     * many candidate split points as there are points _between_ tuples, once
     * you imagine that the new item is already on the original page (the
     * final number of splits may be slightly lower because not all points
     * between tuples will be legal).
     */
    state.maxsplits = maxoff;
    state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
    state.nsplits = 0;

    /*
     * Scan through the data items and calculate space usage for a split at
     * each possible position
     */
    olddataitemstoleft = 0;

    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        Size        itemsz;

        itemid = PageGetItemId(origpage, offnum);
        itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

        /*
         * When item offset number is not newitemoff, neither side of the
         * split can be newitem.  Record a split after the previous data item
         * from original page, but before the current data item from original
         * page. (_bt_recsplitloc() will reject the split when there are no
         * previous items, which we rely on.)
         */
        if (offnum < newitemoff)
            _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
        else if (offnum > newitemoff)
            _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
        else
        {
            /*
             * Record a split after all "offnum < newitemoff" original page
             * data items, but before newitem
             */
            _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);

            /*
             * Record a split after newitem, but before data item from
             * original page at offset newitemoff/current offset
             */
            _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
        }

        /* Current item will be to the left of every later split point */
        olddataitemstoleft += itemsz;
    }

    /*
     * Record a split after all original page data items, but before newitem.
     * (Though only when it's possible that newitem will end up alone on new
     * right page.)
     */
    Assert(olddataitemstoleft == olddataitemstotal);
    if (newitemoff > maxoff)
        _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);

    /*
     * I believe it is not possible to fail to find a feasible split, but just
     * in case ...
     */
    if (state.nsplits == 0)
        elog(ERROR, "could not find a feasible split point for index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Start search for a split point among list of legal split points.  Give
     * primary consideration to equalizing available free space in each half
     * of the split initially (start with default strategy), while applying
     * rightmost and split-after-new-item optimizations where appropriate.
     * Either of the two other fallback strategies may be required for cases
     * with a large number of duplicates around the original/space-optimal
     * split point.
     *
     * Default strategy gives some weight to suffix truncation in deciding a
     * split point on leaf pages.  It attempts to select a split point where a
     * distinguishing attribute appears earlier in the new high key for the
     * left side of the split, in order to maximize the number of trailing
     * attributes that can be truncated away.  Only candidate split points
     * that imply an acceptable balance of free space on each side are
     * considered.  See _bt_defaultinterval().
     */
    if (!state.is_leaf)
    {
        /* fillfactormult only used on rightmost page */
        usemult = state.is_rightmost;
        fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
    }
    else if (state.is_rightmost)
    {
        /* Rightmost leaf page --  fillfactormult always used */
        usemult = true;
        fillfactormult = leaffillfactor / 100.0;
    }
    else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
    {
        /*
         * New item inserted at rightmost point among a localized grouping on
         * a leaf page -- apply "split after new item" optimization, either by
         * applying leaf fillfactor multiplier, or by choosing the exact split
         * point that leaves newitem as lastleft. (usemult is set for us.)
         */
        if (usemult)
        {
            /* fillfactormult should be set based on leaf fillfactor */
            fillfactormult = leaffillfactor / 100.0;
        }
        else
        {
            /* find precise split point after newitemoff */
            for (int i = 0; i < state.nsplits; i++)
            {
                SplitPoint *split = state.splits + i;

                if (split->newitemonleft &&
                    newitemoff == split->firstrightoff)
                {
                    /* Early exit: release the candidate array first */
                    pfree(state.splits);
                    *newitemonleft = true;
                    return newitemoff;
                }
            }

            /*
             * Cannot legally split after newitemoff; proceed with split
             * without using fillfactor multiplier.  This is defensive, and
             * should never be needed in practice.
             */
            fillfactormult = 0.50;
        }
    }
    else
    {
        /* Other leaf page.  50:50 page split. */
        usemult = false;
        /* fillfactormult not used, but be tidy */
        fillfactormult = 0.50;
    }

    /*
     * Save leftmost and rightmost splits for page before original ordinal
     * sort order is lost by delta/fillfactormult sort
     */
    leftpage = state.splits[0];
    rightpage = state.splits[state.nsplits - 1];

    /* Give split points a fillfactormult-wise delta, and sort on deltas */
    _bt_deltasortsplits(&state, fillfactormult, usemult);

    /* Determine split interval for default strategy */
    state.interval = _bt_defaultinterval(&state);

    /*
     * Determine if default strategy/split interval will produce a
     * sufficiently distinguishing split, or if we should change strategies.
     * Alternative strategies change the range of split points that are
     * considered acceptable (split interval), and possibly change
     * fillfactormult, in order to deal with pages with a large number of
     * duplicates gracefully.
     *
     * Pass low and high splits for the entire page (actually, they're for an
     * imaginary version of the page that includes newitem).  These are used
     * when the initial split interval encloses split points that are full of
     * duplicates, and we need to consider if it's even possible to avoid
     * appending a heap TID.
     */
    perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);

    if (strategy == SPLIT_DEFAULT)
    {
        /*
         * Default strategy worked out (always works out with internal page).
         * Original split interval still stands.
         */
    }

    /*
     * Many duplicates strategy is used when a heap TID would otherwise be
     * appended, but the page isn't completely full of logical duplicates.
     *
     * The split interval is widened to include all legal candidate split
     * points.  There might be as few as two distinct values in the whole-page
     * split interval, though it's also possible that most of the values on
     * the page are unique.  The final split point will either be to the
     * immediate left or to the immediate right of the group of duplicate
     * tuples that enclose the first/delta-optimal split point (perfect
     * penalty was set so that the lowest delta split point that avoids
     * appending a heap TID will be chosen).  Maximizing the number of
     * attributes that can be truncated away is not a goal of the many
     * duplicates strategy.
     *
     * Single value strategy is used when it is impossible to avoid appending
     * a heap TID.  It arranges to leave the left page very full.  This
     * maximizes space utilization in cases where tuples with the same
     * attribute values span many pages.  Newly inserted duplicates will tend
     * to have higher heap TID values, so we'll end up splitting to the right
     * consistently.  (Single value strategy is harmless though not
     * particularly useful with !heapkeyspace indexes.)
     */
    else if (strategy == SPLIT_MANY_DUPLICATES)
    {
        Assert(state.is_leaf);
        /* Shouldn't try to truncate away extra user attributes */
        Assert(perfectpenalty ==
               IndexRelationGetNumberOfKeyAttributes(state.rel));
        /* No need to resort splits -- no change in fillfactormult/deltas */
        state.interval = state.nsplits;
    }
    else if (strategy == SPLIT_SINGLE_VALUE)
    {
        Assert(state.is_leaf);
        /* Split near the end of the page */
        usemult = true;
        fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
        /* Resort split points with new delta */
        _bt_deltasortsplits(&state, fillfactormult, usemult);
        /* Appending a heap TID is unavoidable, so interval of 1 is fine */
        state.interval = 1;
    }

    /*
     * Search among acceptable split points (using final split interval) for
     * the entry that has the lowest penalty, and is therefore expected to
     * maximize fan-out.  Sets *newitemonleft for us.
     */
    firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
                                     strategy);
    pfree(state.splits);

    return firstrightoff;
}

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert(), BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().

◆ _bt_interval_edges()

static void _bt_interval_edges	(	FindSplitData *	state,
		SplitPoint **	leftinterval,
		SplitPoint **	rightinterval
	)

static

Definition at line 1052 of file nbtsplitloc.c.

{
    /*
     * Find the leftmost and rightmost split points within the current split
     * interval (the first Min(interval, nsplits) entries of state->splits),
     * and hand them back through *leftinterval and *rightinterval.
     *
     * Position is judged relative to the delta-optimal split, which is the
     * first array entry.
     */
    SplitPoint *reference = state->splits;
    int         pos = Min(state->interval, state->nsplits) - 1;

    *leftinterval = NULL;
    *rightinterval = NULL;

    /*
     * Delta is an absolute distance to the optimal split point, so both the
     * leftmost and the rightmost split point will usually be found near the
     * end of the interval.  Search backwards for that reason.
     */
    while (pos >= 0)
    {
        SplitPoint *cur = &state->splits[pos];
        bool        isleft = false;
        bool        isright = false;

        if (cur->firstrightoff < reference->firstrightoff)
            isleft = true;
        else if (cur->firstrightoff > reference->firstrightoff)
            isright = true;
        else if (!cur->newitemonleft && reference->newitemonleft)
        {
            /*
             * "incoming tuple will become firstright" (cur) is to the left
             * of "incoming tuple will become lastleft" (delta-optimal)
             */
            Assert(cur->firstrightoff == state->newitemoff);
            isleft = true;
        }
        else if (cur->newitemonleft && !reference->newitemonleft)
        {
            /*
             * "incoming tuple will become lastleft" (cur) is to the right of
             * "incoming tuple will become firstright" (delta-optimal)
             */
            Assert(cur->firstrightoff == state->newitemoff);
            isright = true;
        }
        else
        {
            /* There was only one or two splits in initial split interval */
            Assert(cur == reference);
            isleft = true;
            isright = true;
        }

        /* The first split seen on each side is that side's edge */
        if (isleft && *leftinterval == NULL)
            *leftinterval = cur;
        if (isright && *rightinterval == NULL)
            *rightinterval = cur;

        if (*leftinterval && *rightinterval)
            return;

        pos--;
    }

    /* Both edges are expected to have been found by now */
    Assert(false);
}

References Assert(), SplitPoint::firstrightoff, i, Min, and SplitPoint::newitemonleft.

Referenced by _bt_strategy().

◆ _bt_recsplitloc()

static void _bt_recsplitloc	(	FindSplitData *	state,
		OffsetNumber	firstrightoff,
		bool	newitemonleft,
		int	olddataitemstoleft,
		Size	firstrightofforigpagetuplesz
	)

static

Definition at line 449 of file nbtsplitloc.c.

{
    /*
     * Evaluate one candidate split point, and append it to state->splits
     * when it is legal (i.e. when neither half would end up with negative
     * free space).
     *
     * firstrightoff is the offset of the first tuple for the right page.
     * newitemonleft says which side the new item goes on.
     * olddataitemstoleft is the space used by original page data items that
     * go on the left page.  firstrightofforigpagetuplesz is the size of the
     * original page tuple at firstrightoff as computed by the caller
     * (MAXALIGN'd length plus line pointer); it is not used when the new
     * item itself is the firstright tuple.
     *
     * Also maintains state->minfirstrightsz, the smallest firstright tuple
     * size among legal splits.
     */
    int16       leftfree,
                rightfree;
    Size        firstrightsz;
    Size        postingsz = 0;
    bool        newitemisfirstright;

    /* Is the new item going to be split point's firstright tuple? */
    newitemisfirstright = (firstrightoff == state->newitemoff &&
                           !newitemonleft);

    if (newitemisfirstright)
        firstrightsz = state->newitemsz;
    else
    {
        firstrightsz = firstrightofforigpagetuplesz;

        /*
         * Calculate suffix truncation space saving when firstright tuple is a
         * posting list tuple, though only when the tuple is over 64 bytes
         * including line pointer overhead (arbitrary).  This avoids accessing
         * the tuple in cases where its posting list must be very small (if
         * tuple has one at all).
         *
         * Note: We don't do this in the case where firstright tuple is
         * newitem, since newitem cannot have a posting list.
         */
        if (state->is_leaf && firstrightsz > 64)
        {
            ItemId      itemid;
            IndexTuple  newhighkey;

            itemid = PageGetItemId(state->origpage, firstrightoff);
            newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);

            if (BTreeTupleIsPosting(newhighkey))
                postingsz = IndexTupleSize(newhighkey) -
                    BTreeTupleGetPostingOffset(newhighkey);
        }
    }

    /* Account for all the old tuples */
    leftfree = state->leftspace - olddataitemstoleft;
    rightfree = state->rightspace -
        (state->olddataitemstotal - olddataitemstoleft);

    /*
     * The first item on the right page becomes the high key of the left page;
     * therefore it counts against left space as well as right space (we
     * cannot assume that suffix truncation will make it any smaller).  When
     * index has included attributes, then those attributes of left page high
     * key will be truncated leaving that page with slightly more free space.
     * However, that shouldn't affect our ability to find valid split
     * location, since we err in the direction of being pessimistic about free
     * space on the left half.  Besides, even when suffix truncation of
     * non-TID attributes occurs, the new high key often won't even be a
     * single MAXALIGN() quantum smaller than the firstright tuple it's based
     * on.
     *
     * If we are on the leaf level, assume that suffix truncation cannot avoid
     * adding a heap TID to the left half's new high key when splitting at the
     * leaf level.  In practice the new high key will often be smaller and
     * will rarely be larger, but conservatively assume the worst case.  We do
     * go to the trouble of subtracting away posting list overhead, though
     * only when it looks like it will make an appreciable difference.
     * (Posting lists are the only case where truncation will typically make
     * the final high key far smaller than firstright, so being a bit more
     * precise there noticeably improves the balance of free space.)
     */
    if (state->is_leaf)
        leftfree -= (int16) (firstrightsz +
                             MAXALIGN(sizeof(ItemPointerData)) -
                             postingsz);
    else
        leftfree -= (int16) firstrightsz;

    /* account for the new item */
    if (newitemonleft)
        leftfree -= (int16) state->newitemsz;
    else
        rightfree -= (int16) state->newitemsz;

    /*
     * If we are not on the leaf level, we will be able to discard the key
     * data from the first item that winds up on the right page.
     */
    if (!state->is_leaf)
        rightfree += (int16) firstrightsz -
            (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));

    /* Record split if legal */
    if (leftfree >= 0 && rightfree >= 0)
    {
        Assert(state->nsplits < state->maxsplits);

        /* Determine smallest firstright tuple size among legal splits */
        state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);

        /* curdelta is filled in later, by _bt_deltasortsplits() */
        state->splits[state->nsplits].curdelta = 0;
        state->splits[state->nsplits].leftfree = leftfree;
        state->splits[state->nsplits].rightfree = rightfree;
        state->splits[state->nsplits].firstrightoff = firstrightoff;
        state->splits[state->nsplits].newitemonleft = newitemonleft;
        state->nsplits++;
    }
}

References Assert(), BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, Min, PageGetItem(), and PageGetItemId().

Referenced by _bt_findsplitloc().

◆ _bt_split_firstright()

static IndexTuple _bt_split_firstright	(	FindSplitData *	state,
		SplitPoint *	split
	)

inline static

Definition at line 1175 of file nbtsplitloc.c.

{
    IndexTuple  firstright;

    /*
     * Get the tuple that becomes the first tuple on the right half for this
     * candidate split.  The incoming item fills that role only when it is
     * destined for the right page and the split's firstrightoff equals the
     * new item's offset; in every other case the tuple is read from the
     * original page at firstrightoff.
     */
    if (split->newitemonleft || split->firstrightoff != state->newitemoff)
    {
        ItemId      lp = PageGetItemId(state->origpage, split->firstrightoff);

        firstright = (IndexTuple) PageGetItem(state->origpage, lp);
    }
    else
        firstright = state->newitem;

    return firstright;
}

References SplitPoint::firstrightoff, SplitPoint::newitemonleft, PageGetItem(), and PageGetItemId().

Referenced by _bt_split_penalty(), and _bt_strategy().

◆ _bt_split_lastleft()

static IndexTuple _bt_split_lastleft	(	FindSplitData *	state,
		SplitPoint *	split
	)

inline static

Definition at line 1159 of file nbtsplitloc.c.

{
    IndexTuple  lastleft;

    /*
     * Get the tuple that becomes the last tuple on the left half for this
     * candidate split.  The incoming item fills that role only when it is
     * destined for the left page and the split's firstrightoff equals the
     * new item's offset; otherwise the tuple is the item on the original
     * page immediately before firstrightoff.
     */
    if (!split->newitemonleft || split->firstrightoff != state->newitemoff)
    {
        ItemId      lp = PageGetItemId(state->origpage,
                                       OffsetNumberPrev(split->firstrightoff));

        lastleft = (IndexTuple) PageGetItem(state->origpage, lp);
    }
    else
        lastleft = state->newitem;

    return lastleft;
}

References SplitPoint::firstrightoff, SplitPoint::newitemonleft, OffsetNumberPrev, PageGetItem(), and PageGetItemId().

Referenced by _bt_split_penalty(), and _bt_strategy().

◆ _bt_split_penalty()

static int _bt_split_penalty	(	FindSplitData *	state,
		SplitPoint *	split
	)

inline static

Definition at line 1131 of file nbtsplitloc.c.

{
    ItemId      lp;

    /*
     * Compute the penalty of a candidate split point.
     *
     * Leaf pages: the penalty is whatever _bt_keep_natts_fast() reports for
     * the lastleft/firstright pair that enclose the split point.
     */
    if (state->is_leaf)
    {
        IndexTuple  left = _bt_split_lastleft(state, split);
        IndexTuple  right = _bt_split_firstright(state, split);

        return _bt_keep_natts_fast(state->rel, left, right);
    }

    /*
     * Internal pages: the penalty is the size of the firstright tuple.  When
     * the new item is itself firstright, newitemsz is returned as-is
     * (NOTE(review): presumably newitemsz already accounts for line pointer
     * overhead, matching the other branch -- confirm against the caller).
     */
    if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
        return state->newitemsz;

    /* Existing tuple: aligned tuple length plus its line pointer */
    lp = PageGetItemId(state->origpage, split->firstrightoff);

    return MAXALIGN(ItemIdGetLength(lp)) + sizeof(ItemIdData);
}

References _bt_keep_natts_fast(), _bt_split_firstright(), _bt_split_lastleft(), SplitPoint::firstrightoff, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, and PageGetItemId().

Referenced by _bt_bestsplitloc().

◆ _bt_splitcmp()

static int _bt_splitcmp	(	const void *	arg1,
		const void *	arg2
	)

static

Definition at line 594 of file nbtsplitloc.c.

{
    /*
     * Comparator ordering SplitPoint entries by ascending curdelta.  The
     * elements are only read, so constness is preserved through the casts.
     */
    const SplitPoint *lhs = (const SplitPoint *) arg1;
    const SplitPoint *rhs = (const SplitPoint *) arg2;

    return pg_cmp_s16(lhs->curdelta, rhs->curdelta);
}

References SplitPoint::curdelta, and pg_cmp_s16().

Referenced by _bt_deltasortsplits().

◆ _bt_strategy()

static int _bt_strategy	(	FindSplitData *	state,
		SplitPoint *	leftpage,
		SplitPoint *	rightpage,
		FindSplitStrat *	strategy
	)

static

Definition at line 934 of file nbtsplitloc.c.

{
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
    SplitPoint *intervalfirst,
               *intervallast;
    IndexTuple  lefttup,
                righttup;
    ItemId      hikeyid;
    IndexTuple  hikey;
    int         penalty;

    /* Default strategy applies unless something below decides otherwise */
    *strategy = SPLIT_DEFAULT;

    /*
     * Internal pages: the perfect penalty is the smallest firstright tuple
     * size seen across the whole page (strictly, across the imaginary page
     * that also contains newitem).  Most or all candidate splits commonly
     * share one firstright size, so this saves cycles in the common case.
     */
    if (!state->is_leaf)
        return state->minfirstrightsz;

    /*
     * Leaf pages: begin with the current split interval.  Compare the
     * lastleft tuple of its leftmost split against the firstright tuple of
     * its rightmost split.
     */
    _bt_interval_edges(state, &intervalfirst, &intervallast);
    lefttup = _bt_split_lastleft(state, intervalfirst);
    righttup = _bt_split_firstright(state, intervallast);

    /*
     * When the initial interval contains a split point that at least avoids
     * appending a heap TID to the new high key, nothing more is needed: the
     * default strategy and the initial interval are used as they stand.
     */
    penalty = _bt_keep_natts_fast(state->rel, lefttup, righttup);
    if (penalty <= nkeyatts)
        return penalty;

    /*
     * Even the "perfect" penalty for the default interval shows that no
     * split inside it avoids appending a heap TID.  Widen the view to every
     * possible split by comparing the leftmost split's lastleft tuple with
     * the rightmost split's firstright tuple.
     */
    lefttup = _bt_split_lastleft(state, leftpage);
    righttup = _bt_split_firstright(state, rightpage);
    penalty = _bt_keep_natts_fast(state->rel, lefttup, righttup);

    /*
     * A page (counting the new item) that holds many duplicates without
     * consisting solely of duplicates gets a many duplicates split.
     *
     * Such a split should land on one side or the other of the group of
     * duplicates enclosing the delta-optimal split point.  Returning
     * nkeyatts rather than the true perfect penalty makes that happen; were
     * the true penalty returned, low cardinality composite indexes could
     * suffer continual unbalanced splits.
     *
     * The caller still declines a many duplicates split in the rare case
     * where there appear to be ever-decreasing insertions to the immediate
     * right of the split point.  That check has to run just before the final
     * decision is made, inside _bt_bestsplitloc().
     */
    if (penalty <= nkeyatts)
    {
        *strategy = SPLIT_MANY_DUPLICATES;
        return nkeyatts;
    }

    /*
     * The page is entirely full of duplicates.  Single value strategy only
     * suits ever-increasing heap TIDs; anywhere else the default strategy
     * must proceed so that pathological performance is avoided.  The
     * rightmost page qualifies immediately.
     */
    if (state->is_rightmost)
    {
        *strategy = SPLIT_SINGLE_VALUE;
        return penalty;
    }

    /*
     * For any other page, consult the high key to infer whether this is the
     * rightmost page among those storing the same duplicate value.  Heap
     * TIDs that arrive slightly out of order should still get single value
     * strategy, as that is expected when concurrent inserters add the same
     * duplicate value.
     */
    hikeyid = PageGetItemId(state->origpage, P_HIKEY);
    hikey = (IndexTuple) PageGetItem(state->origpage, hikeyid);
    penalty = _bt_keep_natts_fast(state->rel, hikey, state->newitem);
    if (penalty <= nkeyatts)
        *strategy = SPLIT_SINGLE_VALUE;

    /*
     * When the check above fails, the page does not appear to be the
     * rightmost page for duplicates of the value it is filled with, so the
     * caller finishes the split with the default strategy.
     */
    return penalty;
}

References _bt_interval_edges(), _bt_keep_natts_fast(), _bt_split_firstright(), _bt_split_lastleft(), IndexRelationGetNumberOfKeyAttributes, P_HIKEY, PageGetItem(), PageGetItemId(), SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_findsplitloc().

Data Structures

Macros

Enumerations

Functions

Macro Definition Documentation

◆ INTERNAL_SPLIT_DISTANCE

◆ LEAF_SPLIT_DISTANCE

Enumeration Type Documentation

◆ FindSplitStrat

Function Documentation

◆ _bt_adjacenthtid()

◆ _bt_afternewitemoff()

◆ _bt_bestsplitloc()

◆ _bt_defaultinterval()

◆ _bt_deltasortsplits()

◆ _bt_findsplitloc()

◆ _bt_interval_edges()

◆ _bt_recsplitloc()

◆ _bt_split_firstright()

◆ _bt_split_lastleft()

◆ _bt_split_penalty()

◆ _bt_splitcmp()

◆ _bt_strategy()