PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
nbtsplitloc.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "common/int.h"
Include dependency graph for nbtsplitloc.c:

Go to the source code of this file.

Data Structures

struct  SplitPoint
 
struct  FindSplitData
 

Macros

#define LEAF_SPLIT_DISTANCE   0.050
 
#define INTERNAL_SPLIT_DISTANCE   0.075
 

Enumerations

enum  FindSplitStrat { SPLIT_DEFAULT , SPLIT_MANY_DUPLICATES , SPLIT_SINGLE_VALUE }
 

Functions

static void _bt_recsplitloc (FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
 
static void _bt_deltasortsplits (FindSplitData *state, double fillfactormult, bool usemult)
 
static int _bt_splitcmp (const void *arg1, const void *arg2)
 
static bool _bt_afternewitemoff (FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
 
static bool _bt_adjacenthtid (ItemPointer lowhtid, ItemPointer highhtid)
 
static OffsetNumber _bt_bestsplitloc (FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
 
static int _bt_defaultinterval (FindSplitData *state)
 
static int _bt_strategy (FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
 
static void _bt_interval_edges (FindSplitData *state, SplitPoint **leftinterval, SplitPoint **rightinterval)
 
static int _bt_split_penalty (FindSplitData *state, SplitPoint *split)
 
static IndexTuple _bt_split_lastleft (FindSplitData *state, SplitPoint *split)
 
static IndexTuple _bt_split_firstright (FindSplitData *state, SplitPoint *split)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 

Macro Definition Documentation

◆ INTERNAL_SPLIT_DISTANCE

#define INTERNAL_SPLIT_DISTANCE   0.075

Definition at line 850 of file nbtsplitloc.c.

◆ LEAF_SPLIT_DISTANCE

#define LEAF_SPLIT_DISTANCE   0.050

Definition at line 849 of file nbtsplitloc.c.

Enumeration Type Documentation

◆ FindSplitStrat

Enumerator
SPLIT_DEFAULT 
SPLIT_MANY_DUPLICATES 
SPLIT_SINGLE_VALUE 

Definition at line 20 of file nbtsplitloc.c.

21{
22 /* strategy for searching through materialized list of split points */
23 SPLIT_DEFAULT, /* give some weight to truncation */
24 SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */
 25 SPLIT_SINGLE_VALUE, /* leave left page almost full */
 26} FindSplitStrat;
FindSplitStrat
Definition: nbtsplitloc.c:21
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:23
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:24
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:25

Function Documentation

◆ _bt_adjacenthtid()

static bool _bt_adjacenthtid ( ItemPointer  lowhtid,
ItemPointer  highhtid 
)
static

Definition at line 749 of file nbtsplitloc.c.

750{
751 BlockNumber lowblk,
752 highblk;
753
754 lowblk = ItemPointerGetBlockNumber(lowhtid);
755 highblk = ItemPointerGetBlockNumber(highhtid);
756
757 /* Make optimistic assumption of adjacency when heap blocks match */
758 if (lowblk == highblk)
759 return true;
760
761 /* When heap block one up, second offset should be FirstOffsetNumber */
 761 if (lowblk + 1 == highblk &&
 762 ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
 763 return true;
 764
765
766 return false;
767}
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
#define FirstOffsetNumber
Definition: off.h:27

References FirstOffsetNumber, ItemPointerGetBlockNumber(), and ItemPointerGetOffsetNumber().

Referenced by _bt_afternewitemoff().

◆ _bt_afternewitemoff()

static bool _bt_afternewitemoff ( FindSplitData state,
OffsetNumber  maxoff,
int  leaffillfactor,
bool *  usemult 
)
static

Definition at line 630 of file nbtsplitloc.c.

632{
633 int16 nkeyatts;
634 ItemId itemid;
635 IndexTuple tup;
636 int keepnatts;
637
638 Assert(state->is_leaf && !state->is_rightmost);
639
 640 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
 641
642 /* Single key indexes not considered here */
643 if (nkeyatts == 1)
644 return false;
645
646 /* Ascending insertion pattern never inferred when new item is first */
647 if (state->newitemoff == P_FIRSTKEY)
648 return false;
649
650 /*
651 * Only apply optimization on pages with equisized tuples, since ordinal
652 * keys are likely to be fixed-width. Testing if the new tuple is
653 * variable width directly might also work, but that fails to apply the
654 * optimization to indexes with a numeric_ops attribute.
655 *
656 * Conclude that page has equisized tuples when the new item is the same
657 * width as the smallest item observed during pass over page, and other
658 * non-pivot tuples must be the same width as well. (Note that the
659 * possibly-truncated existing high key isn't counted in
660 * olddataitemstotal, and must be subtracted from maxoff.)
661 */
662 if (state->newitemsz != state->minfirstrightsz)
663 return false;
664 if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
665 return false;
666
667 /*
668 * Avoid applying optimization when tuples are wider than a tuple
669 * consisting of two non-NULL int8/int64 attributes (or four non-NULL
670 * int4/int32 attributes)
671 */
672 if (state->newitemsz >
673 MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
674 sizeof(ItemIdData))
675 return false;
676
677 /*
678 * At least the first attribute's value must be equal to the corresponding
679 * value in previous tuple to apply optimization. New item cannot be a
680 * duplicate, either.
681 *
682 * Handle case where new item is to the right of all items on the existing
683 * page. This is suggestive of monotonically increasing insertions in
684 * itself, so the "heap TID adjacency" test is not applied here.
685 */
686 if (state->newitemoff > maxoff)
687 {
688 itemid = PageGetItemId(state->origpage, maxoff);
689 tup = (IndexTuple) PageGetItem(state->origpage, itemid);
690 keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
691
692 if (keepnatts > 1 && keepnatts <= nkeyatts)
693 {
694 *usemult = true;
695 return true;
696 }
697
698 return false;
699 }
700
701 /*
702 * "Low cardinality leading column, high cardinality suffix column"
703 * indexes with a random insertion pattern (e.g., an index with a boolean
704 * column, such as an index on '(book_is_in_print, book_isbn)') present us
705 * with a risk of consistently misapplying the optimization. We're
706 * willing to accept very occasional misapplication of the optimization,
707 * provided the cases where we get it wrong are rare and self-limiting.
708 *
709 * Heap TID adjacency strongly suggests that the item just to the left was
710 * inserted very recently, which limits overapplication of the
711 * optimization. Besides, all inappropriate cases triggered here will
712 * still split in the middle of the page on average.
713 */
714 itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff));
715 tup = (IndexTuple) PageGetItem(state->origpage, itemid);
716 /* Do cheaper test first */
717 if (BTreeTupleIsPosting(tup) ||
718 !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
719 return false;
720 /* Check same conditions as rightmost item case, too */
721 keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
722
723 if (keepnatts > 1 && keepnatts <= nkeyatts)
724 {
725 double interp = (double) state->newitemoff / ((double) maxoff + 1);
726 double leaffillfactormult = (double) leaffillfactor / 100.0;
727
728 /*
729 * Don't allow caller to split after a new item when it will result in
730 * a split point to the right of the point that a leaf fillfactor
731 * split would use -- have caller apply leaf fillfactor instead
732 */
733 *usemult = interp > leaffillfactormult;
734
735 return true;
736 }
737
738 return false;
739}
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
#define MAXALIGN(LEN)
Definition: c.h:768
#define Assert(condition)
Definition: c.h:815
int64_t int64
Definition: c.h:485
int16_t int16
Definition: c.h:483
IndexTupleData * IndexTuple
Definition: itup.h:53
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
#define P_FIRSTKEY
Definition: nbtree.h:368
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
Definition: nbtsplitloc.c:749
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:3032
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
ItemPointerData t_tid
Definition: itup.h:37
Definition: regguts.h:323

References _bt_adjacenthtid(), _bt_keep_natts_fast(), Assert, BTreeTupleIsPosting(), IndexRelationGetNumberOfKeyAttributes, MAXALIGN, OffsetNumberPrev, P_FIRSTKEY, PageGetItem(), PageGetItemId(), and IndexTupleData::t_tid.

Referenced by _bt_findsplitloc().

◆ _bt_bestsplitloc()

static OffsetNumber _bt_bestsplitloc ( FindSplitData state,
int  perfectpenalty,
bool *  newitemonleft,
FindSplitStrat  strategy 
)
static

Definition at line 788 of file nbtsplitloc.c.

790{
791 int bestpenalty,
792 lowsplit;
793 int highsplit = Min(state->interval, state->nsplits);
794 SplitPoint *final;
795
796 bestpenalty = INT_MAX;
797 lowsplit = 0;
798 for (int i = lowsplit; i < highsplit; i++)
799 {
800 int penalty;
801
802 penalty = _bt_split_penalty(state, state->splits + i);
803
804 if (penalty < bestpenalty)
805 {
806 bestpenalty = penalty;
807 lowsplit = i;
808 }
809
810 if (penalty <= perfectpenalty)
811 break;
812 }
813
814 final = &state->splits[lowsplit];
815
816 /*
817 * There is a risk that the "many duplicates" strategy will repeatedly do
818 * the wrong thing when there are monotonically decreasing insertions to
819 * the right of a large group of duplicates. Repeated splits could leave
820 * a succession of right half pages with free space that can never be
821 * used. This must be avoided.
822 *
823 * Consider the example of the leftmost page in a single integer attribute
824 * NULLS FIRST index which is almost filled with NULLs. Monotonically
825 * decreasing integer insertions might cause the same leftmost page to
826 * split repeatedly at the same point. Each split derives its new high
827 * key from the lowest current value to the immediate right of the large
828 * group of NULLs, which will always be higher than all future integer
829 * insertions, directing all future integer insertions to the same
830 * leftmost page.
831 */
832 if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
833 !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
834 final->firstrightoff < state->newitemoff + 9)
835 {
836 /*
837 * Avoid the problem by performing a 50:50 split when the new item is
838 * just to the right of the would-be "many duplicates" split point.
839 * (Note that the test used for an insert that is "just to the right"
840 * of the split point is conservative.)
841 */
842 final = &state->splits[0];
843 }
844
845 *newitemonleft = final->newitemonleft;
846 return final->firstrightoff;
847}
#define Min(x, y)
Definition: c.h:961
#define final(a, b, c)
Definition: hashfn.c:116
int i
Definition: isn.c:72
static int _bt_split_penalty(FindSplitData *state, SplitPoint *split)
Definition: nbtsplitloc.c:1131

References _bt_split_penalty(), final, i, Min, and SPLIT_MANY_DUPLICATES.

Referenced by _bt_findsplitloc().

◆ _bt_defaultinterval()

static int _bt_defaultinterval ( FindSplitData state)
static

Definition at line 876 of file nbtsplitloc.c.

877{
878 SplitPoint *spaceoptimal;
879 int16 tolerance,
880 lowleftfree,
881 lowrightfree,
882 highleftfree,
883 highrightfree;
884
885 /*
886 * Determine leftfree and rightfree values that are higher and lower than
887 * we're willing to tolerate. Note that the final split interval will be
888 * about 10% of nsplits in the common case where all non-pivot tuples
889 * (data items) from a leaf page are uniformly sized. We're a bit more
890 * aggressive when splitting internal pages.
891 */
892 if (state->is_leaf)
893 tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
894 else
895 tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;
896
897 /* First candidate split point is the most evenly balanced */
898 spaceoptimal = state->splits;
899 lowleftfree = spaceoptimal->leftfree - tolerance;
900 lowrightfree = spaceoptimal->rightfree - tolerance;
901 highleftfree = spaceoptimal->leftfree + tolerance;
902 highrightfree = spaceoptimal->rightfree + tolerance;
903
904 /*
905 * Iterate through split points, starting from the split immediately after
906 * 'spaceoptimal'. Find the first split point that divides free space so
907 * unevenly that including it in the split interval would be unacceptable.
908 */
909 for (int i = 1; i < state->nsplits; i++)
910 {
911 SplitPoint *split = state->splits + i;
912
913 /* Cannot use curdelta here, since its value is often weighted */
914 if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
915 split->leftfree > highleftfree || split->rightfree > highrightfree)
916 return i;
917 }
918
919 return state->nsplits;
920}
#define LEAF_SPLIT_DISTANCE
Definition: nbtsplitloc.c:849
#define INTERNAL_SPLIT_DISTANCE
Definition: nbtsplitloc.c:850
int16 leftfree
Definition: nbtsplitloc.c:32
int16 rightfree
Definition: nbtsplitloc.c:33

References i, INTERNAL_SPLIT_DISTANCE, LEAF_SPLIT_DISTANCE, SplitPoint::leftfree, and SplitPoint::rightfree.

Referenced by _bt_findsplitloc().

◆ _bt_deltasortsplits()

static void _bt_deltasortsplits ( FindSplitData state,
double  fillfactormult,
bool  usemult 
)
static

Definition at line 566 of file nbtsplitloc.c.

568{
569 for (int i = 0; i < state->nsplits; i++)
570 {
571 SplitPoint *split = state->splits + i;
572 int16 delta;
573
574 if (usemult)
575 delta = fillfactormult * split->leftfree -
576 (1.0 - fillfactormult) * split->rightfree;
577 else
578 delta = split->leftfree - split->rightfree;
579
580 if (delta < 0)
581 delta = -delta;
582
583 /* Save delta */
584 split->curdelta = delta;
585 }
586
587 qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
588}
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
static int _bt_splitcmp(const void *arg1, const void *arg2)
Definition: nbtsplitloc.c:594
#define qsort(a, b, c, d)
Definition: port.h:474
int16 curdelta
Definition: nbtsplitloc.c:31

References _bt_splitcmp(), SplitPoint::curdelta, i, if(), SplitPoint::leftfree, qsort, and SplitPoint::rightfree.

Referenced by _bt_findsplitloc().

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool *  newitemonleft 
)

Definition at line 129 of file nbtsplitloc.c.

135{
136 BTPageOpaque opaque;
137 int leftspace,
138 rightspace,
139 olddataitemstotal,
140 olddataitemstoleft,
141 perfectpenalty,
142 leaffillfactor;
 143 FindSplitData state;
 144 FindSplitStrat strategy;
145 ItemId itemid;
146 OffsetNumber offnum,
147 maxoff,
148 firstrightoff;
149 double fillfactormult;
150 bool usemult;
151 SplitPoint leftpage,
152 rightpage;
153
154 opaque = BTPageGetOpaque(origpage);
155 maxoff = PageGetMaxOffsetNumber(origpage);
156
157 /* Total free space available on a btree page, after fixed overhead */
158 leftspace = rightspace =
 159 PageGetPageSize(origpage) - SizeOfPageHeaderData -
 160 MAXALIGN(sizeof(BTPageOpaqueData));
161
162 /* The right page will have the same high key as the old page */
163 if (!P_RIGHTMOST(opaque))
164 {
165 itemid = PageGetItemId(origpage, P_HIKEY);
166 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
167 sizeof(ItemIdData));
168 }
169
170 /* Count up total space in data items before actually scanning 'em */
171 olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
172 leaffillfactor = BTGetFillFactor(rel);
173
174 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
175 newitemsz += sizeof(ItemIdData);
176 state.rel = rel;
177 state.origpage = origpage;
178 state.newitem = newitem;
179 state.newitemsz = newitemsz;
180 state.is_leaf = P_ISLEAF(opaque);
181 state.is_rightmost = P_RIGHTMOST(opaque);
182 state.leftspace = leftspace;
183 state.rightspace = rightspace;
184 state.olddataitemstotal = olddataitemstotal;
185 state.minfirstrightsz = SIZE_MAX;
186 state.newitemoff = newitemoff;
187
188 /* newitem cannot be a posting list item */
189 Assert(!BTreeTupleIsPosting(newitem));
190
191 /*
192 * nsplits should never exceed maxoff because there will be at most as
193 * many candidate split points as there are points _between_ tuples, once
194 * you imagine that the new item is already on the original page (the
195 * final number of splits may be slightly lower because not all points
196 * between tuples will be legal).
197 */
198 state.maxsplits = maxoff;
199 state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
200 state.nsplits = 0;
201
202 /*
203 * Scan through the data items and calculate space usage for a split at
204 * each possible position
205 */
206 olddataitemstoleft = 0;
207
208 for (offnum = P_FIRSTDATAKEY(opaque);
209 offnum <= maxoff;
210 offnum = OffsetNumberNext(offnum))
211 {
212 Size itemsz;
213
214 itemid = PageGetItemId(origpage, offnum);
215 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
216
217 /*
218 * When item offset number is not newitemoff, neither side of the
219 * split can be newitem. Record a split after the previous data item
220 * from original page, but before the current data item from original
221 * page. (_bt_recsplitloc() will reject the split when there are no
222 * previous items, which we rely on.)
223 */
224 if (offnum < newitemoff)
225 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
226 else if (offnum > newitemoff)
227 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
228 else
229 {
230 /*
231 * Record a split after all "offnum < newitemoff" original page
232 * data items, but before newitem
233 */
234 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
235
236 /*
237 * Record a split after newitem, but before data item from
238 * original page at offset newitemoff/current offset
239 */
240 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
241 }
242
243 olddataitemstoleft += itemsz;
244 }
245
246 /*
247 * Record a split after all original page data items, but before newitem.
248 * (Though only when it's possible that newitem will end up alone on new
249 * right page.)
250 */
251 Assert(olddataitemstoleft == olddataitemstotal);
252 if (newitemoff > maxoff)
253 _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
254
255 /*
256 * I believe it is not possible to fail to find a feasible split, but just
257 * in case ...
258 */
259 if (state.nsplits == 0)
260 elog(ERROR, "could not find a feasible split point for index \"%s\"",
 261 RelationGetRelationName(rel));
 262
263 /*
264 * Start search for a split point among list of legal split points. Give
265 * primary consideration to equalizing available free space in each half
266 * of the split initially (start with default strategy), while applying
267 * rightmost and split-after-new-item optimizations where appropriate.
268 * Either of the two other fallback strategies may be required for cases
269 * with a large number of duplicates around the original/space-optimal
270 * split point.
271 *
272 * Default strategy gives some weight to suffix truncation in deciding a
273 * split point on leaf pages. It attempts to select a split point where a
274 * distinguishing attribute appears earlier in the new high key for the
275 * left side of the split, in order to maximize the number of trailing
276 * attributes that can be truncated away. Only candidate split points
277 * that imply an acceptable balance of free space on each side are
278 * considered. See _bt_defaultinterval().
279 */
280 if (!state.is_leaf)
281 {
282 /* fillfactormult only used on rightmost page */
283 usemult = state.is_rightmost;
284 fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
285 }
286 else if (state.is_rightmost)
287 {
288 /* Rightmost leaf page -- fillfactormult always used */
289 usemult = true;
290 fillfactormult = leaffillfactor / 100.0;
291 }
292 else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
293 {
294 /*
295 * New item inserted at rightmost point among a localized grouping on
296 * a leaf page -- apply "split after new item" optimization, either by
297 * applying leaf fillfactor multiplier, or by choosing the exact split
298 * point that leaves newitem as lastleft. (usemult is set for us.)
299 */
300 if (usemult)
301 {
302 /* fillfactormult should be set based on leaf fillfactor */
303 fillfactormult = leaffillfactor / 100.0;
304 }
305 else
306 {
307 /* find precise split point after newitemoff */
308 for (int i = 0; i < state.nsplits; i++)
309 {
310 SplitPoint *split = state.splits + i;
311
312 if (split->newitemonleft &&
313 newitemoff == split->firstrightoff)
314 {
315 pfree(state.splits);
316 *newitemonleft = true;
317 return newitemoff;
318 }
319 }
320
321 /*
322 * Cannot legally split after newitemoff; proceed with split
323 * without using fillfactor multiplier. This is defensive, and
324 * should never be needed in practice.
325 */
326 fillfactormult = 0.50;
327 }
328 }
329 else
330 {
331 /* Other leaf page. 50:50 page split. */
332 usemult = false;
333 /* fillfactormult not used, but be tidy */
334 fillfactormult = 0.50;
335 }
336
337 /*
338 * Save leftmost and rightmost splits for page before original ordinal
339 * sort order is lost by delta/fillfactormult sort
340 */
341 leftpage = state.splits[0];
342 rightpage = state.splits[state.nsplits - 1];
343
344 /* Give split points a fillfactormult-wise delta, and sort on deltas */
345 _bt_deltasortsplits(&state, fillfactormult, usemult);
346
347 /* Determine split interval for default strategy */
348 state.interval = _bt_defaultinterval(&state);
349
350 /*
351 * Determine if default strategy/split interval will produce a
352 * sufficiently distinguishing split, or if we should change strategies.
353 * Alternative strategies change the range of split points that are
354 * considered acceptable (split interval), and possibly change
355 * fillfactormult, in order to deal with pages with a large number of
356 * duplicates gracefully.
357 *
358 * Pass low and high splits for the entire page (actually, they're for an
359 * imaginary version of the page that includes newitem). These are used
360 * when the initial split interval encloses split points that are full of
361 * duplicates, and we need to consider if it's even possible to avoid
362 * appending a heap TID.
363 */
364 perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
365
366 if (strategy == SPLIT_DEFAULT)
367 {
368 /*
369 * Default strategy worked out (always works out with internal page).
370 * Original split interval still stands.
371 */
372 }
373
374 /*
375 * Many duplicates strategy is used when a heap TID would otherwise be
376 * appended, but the page isn't completely full of logical duplicates.
377 *
378 * The split interval is widened to include all legal candidate split
 379 * points. There might be as few as two distinct values in the whole-page
380 * split interval, though it's also possible that most of the values on
381 * the page are unique. The final split point will either be to the
382 * immediate left or to the immediate right of the group of duplicate
383 * tuples that enclose the first/delta-optimal split point (perfect
384 * penalty was set so that the lowest delta split point that avoids
385 * appending a heap TID will be chosen). Maximizing the number of
386 * attributes that can be truncated away is not a goal of the many
387 * duplicates strategy.
388 *
389 * Single value strategy is used when it is impossible to avoid appending
390 * a heap TID. It arranges to leave the left page very full. This
391 * maximizes space utilization in cases where tuples with the same
392 * attribute values span many pages. Newly inserted duplicates will tend
393 * to have higher heap TID values, so we'll end up splitting to the right
394 * consistently. (Single value strategy is harmless though not
395 * particularly useful with !heapkeyspace indexes.)
396 */
397 else if (strategy == SPLIT_MANY_DUPLICATES)
398 {
399 Assert(state.is_leaf);
400 /* Shouldn't try to truncate away extra user attributes */
401 Assert(perfectpenalty ==
 402 IndexRelationGetNumberOfKeyAttributes(state->rel));
 403 /* No need to resort splits -- no change in fillfactormult/deltas */
404 state.interval = state.nsplits;
405 }
406 else if (strategy == SPLIT_SINGLE_VALUE)
407 {
408 Assert(state.is_leaf);
409 /* Split near the end of the page */
410 usemult = true;
411 fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
412 /* Resort split points with new delta */
413 _bt_deltasortsplits(&state, fillfactormult, usemult);
414 /* Appending a heap TID is unavoidable, so interval of 1 is fine */
415 state.interval = 1;
416 }
417
418 /*
419 * Search among acceptable split points (using final split interval) for
420 * the entry that has the lowest penalty, and is therefore expected to
421 * maximize fan-out. Sets *newitemonleft for us.
422 */
423 firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
424 strategy);
425 pfree(state.splits);
426
427 return firstrightoff;
428}
Size PageGetExactFreeSpace(const PageData *page)
Definition: bufpage.c:947
static Size PageGetPageSize(const PageData *page)
Definition: bufpage.h:277
#define SizeOfPageHeaderData
Definition: bufpage.h:217
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
size_t Size
Definition: c.h:562
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct ItemIdData ItemIdData
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:202
#define P_HIKEY
Definition: nbtree.h:367
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTGetFillFactor(relation)
Definition: nbtree.h:1133
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:566
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:934
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:630
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:449
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:788
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:876
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define RelationGetRelationName(relation)
Definition: rel.h:539
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().

◆ _bt_interval_edges()

static void _bt_interval_edges ( FindSplitData state,
SplitPoint **  leftinterval,
SplitPoint **  rightinterval 
)
static

Definition at line 1052 of file nbtsplitloc.c.

1054{
1055 int highsplit = Min(state->interval, state->nsplits);
1056 SplitPoint *deltaoptimal;
1057
1058 deltaoptimal = state->splits;
1059 *leftinterval = NULL;
1060 *rightinterval = NULL;
1061
1062 /*
1063 * Delta is an absolute distance to optimal split point, so both the
1064 * leftmost and rightmost split point will usually be at the end of the
1065 * array
1066 */
1067 for (int i = highsplit - 1; i >= 0; i--)
1068 {
1069 SplitPoint *distant = state->splits + i;
1070
1071 if (distant->firstrightoff < deltaoptimal->firstrightoff)
1072 {
1073 if (*leftinterval == NULL)
1074 *leftinterval = distant;
1075 }
1076 else if (distant->firstrightoff > deltaoptimal->firstrightoff)
1077 {
1078 if (*rightinterval == NULL)
1079 *rightinterval = distant;
1080 }
1081 else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
1082 {
1083 /*
1084 * "incoming tuple will become firstright" (distant) is to the
1085 * left of "incoming tuple will become lastleft" (delta-optimal)
1086 */
1087 Assert(distant->firstrightoff == state->newitemoff);
1088 if (*leftinterval == NULL)
1089 *leftinterval = distant;
1090 }
1091 else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
1092 {
1093 /*
1094 * "incoming tuple will become lastleft" (distant) is to the right
1095 * of "incoming tuple will become firstright" (delta-optimal)
1096 */
1097 Assert(distant->firstrightoff == state->newitemoff);
1098 if (*rightinterval == NULL)
1099 *rightinterval = distant;
1100 }
1101 else
1102 {
1103 /* There was only one or two splits in initial split interval */
1104 Assert(distant == deltaoptimal);
1105 if (*leftinterval == NULL)
1106 *leftinterval = distant;
1107 if (*rightinterval == NULL)
1108 *rightinterval = distant;
1109 }
1110
1111 if (*leftinterval && *rightinterval)
1112 return;
1113 }
1114
1115 Assert(false);
1116}

References Assert, SplitPoint::firstrightoff, i, Min, and SplitPoint::newitemonleft.

Referenced by _bt_strategy().

◆ _bt_recsplitloc()

static void _bt_recsplitloc ( FindSplitData state,
OffsetNumber  firstrightoff,
bool  newitemonleft,
int  olddataitemstoleft,
Size  firstrightofforigpagetuplesz 
)
static

Definition at line 449 of file nbtsplitloc.c.

454{
455 int16 leftfree,
456 rightfree;
457 Size firstrightsz;
458 Size postingsz = 0;
459 bool newitemisfirstright;
460
461 /* Is the new item going to be split point's firstright tuple? */
462 newitemisfirstright = (firstrightoff == state->newitemoff &&
463 !newitemonleft);
464
465 if (newitemisfirstright)
466 firstrightsz = state->newitemsz;
467 else
468 {
469 firstrightsz = firstrightofforigpagetuplesz;
470
471 /*
472 * Calculate suffix truncation space saving when firstright tuple is a
473 * posting list tuple, though only when the tuple is over 64 bytes
474 * including line pointer overhead (arbitrary). This avoids accessing
475 * the tuple in cases where its posting list must be very small (if
476 * tuple has one at all).
477 *
478 * Note: We don't do this in the case where firstright tuple is
479 * newitem, since newitem cannot have a posting list.
480 */
481 if (state->is_leaf && firstrightsz > 64)
482 {
483 ItemId itemid;
484 IndexTuple newhighkey;
485
486 itemid = PageGetItemId(state->origpage, firstrightoff);
487 newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);
488
489 if (BTreeTupleIsPosting(newhighkey))
490 postingsz = IndexTupleSize(newhighkey) -
491 BTreeTupleGetPostingOffset(newhighkey);
492 }
493 }
494
495 /* Account for all the old tuples */
496 leftfree = state->leftspace - olddataitemstoleft;
497 rightfree = state->rightspace -
498 (state->olddataitemstotal - olddataitemstoleft);
499
500 /*
501 * The first item on the right page becomes the high key of the left page;
502 * therefore it counts against left space as well as right space (we
503 * cannot assume that suffix truncation will make it any smaller). When
504 * index has included attributes, then those attributes of left page high
505 * key will be truncated leaving that page with slightly more free space.
506 * However, that shouldn't affect our ability to find valid split
507 * location, since we err in the direction of being pessimistic about free
508 * space on the left half. Besides, even when suffix truncation of
509 * non-TID attributes occurs, the new high key often won't even be a
510 * single MAXALIGN() quantum smaller than the firstright tuple it's based
511 * on.
512 *
513 * If we are on the leaf level, assume that suffix truncation cannot avoid
514 * adding a heap TID to the left half's new high key when splitting at the
515 * leaf level. In practice the new high key will often be smaller and
516 * will rarely be larger, but conservatively assume the worst case. We do
517 * go to the trouble of subtracting away posting list overhead, though
518 * only when it looks like it will make an appreciable difference.
519 * (Posting lists are the only case where truncation will typically make
520 * the final high key far smaller than firstright, so being a bit more
521 * precise there noticeably improves the balance of free space.)
522 */
523 if (state->is_leaf)
524 leftfree -= (int16) (firstrightsz +
525 MAXALIGN(sizeof(ItemPointerData)) -
526 postingsz);
527 else
528 leftfree -= (int16) firstrightsz;
529
530 /* account for the new item */
531 if (newitemonleft)
532 leftfree -= (int16) state->newitemsz;
533 else
534 rightfree -= (int16) state->newitemsz;
535
536 /*
537 * If we are not on the leaf level, we will be able to discard the key
538 * data from the first item that winds up on the right page.
539 */
540 if (!state->is_leaf)
541 rightfree += (int16) firstrightsz -
542 (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
543
544 /* Record split if legal */
545 if (leftfree >= 0 && rightfree >= 0)
546 {
547 Assert(state->nsplits < state->maxsplits);
548
549 /* Determine smallest firstright tuple size among legal splits */
550 state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);
551
552 state->splits[state->nsplits].curdelta = 0;
553 state->splits[state->nsplits].leftfree = leftfree;
554 state->splits[state->nsplits].rightfree = rightfree;
555 state->splits[state->nsplits].firstrightoff = firstrightoff;
556 state->splits[state->nsplits].newitemonleft = newitemonleft;
557 state->nsplits++;
558 }
559}
#define IndexTupleSize(itup)
Definition: itup.h:70
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529

References Assert, BTreeTupleGetPostingOffset(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, Min, PageGetItem(), and PageGetItemId().

Referenced by _bt_findsplitloc().

◆ _bt_split_firstright()

static IndexTuple _bt_split_firstright ( FindSplitData state,
SplitPoint split 
)
inlinestatic

Definition at line 1175 of file nbtsplitloc.c.

1176{
1177 ItemId itemid;
1178
1179 if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
1180 return state->newitem;
1181
1182 itemid = PageGetItemId(state->origpage, split->firstrightoff);
1183 return (IndexTuple) PageGetItem(state->origpage, itemid);
1184}

References SplitPoint::firstrightoff, SplitPoint::newitemonleft, PageGetItem(), and PageGetItemId().

Referenced by _bt_split_penalty(), and _bt_strategy().

◆ _bt_split_lastleft()

static IndexTuple _bt_split_lastleft ( FindSplitData state,
SplitPoint split 
)
inlinestatic

Definition at line 1159 of file nbtsplitloc.c.

1160{
1161 ItemId itemid;
1162
1163 if (split->newitemonleft && split->firstrightoff == state->newitemoff)
1164 return state->newitem;
1165
1166 itemid = PageGetItemId(state->origpage,
1168 return (IndexTuple) PageGetItem(state->origpage, itemid);
1169}

References SplitPoint::firstrightoff, SplitPoint::newitemonleft, OffsetNumberPrev, PageGetItem(), and PageGetItemId().

Referenced by _bt_split_penalty(), and _bt_strategy().

◆ _bt_split_penalty()

static int _bt_split_penalty ( FindSplitData state,
SplitPoint split 
)
inlinestatic

Definition at line 1131 of file nbtsplitloc.c.

1132{
1133 IndexTuple lastleft;
1134 IndexTuple firstright;
1135
1136 if (!state->is_leaf)
1137 {
1138 ItemId itemid;
1139
1140 if (!split->newitemonleft &&
1141 split->firstrightoff == state->newitemoff)
1142 return state->newitemsz;
1143
1144 itemid = PageGetItemId(state->origpage, split->firstrightoff);
1145
1146 return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1147 }
1148
1149 lastleft = _bt_split_lastleft(state, split);
1150 firstright = _bt_split_firstright(state, split);
1151
1152 return _bt_keep_natts_fast(state->rel, lastleft, firstright);
1153}
static IndexTuple _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
Definition: nbtsplitloc.c:1159
static IndexTuple _bt_split_firstright(FindSplitData *state, SplitPoint *split)
Definition: nbtsplitloc.c:1175

References _bt_keep_natts_fast(), _bt_split_firstright(), _bt_split_lastleft(), SplitPoint::firstrightoff, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, and PageGetItemId().

Referenced by _bt_bestsplitloc().

◆ _bt_splitcmp()

static int _bt_splitcmp ( const void *  arg1,
const void *  arg2 
)
static

Definition at line 594 of file nbtsplitloc.c.

595{
596 SplitPoint *split1 = (SplitPoint *) arg1;
597 SplitPoint *split2 = (SplitPoint *) arg2;
598
599 return pg_cmp_s16(split1->curdelta, split2->curdelta);
600}
static int pg_cmp_s16(int16 a, int16 b)
Definition: int.h:634

References SplitPoint::curdelta, and pg_cmp_s16().

Referenced by _bt_deltasortsplits().

◆ _bt_strategy()

static int _bt_strategy ( FindSplitData state,
SplitPoint leftpage,
SplitPoint rightpage,
FindSplitStrat strategy 
)
static

Definition at line 934 of file nbtsplitloc.c.

936{
937 IndexTuple leftmost,
938 rightmost;
939 SplitPoint *leftinterval,
940 *rightinterval;
941 int perfectpenalty;
942 int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
943
944 /* Assume that alternative strategy won't be used for now */
945 *strategy = SPLIT_DEFAULT;
946
947 /*
948 * Use smallest observed firstright item size for entire page (actually,
949 * entire imaginary version of page that includes newitem) as perfect
950 * penalty on internal pages. This can save cycles in the common case
951 * where most or all splits (not just splits within interval) have
952 * firstright tuples that are the same size.
953 */
954 if (!state->is_leaf)
955 return state->minfirstrightsz;
956
957 /*
958 * Use leftmost and rightmost tuples from leftmost and rightmost splits in
959 * current split interval
960 */
961 _bt_interval_edges(state, &leftinterval, &rightinterval);
962 leftmost = _bt_split_lastleft(state, leftinterval);
963 rightmost = _bt_split_firstright(state, rightinterval);
964
965 /*
966 * If initial split interval can produce a split point that will at least
967 * avoid appending a heap TID in new high key, we're done. Finish split
968 * with default strategy and initial split interval.
969 */
970 perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
971 if (perfectpenalty <= indnkeyatts)
972 return perfectpenalty;
973
974 /*
975 * Work out how caller should finish split when even their "perfect"
976 * penalty for initial/default split interval indicates that the interval
977 * does not contain even a single split that avoids appending a heap TID.
978 *
979 * Use the leftmost split's lastleft tuple and the rightmost split's
980 * firstright tuple to assess every possible split.
981 */
982 leftmost = _bt_split_lastleft(state, leftpage);
983 rightmost = _bt_split_firstright(state, rightpage);
984
985 /*
986 * If page (including new item) has many duplicates but is not entirely
987 * full of duplicates, a many duplicates strategy split will be performed.
988 * If page is entirely full of duplicates, a single value strategy split
989 * will be performed.
990 */
991 perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
992 if (perfectpenalty <= indnkeyatts)
993 {
994 *strategy = SPLIT_MANY_DUPLICATES;
995
996 /*
997 * Many duplicates strategy should split at either side the group of
998 * duplicates that enclose the delta-optimal split point. Return
999 * indnkeyatts rather than the true perfect penalty to make that
1000 * happen. (If perfectpenalty was returned here then low cardinality
1001 * composite indexes could have continual unbalanced splits.)
1002 *
1003 * Note that caller won't go through with a many duplicates split in
1004 * rare cases where it looks like there are ever-decreasing insertions
1005 * to the immediate right of the split point. This must happen just
1006 * before a final decision is made, within _bt_bestsplitloc().
1007 */
1008 return indnkeyatts;
1009 }
1010
1011 /*
1012 * Single value strategy is only appropriate with ever-increasing heap
1013 * TIDs; otherwise, original default strategy split should proceed to
1014 * avoid pathological performance. Use page high key to infer if this is
1015 * the rightmost page among pages that store the same duplicate value.
1016 * This should not prevent insertions of heap TIDs that are slightly out
1017 * of order from using single value strategy, since that's expected with
1018 * concurrent inserters of the same duplicate value.
1019 */
1020 else if (state->is_rightmost)
1021 *strategy = SPLIT_SINGLE_VALUE;
1022 else
1023 {
1024 ItemId itemid;
1025 IndexTuple hikey;
1026
1027 itemid = PageGetItemId(state->origpage, P_HIKEY);
1028 hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
1029 perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
1030 state->newitem);
1031 if (perfectpenalty <= indnkeyatts)
1032 *strategy = SPLIT_SINGLE_VALUE;
1033 else
1034 {
1035 /*
1036 * Have caller finish split using default strategy, since page
1037 * does not appear to be the rightmost page for duplicates of the
1038 * value the page is filled with
1039 */
1040 }
1041 }
1042
1043 return perfectpenalty;
1044}
static void _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, SplitPoint **rightinterval)
Definition: nbtsplitloc.c:1052

References _bt_interval_edges(), _bt_keep_natts_fast(), _bt_split_firstright(), _bt_split_lastleft(), IndexRelationGetNumberOfKeyAttributes, P_HIKEY, PageGetItem(), PageGetItemId(), SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_findsplitloc().