PostgreSQL Source Code  git master
nbtsplitloc.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtsplitloc.c
4  * Choose split point code for Postgres btree implementation.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtsplitloc.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/nbtree.h"
18 #include "storage/lmgr.h"
19 
20 /* limits on split interval (default strategy only) */
21 #define MAX_LEAF_INTERVAL 9
22 #define MAX_INTERNAL_INTERVAL 18
23 
24 typedef enum
25 {
26  /* strategy for searching through materialized list of split points */
27  SPLIT_DEFAULT, /* give some weight to truncation */
28  SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */
29  SPLIT_SINGLE_VALUE /* leave left page almost full */
31 
32 typedef struct
33 {
34  /* details of free space left by split */
35  int16 curdelta; /* current leftfree/rightfree delta */
36  int16 leftfree; /* space left on left page post-split */
37  int16 rightfree; /* space left on right page post-split */
38 
39  /* split point identifying fields (returned by _bt_findsplitloc) */
40  OffsetNumber firstoldonright; /* first item on new right page */
41  bool newitemonleft; /* new item goes on left, or right? */
42 
43 } SplitPoint;
44 
45 typedef struct
46 {
47  /* context data for _bt_recsplitloc */
48  Relation rel; /* index relation */
49  Page page; /* page undergoing split */
50  IndexTuple newitem; /* new item (cause of page split) */
51  Size newitemsz; /* size of newitem (includes line pointer) */
52  bool is_leaf; /* T if splitting a leaf page */
53  bool is_rightmost; /* T if splitting rightmost page on level */
54  OffsetNumber newitemoff; /* where the new item is to be inserted */
55  int leftspace; /* space available for items on left page */
56  int rightspace; /* space available for items on right page */
57  int olddataitemstotal; /* space taken by old items */
58  Size minfirstrightsz; /* smallest firstoldonright tuple size */
59 
60  /* candidate split point data */
61  int maxsplits; /* maximum number of splits */
62  int nsplits; /* current number of splits */
63  SplitPoint *splits; /* all candidate split points for page */
64  int interval; /* current range of acceptable split points */
66 
68  OffsetNumber firstoldonright, bool newitemonleft,
69  int olddataitemstoleft, Size firstoldonrightsz);
70 static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
71  bool usemult);
72 static int _bt_splitcmp(const void *arg1, const void *arg2);
74  int leaffillfactor, bool *usemult);
75 static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
76 static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
77  bool *newitemonleft, FindSplitStrat strategy);
78 static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
79  SplitPoint *rightpage, FindSplitStrat *strategy);
81  SplitPoint **leftinterval, SplitPoint **rightinterval);
82 static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
84  SplitPoint *split);
86  SplitPoint *split);
87 
88 
89 /*
90  * _bt_findsplitloc() -- find an appropriate place to split a page.
91  *
92  * The main goal here is to equalize the free space that will be on each
93  * split page, *after accounting for the inserted tuple*. (If we fail to
94  * account for it, we might find ourselves with too little room on the page
95  * that it needs to go into!)
96  *
97  * If the page is the rightmost page on its level, we instead try to arrange
98  * to leave the left split page fillfactor% full. In this way, when we are
99  * inserting successively increasing keys (consider sequences, timestamps,
100  * etc) we will end up with a tree whose pages are about fillfactor% full,
101  * instead of the 50% full result that we'd get without this special case.
102  * This is the same as nbtsort.c produces for a newly-created tree. Note
103  * that leaf and nonleaf pages use different fillfactors. Note also that
104  * there are a number of further special cases where fillfactor is not
105  * applied in the standard way.
106  *
107  * We are passed the intended insert position of the new tuple, expressed as
108  * the offsetnumber of the tuple it must go in front of (this could be
109  * maxoff+1 if the tuple is to go at the end). The new tuple itself is also
110  * passed, since it's needed to give some weight to how effective suffix
111  * truncation will be. The implementation picks the split point that
112  * maximizes the effectiveness of suffix truncation from a small list of
113  * alternative candidate split points that leave each side of the split with
114  * about the same share of free space. Suffix truncation is secondary to
115  * equalizing free space, except in cases with large numbers of duplicates.
116  * Note that it is always assumed that caller goes on to perform truncation,
117  * even with pg_upgrade'd indexes where that isn't actually the case
118  * (!heapkeyspace indexes). See nbtree/README for more information about
119  * suffix truncation.
120  *
121  * We return the index of the first existing tuple that should go on the
122  * righthand page, plus a boolean indicating whether the new tuple goes on
123  * the left or right page. The bool is necessary to disambiguate the case
124  * where firstright == newitemoff.
125  */
128  Page page,
129  OffsetNumber newitemoff,
130  Size newitemsz,
131  IndexTuple newitem,
132  bool *newitemonleft)
133 {
134  BTPageOpaque opaque;
135  int leftspace,
136  rightspace,
137  olddataitemstotal,
138  olddataitemstoleft,
139  perfectpenalty,
140  leaffillfactor;
142  FindSplitStrat strategy;
143  ItemId itemid;
144  OffsetNumber offnum,
145  maxoff,
146  foundfirstright;
147  double fillfactormult;
148  bool usemult;
149  SplitPoint leftpage,
150  rightpage;
151 
152  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
153  maxoff = PageGetMaxOffsetNumber(page);
154 
155  /* Total free space available on a btree page, after fixed overhead */
156  leftspace = rightspace =
158  MAXALIGN(sizeof(BTPageOpaqueData));
159 
160  /* The right page will have the same high key as the old page */
161  if (!P_RIGHTMOST(opaque))
162  {
163  itemid = PageGetItemId(page, P_HIKEY);
164  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
165  sizeof(ItemIdData));
166  }
167 
168  /* Count up total space in data items before actually scanning 'em */
169  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
170  leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);
171 
172  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
173  newitemsz += sizeof(ItemIdData);
174  state.rel = rel;
175  state.page = page;
176  state.newitem = newitem;
177  state.newitemsz = newitemsz;
178  state.is_leaf = P_ISLEAF(opaque);
179  state.is_rightmost = P_RIGHTMOST(opaque);
180  state.leftspace = leftspace;
181  state.rightspace = rightspace;
182  state.olddataitemstotal = olddataitemstotal;
183  state.minfirstrightsz = SIZE_MAX;
184  state.newitemoff = newitemoff;
185 
186  /*
187  * maxsplits should never exceed maxoff because there will be at most as
188  * many candidate split points as there are points _between_ tuples, once
189  * you imagine that the new item is already on the original page (the
190  * final number of splits may be slightly lower because not all points
191  * between tuples will be legal).
192  */
193  state.maxsplits = maxoff;
194  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
195  state.nsplits = 0;
196 
197  /*
198  * Scan through the data items and calculate space usage for a split at
199  * each possible position
200  */
201  olddataitemstoleft = 0;
202 
203  for (offnum = P_FIRSTDATAKEY(opaque);
204  offnum <= maxoff;
205  offnum = OffsetNumberNext(offnum))
206  {
207  Size itemsz;
208 
209  itemid = PageGetItemId(page, offnum);
210  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
211 
212  /*
213  * When item offset number is not newitemoff, neither side of the
214  * split can be newitem. Record a split after the previous data item
215  * from original page, but before the current data item from original
216  * page. (_bt_recsplitloc() will reject the split when there are no
217  * previous items, which we rely on.)
218  */
219  if (offnum < newitemoff)
220  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
221  else if (offnum > newitemoff)
222  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
223  else
224  {
225  /*
226  * Record a split after all "offnum < newitemoff" original page
227  * data items, but before newitem
228  */
229  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
230 
231  /*
232  * Record a split after newitem, but before data item from
233  * original page at offset newitemoff/current offset
234  */
235  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
236  }
237 
238  olddataitemstoleft += itemsz;
239  }
240 
241  /*
242  * Record a split after all original page data items, but before newitem.
243  * (Though only when it's possible that newitem will end up alone on new
244  * right page.)
245  */
246  Assert(olddataitemstoleft == olddataitemstotal);
247  if (newitemoff > maxoff)
248  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
249 
250  /*
251  * I believe it is not possible to fail to find a feasible split, but just
252  * in case ...
253  */
254  if (state.nsplits == 0)
255  elog(ERROR, "could not find a feasible split point for index \"%s\"",
257 
258  /*
259  * Start search for a split point among list of legal split points. Give
260  * primary consideration to equalizing available free space in each half
261  * of the split initially (start with default strategy), while applying
262  * rightmost and split-after-new-item optimizations where appropriate.
263  * Either of the two other fallback strategies may be required for cases
264  * with a large number of duplicates around the original/space-optimal
265  * split point.
266  *
267  * Default strategy gives some weight to suffix truncation in deciding a
268  * split point on leaf pages. It attempts to select a split point where a
269  * distinguishing attribute appears earlier in the new high key for the
270  * left side of the split, in order to maximize the number of trailing
271  * attributes that can be truncated away. Only candidate split points
272  * that imply an acceptable balance of free space on each side are
273  * considered.
274  */
275  if (!state.is_leaf)
276  {
277  /* fillfactormult only used on rightmost page */
278  usemult = state.is_rightmost;
279  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
280  }
281  else if (state.is_rightmost)
282  {
283  /* Rightmost leaf page -- fillfactormult always used */
284  usemult = true;
285  fillfactormult = leaffillfactor / 100.0;
286  }
287  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
288  {
289  /*
290  * New item inserted at rightmost point among a localized grouping on
291  * a leaf page -- apply "split after new item" optimization, either by
292  * applying leaf fillfactor multiplier, or by choosing the exact split
293  * point that leaves the new item as last on the left. (usemult is set
294  * for us.)
295  */
296  if (usemult)
297  {
298  /* fillfactormult should be set based on leaf fillfactor */
299  fillfactormult = leaffillfactor / 100.0;
300  }
301  else
302  {
303  /* find precise split point after newitemoff */
304  for (int i = 0; i < state.nsplits; i++)
305  {
306  SplitPoint *split = state.splits + i;
307 
308  if (split->newitemonleft &&
309  newitemoff == split->firstoldonright)
310  {
311  pfree(state.splits);
312  *newitemonleft = true;
313  return newitemoff;
314  }
315  }
316 
317  /*
318  * Cannot legally split after newitemoff; proceed with split
319  * without using fillfactor multiplier. This is defensive, and
320  * should never be needed in practice.
321  */
322  fillfactormult = 0.50;
323  }
324  }
325  else
326  {
327  /* Other leaf page. 50:50 page split. */
328  usemult = false;
329  /* fillfactormult not used, but be tidy */
330  fillfactormult = 0.50;
331  }
332 
333  /*
334  * Set an initial limit on the split interval/number of candidate split
335  * points as appropriate. The "Prefix B-Trees" paper refers to this as
336  * sigma l for leaf splits and sigma b for internal ("branch") splits.
337  * It's hard to provide a theoretical justification for the initial size
338  * of the split interval, though it's clear that a small split interval
339  * makes suffix truncation much more effective without noticeably
340  * affecting space utilization over time.
341  */
342  state.interval = Min(Max(1, state.nsplits * 0.05),
343  state.is_leaf ? MAX_LEAF_INTERVAL :
345 
346  /*
347  * Save leftmost and rightmost splits for page before original ordinal
348  * sort order is lost by delta/fillfactormult sort
349  */
350  leftpage = state.splits[0];
351  rightpage = state.splits[state.nsplits - 1];
352 
353  /* Give split points a fillfactormult-wise delta, and sort on deltas */
354  _bt_deltasortsplits(&state, fillfactormult, usemult);
355 
356  /*
357  * Determine if default strategy/split interval will produce a
358  * sufficiently distinguishing split, or if we should change strategies.
359  * Alternative strategies change the range of split points that are
360  * considered acceptable (split interval), and possibly change
361  * fillfactormult, in order to deal with pages with a large number of
362  * duplicates gracefully.
363  *
364  * Pass low and high splits for the entire page (actually, they're for an
365  * imaginary version of the page that includes newitem). These are used
366  * when the initial split interval encloses split points that are full of
367  * duplicates, and we need to consider if it's even possible to avoid
368  * appending a heap TID.
369  */
370  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
371 
372  if (strategy == SPLIT_DEFAULT)
373  {
374  /*
375  * Default strategy worked out (always works out with internal page).
376  * Original split interval still stands.
377  */
378  }
379 
380  /*
381  * Many duplicates strategy is used when a heap TID would otherwise be
382  * appended, but the page isn't completely full of logical duplicates.
383  *
384  * The split interval is widened to include all legal candidate split
385  * points. There might be a few as two distinct values in the whole-page
386  * split interval, though it's also possible that most of the values on
387  * the page are unique. The final split point will either be to the
388  * immediate left or to the immediate right of the group of duplicate
389  * tuples that enclose the first/delta-optimal split point (perfect
390  * penalty was set so that the lowest delta split point that avoids
391  * appending a heap TID will be chosen). Maximizing the number of
392  * attributes that can be truncated away is not a goal of the many
393  * duplicates strategy.
394  *
395  * Single value strategy is used when it is impossible to avoid appending
396  * a heap TID. It arranges to leave the left page very full. This
397  * maximizes space utilization in cases where tuples with the same
398  * attribute values span many pages. Newly inserted duplicates will tend
399  * to have higher heap TID values, so we'll end up splitting to the right
400  * consistently. (Single value strategy is harmless though not
401  * particularly useful with !heapkeyspace indexes.)
402  */
403  else if (strategy == SPLIT_MANY_DUPLICATES)
404  {
405  Assert(state.is_leaf);
406  /* Shouldn't try to truncate away extra user attributes */
407  Assert(perfectpenalty ==
409  /* No need to resort splits -- no change in fillfactormult/deltas */
410  state.interval = state.nsplits;
411  }
412  else if (strategy == SPLIT_SINGLE_VALUE)
413  {
414  Assert(state.is_leaf);
415  /* Split near the end of the page */
416  usemult = true;
417  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
418  /* Resort split points with new delta */
419  _bt_deltasortsplits(&state, fillfactormult, usemult);
420  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
421  state.interval = 1;
422  }
423 
424  /*
425  * Search among acceptable split points (using final split interval) for
426  * the entry that has the lowest penalty, and is therefore expected to
427  * maximize fan-out. Sets *newitemonleft for us.
428  */
429  foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
430  strategy);
431  pfree(state.splits);
432 
433  return foundfirstright;
434 }
435 
436 /*
437  * Subroutine to record a particular point between two tuples (possibly the
438  * new item) on page (ie, combination of firstright and newitemonleft
439  * settings) in *state for later analysis. This is also a convenient point
440  * to check if the split is legal (if it isn't, it won't be recorded).
441  *
442  * firstoldonright is the offset of the first item on the original page that
443  * goes to the right page, and firstoldonrightsz is the size of that tuple.
444  * firstoldonright can be > max offset, which means that all the old items go
445  * to the left page and only the new item goes to the right page. In that
446  * case, firstoldonrightsz is not used.
447  *
448  * olddataitemstoleft is the total size of all old items to the left of the
449  * split point that is recorded here when legal. Should not include
450  * newitemsz, since that is handled here.
451  */
452 static void
454  OffsetNumber firstoldonright,
455  bool newitemonleft,
456  int olddataitemstoleft,
457  Size firstoldonrightsz)
458 {
459  int16 leftfree,
460  rightfree;
461  Size firstrightitemsz;
462  bool newitemisfirstonright;
463 
464  /* Is the new item going to be the first item on the right page? */
465  newitemisfirstonright = (firstoldonright == state->newitemoff
466  && !newitemonleft);
467 
468  if (newitemisfirstonright)
469  firstrightitemsz = state->newitemsz;
470  else
471  firstrightitemsz = firstoldonrightsz;
472 
473  /* Account for all the old tuples */
474  leftfree = state->leftspace - olddataitemstoleft;
475  rightfree = state->rightspace -
476  (state->olddataitemstotal - olddataitemstoleft);
477 
478  /*
479  * The first item on the right page becomes the high key of the left page;
480  * therefore it counts against left space as well as right space (we
481  * cannot assume that suffix truncation will make it any smaller). When
482  * index has included attributes, then those attributes of left page high
483  * key will be truncated leaving that page with slightly more free space.
484  * However, that shouldn't affect our ability to find valid split
485  * location, since we err in the direction of being pessimistic about free
486  * space on the left half. Besides, even when suffix truncation of
487  * non-TID attributes occurs, the new high key often won't even be a
488  * single MAXALIGN() quantum smaller than the firstright tuple it's based
489  * on.
490  *
491  * If we are on the leaf level, assume that suffix truncation cannot avoid
492  * adding a heap TID to the left half's new high key when splitting at the
493  * leaf level. In practice the new high key will often be smaller and
494  * will rarely be larger, but conservatively assume the worst case.
495  */
496  if (state->is_leaf)
497  leftfree -= (int16) (firstrightitemsz +
498  MAXALIGN(sizeof(ItemPointerData)));
499  else
500  leftfree -= (int16) firstrightitemsz;
501 
502  /* account for the new item */
503  if (newitemonleft)
504  leftfree -= (int16) state->newitemsz;
505  else
506  rightfree -= (int16) state->newitemsz;
507 
508  /*
509  * If we are not on the leaf level, we will be able to discard the key
510  * data from the first item that winds up on the right page.
511  */
512  if (!state->is_leaf)
513  rightfree += (int16) firstrightitemsz -
514  (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
515 
516  /* Record split if legal */
517  if (leftfree >= 0 && rightfree >= 0)
518  {
519  Assert(state->nsplits < state->maxsplits);
520 
521  /* Determine smallest firstright item size on page */
522  state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz);
523 
524  state->splits[state->nsplits].curdelta = 0;
525  state->splits[state->nsplits].leftfree = leftfree;
526  state->splits[state->nsplits].rightfree = rightfree;
527  state->splits[state->nsplits].firstoldonright = firstoldonright;
528  state->splits[state->nsplits].newitemonleft = newitemonleft;
529  state->nsplits++;
530  }
531 }
532 
533 /*
534  * Subroutine to assign space deltas to materialized array of candidate split
535  * points based on current fillfactor, and to sort array using that fillfactor
536  */
537 static void
538 _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
539  bool usemult)
540 {
541  for (int i = 0; i < state->nsplits; i++)
542  {
543  SplitPoint *split = state->splits + i;
544  int16 delta;
545 
546  if (usemult)
547  delta = fillfactormult * split->leftfree -
548  (1.0 - fillfactormult) * split->rightfree;
549  else
550  delta = split->leftfree - split->rightfree;
551 
552  if (delta < 0)
553  delta = -delta;
554 
555  /* Save delta */
556  split->curdelta = delta;
557  }
558 
559  qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
560 }
561 
562 /*
563  * qsort-style comparator used by _bt_deltasortsplits()
564  */
565 static int
566 _bt_splitcmp(const void *arg1, const void *arg2)
567 {
568  SplitPoint *split1 = (SplitPoint *) arg1;
569  SplitPoint *split2 = (SplitPoint *) arg2;
570 
571  if (split1->curdelta > split2->curdelta)
572  return 1;
573  if (split1->curdelta < split2->curdelta)
574  return -1;
575 
576  return 0;
577 }
578 
579 /*
580  * Subroutine to determine whether or not a non-rightmost leaf page should be
581  * split immediately after the would-be original page offset for the
582  * new/incoming tuple (or should have leaf fillfactor applied when new item is
583  * to the right on original page). This is appropriate when there is a
584  * pattern of localized monotonically increasing insertions into a composite
585  * index, where leading attribute values form local groupings, and we
586  * anticipate further insertions of the same/current grouping (new item's
587  * grouping) in the near future. This can be thought of as a variation on
588  * applying leaf fillfactor during rightmost leaf page splits, since cases
589  * that benefit will converge on packing leaf pages leaffillfactor% full over
590  * time.
591  *
592  * We may leave extra free space remaining on the rightmost page of a "most
593  * significant column" grouping of tuples if that grouping never ends up
594  * having future insertions that use the free space. That effect is
595  * self-limiting; a future grouping that becomes the "nearest on the right"
596  * grouping of the affected grouping usually puts the extra free space to good
597  * use.
598  *
599  * Caller uses optimization when routine returns true, though the exact action
600  * taken by caller varies. Caller uses original leaf page fillfactor in
601  * standard way rather than using the new item offset directly when *usemult
602  * was also set to true here. Otherwise, caller applies optimization by
603  * locating the legal split point that makes the new tuple the very last tuple
604  * on the left side of the split.
605  */
606 static bool
608  int leaffillfactor, bool *usemult)
609 {
610  int16 nkeyatts;
611  ItemId itemid;
612  IndexTuple tup;
613  int keepnatts;
614 
615  Assert(state->is_leaf && !state->is_rightmost);
616 
617  nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
618 
619  /* Single key indexes not considered here */
620  if (nkeyatts == 1)
621  return false;
622 
623  /* Ascending insertion pattern never inferred when new item is first */
624  if (state->newitemoff == P_FIRSTKEY)
625  return false;
626 
627  /*
628  * Only apply optimization on pages with equisized tuples, since ordinal
629  * keys are likely to be fixed-width. Testing if the new tuple is
630  * variable width directly might also work, but that fails to apply the
631  * optimization to indexes with a numeric_ops attribute.
632  *
633  * Conclude that page has equisized tuples when the new item is the same
634  * width as the smallest item observed during pass over page, and other
635  * non-pivot tuples must be the same width as well. (Note that the
636  * possibly-truncated existing high key isn't counted in
637  * olddataitemstotal, and must be subtracted from maxoff.)
638  */
639  if (state->newitemsz != state->minfirstrightsz)
640  return false;
641  if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
642  return false;
643 
644  /*
645  * Avoid applying optimization when tuples are wider than a tuple
646  * consisting of two non-NULL int8/int64 attributes (or four non-NULL
647  * int4/int32 attributes)
648  */
649  if (state->newitemsz >
650  MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
651  sizeof(ItemIdData))
652  return false;
653 
654  /*
655  * At least the first attribute's value must be equal to the corresponding
656  * value in previous tuple to apply optimization. New item cannot be a
657  * duplicate, either.
658  *
659  * Handle case where new item is to the right of all items on the existing
660  * page. This is suggestive of monotonically increasing insertions in
661  * itself, so the "heap TID adjacency" test is not applied here.
662  */
663  if (state->newitemoff > maxoff)
664  {
665  itemid = PageGetItemId(state->page, maxoff);
666  tup = (IndexTuple) PageGetItem(state->page, itemid);
667  keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
668 
669  if (keepnatts > 1 && keepnatts <= nkeyatts)
670  {
671  *usemult = true;
672  return true;
673  }
674 
675  return false;
676  }
677 
678  /*
679  * "Low cardinality leading column, high cardinality suffix column"
680  * indexes with a random insertion pattern (e.g., an index with a boolean
681  * column, such as an index on '(book_is_in_print, book_isbn)') present us
682  * with a risk of consistently misapplying the optimization. We're
683  * willing to accept very occasional misapplication of the optimization,
684  * provided the cases where we get it wrong are rare and self-limiting.
685  *
686  * Heap TID adjacency strongly suggests that the item just to the left was
687  * inserted very recently, which limits overapplication of the
688  * optimization. Besides, all inappropriate cases triggered here will
689  * still split in the middle of the page on average.
690  */
691  itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
692  tup = (IndexTuple) PageGetItem(state->page, itemid);
693  /* Do cheaper test first */
694  if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
695  return false;
696  /* Check same conditions as rightmost item case, too */
697  keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
698 
699  if (keepnatts > 1 && keepnatts <= nkeyatts)
700  {
701  double interp = (double) state->newitemoff / ((double) maxoff + 1);
702  double leaffillfactormult = (double) leaffillfactor / 100.0;
703 
704  /*
705  * Don't allow caller to split after a new item when it will result in
706  * a split point to the right of the point that a leaf fillfactor
707  * split would use -- have caller apply leaf fillfactor instead
708  */
709  *usemult = interp > leaffillfactormult;
710 
711  return true;
712  }
713 
714  return false;
715 }
716 
717 /*
718  * Subroutine for determining if two heap TIDS are "adjacent".
719  *
720  * Adjacent means that the high TID is very likely to have been inserted into
721  * heap relation immediately after the low TID, probably during the current
722  * transaction.
723  */
724 static bool
726 {
727  BlockNumber lowblk,
728  highblk;
729 
730  lowblk = ItemPointerGetBlockNumber(lowhtid);
731  highblk = ItemPointerGetBlockNumber(highhtid);
732 
733  /* Make optimistic assumption of adjacency when heap blocks match */
734  if (lowblk == highblk)
735  return true;
736 
737  /* When heap block one up, second offset should be FirstOffsetNumber */
738  if (lowblk + 1 == highblk &&
740  return true;
741 
742  return false;
743 }
744 
745 /*
746  * Subroutine to find the "best" split point among candidate split points.
747  * The best split point is the split point with the lowest penalty among split
748  * points that fall within current/final split interval. Penalty is an
749  * abstract score, with a definition that varies depending on whether we're
750  * splitting a leaf page or an internal page. See _bt_split_penalty() for
751  * details.
752  *
753  * "perfectpenalty" is assumed to be the lowest possible penalty among
754  * candidate split points. This allows us to return early without wasting
755  * cycles on calculating the first differing attribute for all candidate
756  * splits when that clearly cannot improve our choice (or when we only want a
757  * minimally distinguishing split point, and don't want to make the split any
758  * more unbalanced than is necessary).
759  *
760  * We return the index of the first existing tuple that should go on the right
761  * page, plus a boolean indicating if new item is on left of split point.
762  */
763 static OffsetNumber
764 _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
765  bool *newitemonleft, FindSplitStrat strategy)
766 {
767  int bestpenalty,
768  lowsplit;
769  int highsplit = Min(state->interval, state->nsplits);
770  SplitPoint *final;
771 
772  bestpenalty = INT_MAX;
773  lowsplit = 0;
774  for (int i = lowsplit; i < highsplit; i++)
775  {
776  int penalty;
777 
778  penalty = _bt_split_penalty(state, state->splits + i);
779 
780  if (penalty <= perfectpenalty)
781  {
782  bestpenalty = penalty;
783  lowsplit = i;
784  break;
785  }
786 
787  if (penalty < bestpenalty)
788  {
789  bestpenalty = penalty;
790  lowsplit = i;
791  }
792  }
793 
794  final = &state->splits[lowsplit];
795 
796  /*
797  * There is a risk that the "many duplicates" strategy will repeatedly do
798  * the wrong thing when there are monotonically decreasing insertions to
799  * the right of a large group of duplicates. Repeated splits could leave
800  * a succession of right half pages with free space that can never be
801  * used. This must be avoided.
802  *
803  * Consider the example of the leftmost page in a single integer attribute
804  * NULLS FIRST index which is almost filled with NULLs. Monotonically
805  * decreasing integer insertions might cause the same leftmost page to
806  * split repeatedly at the same point. Each split derives its new high
807  * key from the lowest current value to the immediate right of the large
808  * group of NULLs, which will always be higher than all future integer
809  * insertions, directing all future integer insertions to the same
810  * leftmost page.
811  */
812  if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
813  !final->newitemonleft && final->firstoldonright >= state->newitemoff &&
814  final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL)
815  {
816  /*
817  * Avoid the problem by peforming a 50:50 split when the new item is
818  * just to the right of the would-be "many duplicates" split point.
819  */
820  final = &state->splits[0];
821  }
822 
823  *newitemonleft = final->newitemonleft;
824  return final->firstoldonright;
825 }
826 
827 /*
828  * Subroutine to decide whether split should use default strategy/initial
829  * split interval, or whether it should finish splitting the page using
830  * alternative strategies (this is only possible with leaf pages).
831  *
832  * Caller uses alternative strategy (or sticks with default strategy) based
833  * on how *strategy is set here. Return value is "perfect penalty", which is
834  * passed to _bt_bestsplitloc() as a final constraint on how far caller is
835  * willing to go to avoid appending a heap TID when using the many duplicates
836  * strategy (it also saves _bt_bestsplitloc() useless cycles).
837  */
838 static int
840  SplitPoint *rightpage, FindSplitStrat *strategy)
841 {
842  IndexTuple leftmost,
843  rightmost;
844  SplitPoint *leftinterval,
845  *rightinterval;
846  int perfectpenalty;
847  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
848 
849  /* Assume that alternative strategy won't be used for now */
850  *strategy = SPLIT_DEFAULT;
851 
852  /*
853  * Use smallest observed first right item size for entire page as perfect
854  * penalty on internal pages. This can save cycles in the common case
855  * where most or all splits (not just splits within interval) have first
856  * right tuples that are the same size.
857  */
858  if (!state->is_leaf)
859  return state->minfirstrightsz;
860 
861  /*
862  * Use leftmost and rightmost tuples from leftmost and rightmost splits in
863  * current split interval
864  */
865  _bt_interval_edges(state, &leftinterval, &rightinterval);
866  leftmost = _bt_split_lastleft(state, leftinterval);
867  rightmost = _bt_split_firstright(state, rightinterval);
868 
869  /*
870  * If initial split interval can produce a split point that will at least
871  * avoid appending a heap TID in new high key, we're done. Finish split
872  * with default strategy and initial split interval.
873  */
874  perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
875  if (perfectpenalty <= indnkeyatts)
876  return perfectpenalty;
877 
878  /*
879  * Work out how caller should finish split when even their "perfect"
880  * penalty for initial/default split interval indicates that the interval
881  * does not contain even a single split that avoids appending a heap TID.
882  *
883  * Use the leftmost split's lastleft tuple and the rightmost split's
884  * firstright tuple to assess every possible split.
885  */
886  leftmost = _bt_split_lastleft(state, leftpage);
887  rightmost = _bt_split_firstright(state, rightpage);
888 
889  /*
890  * If page (including new item) has many duplicates but is not entirely
891  * full of duplicates, a many duplicates strategy split will be performed.
892  * If page is entirely full of duplicates, a single value strategy split
893  * will be performed.
894  */
895  perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
896  if (perfectpenalty <= indnkeyatts)
897  {
898  *strategy = SPLIT_MANY_DUPLICATES;
899 
900  /*
901  * Many duplicates strategy should split at either side the group of
902  * duplicates that enclose the delta-optimal split point. Return
903  * indnkeyatts rather than the true perfect penalty to make that
904  * happen. (If perfectpenalty was returned here then low cardinality
905  * composite indexes could have continual unbalanced splits.)
906  *
907  * Note that caller won't go through with a many duplicates split in
908  * rare cases where it looks like there are ever-decreasing insertions
909  * to the immediate right of the split point. This must happen just
910  * before a final decision is made, within _bt_bestsplitloc().
911  */
912  return indnkeyatts;
913  }
914 
915  /*
916  * Single value strategy is only appropriate with ever-increasing heap
917  * TIDs; otherwise, original default strategy split should proceed to
918  * avoid pathological performance. Use page high key to infer if this is
919  * the rightmost page among pages that store the same duplicate value.
920  * This should not prevent insertions of heap TIDs that are slightly out
921  * of order from using single value strategy, since that's expected with
922  * concurrent inserters of the same duplicate value.
923  */
924  else if (state->is_rightmost)
925  *strategy = SPLIT_SINGLE_VALUE;
926  else
927  {
928  ItemId itemid;
929  IndexTuple hikey;
930 
931  itemid = PageGetItemId(state->page, P_HIKEY);
932  hikey = (IndexTuple) PageGetItem(state->page, itemid);
933  perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
934  state->newitem);
935  if (perfectpenalty <= indnkeyatts)
936  *strategy = SPLIT_SINGLE_VALUE;
937  else
938  {
939  /*
940  * Have caller finish split using default strategy, since page
941  * does not appear to be the rightmost page for duplicates of the
942  * value the page is filled with
943  */
944  }
945  }
946 
947  return perfectpenalty;
948 }
949 
950 /*
951  * Subroutine to locate leftmost and rightmost splits for current/default
952  * split interval. Note that it will be the same split iff there is only one
953  * split in interval.
954  */
955 static void
957  SplitPoint **rightinterval)
958 {
959  int highsplit = Min(state->interval, state->nsplits);
960  SplitPoint *deltaoptimal;
961 
962  deltaoptimal = state->splits;
963  *leftinterval = NULL;
964  *rightinterval = NULL;
965 
966  /*
967  * Delta is an absolute distance to optimal split point, so both the
968  * leftmost and rightmost split point will usually be at the end of the
969  * array
970  */
971  for (int i = highsplit - 1; i >= 0; i--)
972  {
973  SplitPoint *distant = state->splits + i;
974 
975  if (distant->firstoldonright < deltaoptimal->firstoldonright)
976  {
977  if (*leftinterval == NULL)
978  *leftinterval = distant;
979  }
980  else if (distant->firstoldonright > deltaoptimal->firstoldonright)
981  {
982  if (*rightinterval == NULL)
983  *rightinterval = distant;
984  }
985  else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
986  {
987  /*
988  * "incoming tuple will become first on right page" (distant) is
989  * to the left of "incoming tuple will become last on left page"
990  * (delta-optimal)
991  */
992  Assert(distant->firstoldonright == state->newitemoff);
993  if (*leftinterval == NULL)
994  *leftinterval = distant;
995  }
996  else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
997  {
998  /*
999  * "incoming tuple will become last on left page" (distant) is to
1000  * the right of "incoming tuple will become first on right page"
1001  * (delta-optimal)
1002  */
1003  Assert(distant->firstoldonright == state->newitemoff);
1004  if (*rightinterval == NULL)
1005  *rightinterval = distant;
1006  }
1007  else
1008  {
1009  /* There was only one or two splits in initial split interval */
1010  Assert(distant == deltaoptimal);
1011  if (*leftinterval == NULL)
1012  *leftinterval = distant;
1013  if (*rightinterval == NULL)
1014  *rightinterval = distant;
1015  }
1016 
1017  if (*leftinterval && *rightinterval)
1018  return;
1019  }
1020 
1021  Assert(false);
1022 }
1023 
1024 /*
1025  * Subroutine to find penalty for caller's candidate split point.
1026  *
1027  * On leaf pages, penalty is the attribute number that distinguishes each side
1028  * of a split. It's the last attribute that needs to be included in new high
1029  * key for left page. It can be greater than the number of key attributes in
1030  * cases where a heap TID will need to be appended during truncation.
1031  *
1032  * On internal pages, penalty is simply the size of the first item on the
1033  * right half of the split (including line pointer overhead). This tuple will
1034  * become the new high key for the left page.
1035  */
1036 static inline int
1038 {
1039  IndexTuple lastleftuple;
1040  IndexTuple firstrighttuple;
1041 
1042  if (!state->is_leaf)
1043  {
1044  ItemId itemid;
1045 
1046  if (!split->newitemonleft &&
1047  split->firstoldonright == state->newitemoff)
1048  return state->newitemsz;
1049 
1050  itemid = PageGetItemId(state->page, split->firstoldonright);
1051 
1052  return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1053  }
1054 
1055  lastleftuple = _bt_split_lastleft(state, split);
1056  firstrighttuple = _bt_split_firstright(state, split);
1057 
1058  Assert(lastleftuple != firstrighttuple);
1059  return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
1060 }
1061 
1062 /*
1063  * Subroutine to get a lastleft IndexTuple for a spit point from page
1064  */
1065 static inline IndexTuple
1067 {
1068  ItemId itemid;
1069 
1070  if (split->newitemonleft && split->firstoldonright == state->newitemoff)
1071  return state->newitem;
1072 
1073  itemid = PageGetItemId(state->page,
1075  return (IndexTuple) PageGetItem(state->page, itemid);
1076 }
1077 
1078 /*
1079  * Subroutine to get a firstright IndexTuple for a spit point from page
1080  */
1081 static inline IndexTuple
1083 {
1084  ItemId itemid;
1085 
1086  if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
1087  return state->newitem;
1088 
1089  itemid = PageGetItemId(state->page, split->firstoldonright);
1090  return (IndexTuple) PageGetItem(state->page, itemid);
1091 }
signed short int16
Definition: c.h:345
OffsetNumber firstoldonright
Definition: nbtsplitloc.c:40
static int _bt_splitcmp(const void *arg1, const void *arg2)
Definition: nbtsplitloc.c:566
bool is_rightmost
Definition: nbtsplitloc.c:53
Size minfirstrightsz
Definition: nbtsplitloc.c:58
#define MAX_LEAF_INTERVAL
Definition: nbtsplitloc.c:21
static int _bt_split_penalty(FindSplitData *state, SplitPoint *split)
Definition: nbtsplitloc.c:1037
IndexTuple newitem
Definition: nbtsplitloc.c:50
int16 leftfree
Definition: nbtsplitloc.c:36
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
#define Min(x, y)
Definition: c.h:890
bool newitemonleft
Definition: nbtsplitloc.c:41
uint32 BlockNumber
Definition: block.h:31
#define SizeOfPageHeaderData
Definition: bufpage.h:216
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:839
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
#define final(a, b, c)
Definition: hashfn.c:118
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
SplitPoint * splits
Definition: nbtsplitloc.c:63
uint16 OffsetNumber
Definition: off.h:24
static IndexTuple _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
Definition: nbtsplitloc.c:1066
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:170
#define ERROR
Definition: elog.h:43
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:764
static void _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, SplitPoint **rightinterval)
Definition: nbtsplitloc.c:956
OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:127
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define P_FIRSTKEY
Definition: nbtree.h:218
#define PageGetPageSize(page)
Definition: bufpage.h:268
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:607
#define RelationGetRelationName(relation)
Definition: rel.h:448
struct ItemIdData ItemIdData
Relation rel
Definition: nbtsplitloc.c:48
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:433
#define BTREE_DEFAULT_FILLFACTOR
Definition: nbtree.h:169
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:171
int16 rightfree
Definition: nbtsplitloc.c:37
static IndexTuple _bt_split_firstright(FindSplitData *state, SplitPoint *split)
Definition: nbtsplitloc.c:1082
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define RelationGetFillFactor(relation, defaultff)
Definition: rel.h:290
FindSplitStrat
Definition: nbtsplitloc.c:24
int16 curdelta
Definition: nbtsplitloc.c:35
#define SIZE_MAX
Definition: c.h:452
#define Max(x, y)
Definition: c.h:884
#define MAX_INTERNAL_INTERVAL
Definition: nbtsplitloc.c:22
int olddataitemstotal
Definition: nbtsplitloc.c:57
#define Assert(condition)
Definition: c.h:732
Definition: regguts.h:298
OffsetNumber newitemoff
Definition: nbtsplitloc.c:54
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
#define MAXALIGN(LEN)
Definition: c.h:685
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
Definition: nbtsplitloc.c:725
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int olddataitemstoleft, Size firstoldonrightsz)
Definition: nbtsplitloc.c:453
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:632
#define P_HIKEY
Definition: nbtree.h:217
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2335
void * palloc(Size size)
Definition: mcxt.c:924
#define elog(elevel,...)
Definition: elog.h:226
int i
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define qsort(a, b, c, d)
Definition: port.h:491
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:538
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189