nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/transam.h"
21 #include "access/xloginsert.h"
22 #include "lib/qunique.h"
23 #include "miscadmin.h"
24 #include "storage/lmgr.h"
25 #include "storage/predicate.h"
26 #include "storage/smgr.h"
27 
28 /* Minimum tree height for application of fastpath optimization */
29 #define BTREE_FASTPATH_MIN_LEVEL 2
30 
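/*
 * A minimal sketch (not part of nbtinsert.c itself) of how the threshold
 * above is applied: _bt_insertonpg() only caches the rightmost leaf block
 * for the fastpath when the tree is at least this tall.  The helper name
 * example_fastpath_eligible is hypothetical.
 */
static inline bool
example_fastpath_eligible(Relation rel)
{
	/* mirrors the _bt_getrootheight() check made before caching the block */
	return _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL;
}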
31 
32 static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
33 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
34  Relation heapRel,
35  IndexUniqueCheck checkUnique, bool *is_unique,
36  uint32 *speculativeToken);
37 static OffsetNumber _bt_findinsertloc(Relation rel,
38  BTInsertState insertstate,
39  bool checkingunique,
40  bool indexUnchanged,
41  BTStack stack,
42  Relation heapRel);
43 static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
44 static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
45  Buffer buf,
46  Buffer cbuf,
47  BTStack stack,
48  IndexTuple itup,
49  Size itemsz,
50  OffsetNumber newitemoff,
51  int postingoff,
52  bool split_only_page);
53 static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
54  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
55  IndexTuple newitem, IndexTuple orignewitem,
56  IndexTuple nposting, uint16 postingoff);
57 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
58  BTStack stack, bool isroot, bool isonly);
59 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
60 static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
61  OffsetNumber itup_off, bool newfirstdataitem);
62 static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
63  BTInsertState insertstate,
64  bool simpleonly, bool checkingunique,
65  bool uniquedup, bool indexUnchanged);
66 static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
67  OffsetNumber *deletable, int ndeletable,
68  IndexTuple newitem, OffsetNumber minoff,
69  OffsetNumber maxoff);
70 static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
71  int ndeletable, IndexTuple newitem,
72  int *nblocks);
73 static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
74 
75 /*
76  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
77  *
78  * This routine is called by the public interface routine, btinsert.
79  * By here, itup is filled in, including the TID.
80  *
81  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
82  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
83  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
84  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
85  * don't actually insert.
86  *
87  * indexUnchanged executor hint indicates if itup is from an
88  * UPDATE that didn't logically change the indexed value, but
89  * must nevertheless have a new entry to point to a successor
90  * version.
91  *
92  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
93  * it must be true if the entry is known unique, else false.
94  * (In the current implementation we'll also return true after a
95  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
96  * that's just a coding artifact.)
97  */
98 bool
99 _bt_doinsert(Relation rel, IndexTuple itup,
100  IndexUniqueCheck checkUnique, bool indexUnchanged,
101  Relation heapRel)
102 {
103  bool is_unique = false;
104  BTInsertStateData insertstate;
105  BTScanInsert itup_key;
106  BTStack stack;
107  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
108 
109  /* we need an insertion scan key to do our search, so build one */
110  itup_key = _bt_mkscankey(rel, itup);
111 
112  if (checkingunique)
113  {
114  if (!itup_key->anynullkeys)
115  {
116  /* No (heapkeyspace) scantid until uniqueness established */
117  itup_key->scantid = NULL;
118  }
119  else
120  {
121  /*
122  * Scan key for new tuple contains NULL key values. Bypass
123  * checkingunique steps. They are unnecessary because core code
124  * considers NULL unequal to every value, including NULL.
125  *
126  * This optimization avoids O(N^2) behavior within the
127  * _bt_findinsertloc() heapkeyspace path when a unique index has a
128  * large number of "duplicates" with NULL key values.
129  */
130  checkingunique = false;
131  /* Tuple is unique in the sense that core code cares about */
132  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
133  is_unique = true;
134  }
135  }
136 
137  /*
138  * Fill in the BTInsertState working area, to track the current page and
139  * position within the page to insert on.
140  *
141  * Note that itemsz is passed down to lower level code that deals with
142  * inserting the item. It must be MAXALIGN()'d. This ensures that space
143  * accounting code consistently considers the alignment overhead that we
144  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
145  * already conservative about alignment, but we don't rely on that from
146  * this distance. Besides, preserving the "true" tuple size in index
147  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
148  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
149  */
150  insertstate.itup = itup;
151  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
152  insertstate.itup_key = itup_key;
153  insertstate.bounds_valid = false;
154  insertstate.buf = InvalidBuffer;
155  insertstate.postingoff = 0;
156 
157 search:
158 
159  /*
160  * Find and lock the leaf page that the tuple should be added to by
161  * searching from the root page. insertstate.buf will hold a buffer that
162  * is locked in exclusive mode afterwards.
163  */
164  stack = _bt_search_insert(rel, &insertstate);
165 
166  /*
167  * checkingunique inserts are not allowed to go ahead when two tuples with
168  * equal key attribute values would be visible to new MVCC snapshots once
169  * the xact commits. Check for conflicts in the locked page/buffer (if
170  * needed) here.
171  *
172  * It might be necessary to check a page to the right in _bt_check_unique,
173  * though that should be very rare. In practice the first page the value
174  * could be on (with scantid omitted) is almost always also the only page
175  * that a matching tuple might be found on. This is due to the behavior
176  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
177  * only be allowed to cross a page boundary when there is no candidate
178  * leaf page split point that avoids it. Also, _bt_check_unique can use
179  * the leaf page high key to determine that there will be no duplicates on
180  * the right sibling without actually visiting it (it uses the high key in
181  * cases where the new item happens to belong at the far right of the leaf
182  * page).
183  *
184  * NOTE: obviously, _bt_check_unique can only detect keys that are already
185  * in the index; so it cannot defend against concurrent insertions of the
186  * same key. We protect against that by means of holding a write lock on
187  * the first page the value could be on, with omitted/-inf value for the
188  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
189  * the same key must acquire a write lock on the same page, so only one
190  * would-be inserter can be making the check at one time. Furthermore,
191  * once we are past the check we hold write locks continuously until we
192  * have performed our insertion, so no later inserter can fail to see our
193  * insertion. (This requires some care in _bt_findinsertloc.)
194  *
195  * If we must wait for another xact, we release the lock while waiting,
196  * and then must perform a new search.
197  *
198  * For a partial uniqueness check, we don't wait for the other xact. Just
199  * let the tuple in and return false for possibly non-unique, or true for
200  * definitely unique.
201  */
202  if (checkingunique)
203  {
204  TransactionId xwait;
205  uint32 speculativeToken;
206 
207  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
208  &is_unique, &speculativeToken);
209 
210  if (unlikely(TransactionIdIsValid(xwait)))
211  {
212  /* Have to wait for the other guy ... */
213  _bt_relbuf(rel, insertstate.buf);
214  insertstate.buf = InvalidBuffer;
215 
216  /*
217  * If it's a speculative insertion, wait for it to finish (ie. to
218  * go ahead with the insertion, or kill the tuple). Otherwise
219  * wait for the transaction to finish as usual.
220  */
221  if (speculativeToken)
222  SpeculativeInsertionWait(xwait, speculativeToken);
223  else
224  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
225 
226  /* start over... */
227  if (stack)
228  _bt_freestack(stack);
229  goto search;
230  }
231 
232  /* Uniqueness is established -- restore heap tid as scantid */
233  if (itup_key->heapkeyspace)
234  itup_key->scantid = &itup->t_tid;
235  }
236 
237  if (checkUnique != UNIQUE_CHECK_EXISTING)
238  {
239  OffsetNumber newitemoff;
240 
241  /*
242  * The only conflict predicate locking cares about for indexes is when
243  * an index tuple insert conflicts with an existing lock. We don't
244  * know the actual page we're going to insert on for sure just yet in
245  * checkingunique and !heapkeyspace cases, but it's okay to use the
246  * first page the value could be on (with scantid omitted) instead.
247  */
248  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
249 
250  /*
251  * Do the insertion. Note that insertstate contains cached binary
252  * search bounds established within _bt_check_unique when insertion is
253  * checkingunique.
254  */
255  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
256  indexUnchanged, stack, heapRel);
257  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
258  itup, insertstate.itemsz, newitemoff,
259  insertstate.postingoff, false);
260  }
261  else
262  {
263  /* just release the buffer */
264  _bt_relbuf(rel, insertstate.buf);
265  }
266 
267  /* be tidy */
268  if (stack)
269  _bt_freestack(stack);
270  pfree(itup_key);
271 
272  return is_unique;
273 }
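/*
 * A minimal sketch (not part of nbtinsert.c itself) of the MAXALIGN() space
 * accounting applied to insertstate.itemsz above.  The helper name
 * example_nbtree_itemsz is hypothetical; it simply restates the rule that
 * nbtree space accounting always works with MAXALIGN()'d tuple sizes.
 */
static inline Size
example_nbtree_itemsz(IndexTuple itup)
{
	/* same computation as insertstate.itemsz in _bt_doinsert() */
	return MAXALIGN(IndexTupleSize(itup));
}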
274 
275 /*
276  * _bt_search_insert() -- _bt_search() wrapper for inserts
277  *
278  * Search the tree for a particular scankey, or more precisely for the first
279  * leaf page it could be on. Try to make use of the fastpath optimization's
280  * rightmost leaf page cache before actually searching the tree from the root
281  * page, though.
282  *
283  * Return value is a stack of parent-page pointers (though see notes about
284  * fastpath optimization and page splits below). insertstate->buf is set to
285  * the address of the leaf-page buffer, which is write-locked and pinned in
286  * all cases (if necessary by creating a new empty root page for caller).
287  *
288  * The fastpath optimization avoids most of the work of searching the tree
289  * repeatedly when a single backend inserts successive new tuples on the
290  * rightmost leaf page of an index. A backend cache of the rightmost leaf
291  * page is maintained within _bt_insertonpg(), and used here. The cache is
292  * invalidated here when an insert of a non-pivot tuple must take place on a
293  * non-rightmost leaf page.
294  *
295  * The optimization helps with indexes on an auto-incremented field. It also
296  * helps with indexes on datetime columns, as well as indexes with lots of
297  * NULL values. (NULLs usually get inserted in the rightmost page for single
298  * column indexes, since they usually get treated as coming after everything
299  * else in the key space. Individual NULL tuples will generally be placed on
300  * the rightmost leaf page due to the influence of the heap TID column.)
301  *
302  * Note that we avoid applying the optimization when there is insufficient
303  * space on the rightmost page to fit caller's new item. This is necessary
304  * because we'll need to return a real descent stack when a page split is
305  * expected (actually, caller can cope with a leaf page split that uses a NULL
306  * stack, but that's very slow and so must be avoided). Note also that the
307  * fastpath optimization acquires the lock on the page conditionally as a way
308  * of reducing extra contention when there are concurrent insertions into the
309  * rightmost page (we give up if we'd have to wait for the lock). We assume
310  * that it isn't useful to apply the optimization when there is contention,
311  * since each per-backend cache won't stay valid for long.
312  */
313 static BTStack
314 _bt_search_insert(Relation rel, BTInsertState insertstate)
315 {
316  Assert(insertstate->buf == InvalidBuffer);
317  Assert(!insertstate->bounds_valid);
318  Assert(insertstate->postingoff == 0);
319 
320  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
321  {
322  /* Simulate a _bt_getbuf() call with conditional locking */
323  insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
324  if (_bt_conditionallockbuf(rel, insertstate->buf))
325  {
326  Page page;
327  BTPageOpaque opaque;
328 
329  _bt_checkpage(rel, insertstate->buf);
330  page = BufferGetPage(insertstate->buf);
331  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
332 
333  /*
334  * Check if the page is still the rightmost leaf page and has
335  * enough free space to accommodate the new tuple. Also check
336  * that the insertion scan key is strictly greater than the first
337  * non-pivot tuple on the page. (Note that we expect itup_key's
338  * scantid to be unset when our caller is a checkingunique
339  * inserter.)
340  */
341  if (P_RIGHTMOST(opaque) &&
342  P_ISLEAF(opaque) &&
343  !P_IGNORE(opaque) &&
344  PageGetFreeSpace(page) > insertstate->itemsz &&
345  PageGetMaxOffsetNumber(page) >= P_HIKEY &&
346  _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
347  {
348  /*
349  * Caller can use the fastpath optimization because cached
350  * block is still rightmost leaf page, which can fit caller's
351  * new tuple without splitting. Keep block in local cache for
352  * next insert, and have caller use NULL stack.
353  *
354  * Note that _bt_insert_parent() has an assertion that catches
355  * leaf page splits that somehow follow from a fastpath insert
356  * (it should only be passed a NULL stack when it must deal
357  * with a concurrent root page split, and never because a NULL
358  * stack was returned here).
359  */
360  return NULL;
361  }
362 
363  /* Page unsuitable for caller, drop lock and pin */
364  _bt_relbuf(rel, insertstate->buf);
365  }
366  else
367  {
368  /* Lock unavailable, drop pin */
369  ReleaseBuffer(insertstate->buf);
370  }
371 
372  /* Forget block, since cache doesn't appear to be useful */
373  RelationSetTargetBlock(rel, InvalidBlockNumber);
374  }
375 
376  /* Cannot use optimization -- descend tree, return proper descent stack */
377  return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
378  NULL);
379 }
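/*
 * A minimal sketch (not part of nbtinsert.c itself) of the per-backend
 * rightmost-block cache consulted above.  The helper name
 * example_have_cached_rightmost is hypothetical; RelationGetTargetBlock()
 * and RelationSetTargetBlock() are the macros actually used by
 * _bt_search_insert() and _bt_insertonpg().
 */
static inline bool
example_have_cached_rightmost(Relation rel)
{
	/* a cached block is only a hint; it is always revalidated under lock */
	return RelationGetTargetBlock(rel) != InvalidBlockNumber;
}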
380 
381 /*
382  * _bt_check_unique() -- Check for violation of unique index constraint
383  *
384  * Returns InvalidTransactionId if there is no conflict, else an xact ID
385  * we must wait for to see if it commits a conflicting tuple. If an actual
386  * conflict is detected, no return --- just ereport(). If an xact ID is
387  * returned, and the conflicting tuple still has a speculative insertion in
388  * progress, *speculativeToken is set to non-zero, and the caller can wait for
389  * the verdict on the insertion using SpeculativeInsertionWait().
390  *
391  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
392  * InvalidTransactionId because we don't want to wait. In this case we
393  * set *is_unique to false if there is a potential conflict, and the
394  * core code must redo the uniqueness check later.
395  *
396  * As a side-effect, sets state in insertstate that can later be used by
397  * _bt_findinsertloc() to reuse most of the binary search work we do
398  * here.
399  *
400  * Do not call here when there are NULL values in scan key. NULL should be
401  * considered unequal to NULL when checking for duplicates, but we are not
402  * prepared to handle that correctly.
403  */
404 static TransactionId
405 _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
406  IndexUniqueCheck checkUnique, bool *is_unique,
407  uint32 *speculativeToken)
408 {
409  IndexTuple itup = insertstate->itup;
410  IndexTuple curitup = NULL;
411  ItemId curitemid;
412  BTScanInsert itup_key = insertstate->itup_key;
413  SnapshotData SnapshotDirty;
414  OffsetNumber offset;
415  OffsetNumber maxoff;
416  Page page;
417  BTPageOpaque opaque;
418  Buffer nbuf = InvalidBuffer;
419  bool found = false;
420  bool inposting = false;
421  bool prevalldead = true;
422  int curposti = 0;
423 
424  /* Assume unique until we find a duplicate */
425  *is_unique = true;
426 
427  InitDirtySnapshot(SnapshotDirty);
428 
429  page = BufferGetPage(insertstate->buf);
430  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
431  maxoff = PageGetMaxOffsetNumber(page);
432 
433  /*
434  * Find the first tuple with the same key.
435  *
436  * This also saves the binary search bounds in insertstate. We use them
437  * in the fastpath below, but also in the _bt_findinsertloc() call later.
438  */
439  Assert(!insertstate->bounds_valid);
440  offset = _bt_binsrch_insert(rel, insertstate);
441 
442  /*
443  * Scan over all equal tuples, looking for live conflicts.
444  */
445  Assert(!insertstate->bounds_valid || insertstate->low == offset);
446  Assert(!itup_key->anynullkeys);
447  Assert(itup_key->scantid == NULL);
448  for (;;)
449  {
450  /*
451  * Each iteration of the loop processes one heap TID, not one index
452  * tuple. Current offset number for page isn't usually advanced on
453  * iterations that process heap TIDs from posting list tuples.
454  *
455  * "inposting" state is set when _inside_ a posting list --- not when
456  * we're at the start (or end) of a posting list. We advance curposti
457  * at the end of the iteration when inside a posting list tuple. In
458  * general, every loop iteration either advances the page offset or
459  * advances curposti --- an iteration that handles the rightmost/max
460  * heap TID in a posting list finally advances the page offset (and
461  * unsets "inposting").
462  *
463  * Make sure the offset points to an actual index tuple before trying
464  * to examine it...
465  */
466  if (offset <= maxoff)
467  {
468  /*
469  * Fastpath: In most cases, we can use cached search bounds to
470  * limit our consideration to items that are definitely
471  * duplicates. This fastpath doesn't apply when the original page
472  * is empty, or when initial offset is past the end of the
473  * original page, which may indicate that we need to examine a
474  * second or subsequent page.
475  *
476  * Note that this optimization allows us to avoid calling
477  * _bt_compare() directly when there are no duplicates, as long as
478  * the offset where the key will go is not at the end of the page.
479  */
480  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
481  {
482  Assert(insertstate->bounds_valid);
483  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
484  Assert(insertstate->low <= insertstate->stricthigh);
485  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
486  break;
487  }
488 
489  /*
490  * We can skip items that are already marked killed.
491  *
492  * In the presence of heavy update activity an index may contain
493  * many killed items with the same key; running _bt_compare() on
494  * each killed item gets expensive. Just advance over killed
495  * items as quickly as we can. We only apply _bt_compare() when
496  * we get to a non-killed item. We could reuse the bounds to
497  * avoid _bt_compare() calls for known equal tuples, but it
498  * doesn't seem worth it.
499  */
500  if (!inposting)
501  curitemid = PageGetItemId(page, offset);
502  if (inposting || !ItemIdIsDead(curitemid))
503  {
504  ItemPointerData htid;
505  bool all_dead = false;
506 
507  if (!inposting)
508  {
509  /* Plain tuple, or first TID in posting list tuple */
510  if (_bt_compare(rel, itup_key, page, offset) != 0)
511  break; /* we're past all the equal tuples */
512 
513  /* Advance curitup */
514  curitup = (IndexTuple) PageGetItem(page, curitemid);
515  Assert(!BTreeTupleIsPivot(curitup));
516  }
517 
518  /* okay, we gotta fetch the heap tuple using htid ... */
519  if (!BTreeTupleIsPosting(curitup))
520  {
521  /* ... htid is from simple non-pivot tuple */
522  Assert(!inposting);
523  htid = curitup->t_tid;
524  }
525  else if (!inposting)
526  {
527  /* ... htid is first TID in new posting list */
528  inposting = true;
529  prevalldead = true;
530  curposti = 0;
531  htid = *BTreeTupleGetPostingN(curitup, 0);
532  }
533  else
534  {
535  /* ... htid is second or subsequent TID in posting list */
536  Assert(curposti > 0);
537  htid = *BTreeTupleGetPostingN(curitup, curposti);
538  }
539 
540  /*
541  * If we are doing a recheck, we expect to find the tuple we
542  * are rechecking. It's not a duplicate, but we have to keep
543  * scanning.
544  */
545  if (checkUnique == UNIQUE_CHECK_EXISTING &&
546  ItemPointerCompare(&htid, &itup->t_tid) == 0)
547  {
548  found = true;
549  }
550 
551  /*
552  * Check if there are any table tuples for this index entry
553  * satisfying SnapshotDirty. This is necessary because for AMs
554  * with optimizations like heap's HOT, we have just a single
555  * index entry for the entire chain.
556  */
557  else if (table_index_fetch_tuple_check(heapRel, &htid,
558  &SnapshotDirty,
559  &all_dead))
560  {
561  TransactionId xwait;
562 
563  /*
564  * It is a duplicate. If we are only doing a partial
565  * check, then don't bother checking if the tuple is being
566  * updated in another transaction. Just return the fact
567  * that it is a potential conflict and leave the full
568  * check till later. Don't invalidate binary search
569  * bounds.
570  */
571  if (checkUnique == UNIQUE_CHECK_PARTIAL)
572  {
573  if (nbuf != InvalidBuffer)
574  _bt_relbuf(rel, nbuf);
575  *is_unique = false;
576  return InvalidTransactionId;
577  }
578 
579  /*
580  * If this tuple is being updated by other transaction
581  * then we have to wait for its commit/abort.
582  */
583  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
584  SnapshotDirty.xmin : SnapshotDirty.xmax;
585 
586  if (TransactionIdIsValid(xwait))
587  {
588  if (nbuf != InvalidBuffer)
589  _bt_relbuf(rel, nbuf);
590  /* Tell _bt_doinsert to wait... */
591  *speculativeToken = SnapshotDirty.speculativeToken;
592  /* Caller releases lock on buf immediately */
593  insertstate->bounds_valid = false;
594  return xwait;
595  }
596 
597  /*
598  * Otherwise we have a definite conflict. But before
599  * complaining, look to see if the tuple we want to insert
600  * is itself now committed dead --- if so, don't complain.
601  * This is a waste of time in normal scenarios but we must
602  * do it to support CREATE INDEX CONCURRENTLY.
603  *
604  * We must follow HOT-chains here because during
605  * concurrent index build, we insert the root TID though
606  * the actual tuple may be somewhere in the HOT-chain.
607  * While following the chain we might not stop at the
608  * exact tuple which triggered the insert, but that's OK
609  * because if we find a live tuple anywhere in this chain,
610  * we have a unique key conflict. The other live tuple is
611  * not part of this chain because it had a different index
612  * entry.
613  */
614  htid = itup->t_tid;
615  if (table_index_fetch_tuple_check(heapRel, &htid,
616  SnapshotSelf, NULL))
617  {
618  /* Normal case --- it's still live */
619  }
620  else
621  {
622  /*
623  * It's been deleted, so no error, and no need to
624  * continue searching
625  */
626  break;
627  }
628 
629  /*
630  * Check for a conflict-in as we would if we were going to
631  * write to this page. We aren't actually going to write,
632  * but we want a chance to report SSI conflicts that would
633  * otherwise be masked by this unique constraint
634  * violation.
635  */
636  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
637 
638  /*
639  * This is a definite conflict. Break the tuple down into
640  * datums and report the error. But first, make sure we
641  * release the buffer locks we're holding ---
642  * BuildIndexValueDescription could make catalog accesses,
643  * which in the worst case might touch this same index and
644  * cause deadlocks.
645  */
646  if (nbuf != InvalidBuffer)
647  _bt_relbuf(rel, nbuf);
648  _bt_relbuf(rel, insertstate->buf);
649  insertstate->buf = InvalidBuffer;
650  insertstate->bounds_valid = false;
651 
652  {
653  Datum values[INDEX_MAX_KEYS];
654  bool isnull[INDEX_MAX_KEYS];
655  char *key_desc;
656 
657  index_deform_tuple(itup, RelationGetDescr(rel),
658  values, isnull);
659 
660  key_desc = BuildIndexValueDescription(rel, values,
661  isnull);
662 
663  ereport(ERROR,
664  (errcode(ERRCODE_UNIQUE_VIOLATION),
665  errmsg("duplicate key value violates unique constraint \"%s\"",
666  RelationGetRelationName(rel)),
667  key_desc ? errdetail("Key %s already exists.",
668  key_desc) : 0,
669  errtableconstraint(heapRel,
670  RelationGetRelationName(rel))));
671  }
672  }
673  else if (all_dead && (!inposting ||
674  (prevalldead &&
675  curposti == BTreeTupleGetNPosting(curitup) - 1)))
676  {
677  /*
678  * The conflicting tuple (or all HOT chains pointed to by
679  * all posting list TIDs) is dead to everyone, so mark the
680  * index entry killed.
681  */
682  ItemIdMarkDead(curitemid);
683  opaque->btpo_flags |= BTP_HAS_GARBAGE;
684 
685  /*
686  * Mark buffer with a dirty hint, since state is not
687  * crucial. Be sure to mark the proper buffer dirty.
688  */
689  if (nbuf != InvalidBuffer)
690  MarkBufferDirtyHint(nbuf, true);
691  else
692  MarkBufferDirtyHint(insertstate->buf, true);
693  }
694 
695  /*
696  * Remember if posting list tuple has even a single HOT chain
697  * whose members are not all dead
698  */
699  if (!all_dead && inposting)
700  prevalldead = false;
701  }
702  }
703 
704  if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
705  {
706  /* Advance to next TID in same posting list */
707  curposti++;
708  continue;
709  }
710  else if (offset < maxoff)
711  {
712  /* Advance to next tuple */
713  curposti = 0;
714  inposting = false;
715  offset = OffsetNumberNext(offset);
716  }
717  else
718  {
719  int highkeycmp;
720 
721  /* If scankey == hikey we gotta check the next page too */
722  if (P_RIGHTMOST(opaque))
723  break;
724  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
725  Assert(highkeycmp <= 0);
726  if (highkeycmp != 0)
727  break;
728  /* Advance to next non-dead page --- there must be one */
729  for (;;)
730  {
731  BlockNumber nblkno = opaque->btpo_next;
732 
733  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
734  page = BufferGetPage(nbuf);
735  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
736  if (!P_IGNORE(opaque))
737  break;
738  if (P_RIGHTMOST(opaque))
739  elog(ERROR, "fell off the end of index \"%s\"",
740  RelationGetRelationName(rel));
741  }
742  /* Will also advance to next tuple */
743  curposti = 0;
744  inposting = false;
745  maxoff = PageGetMaxOffsetNumber(page);
746  offset = P_FIRSTDATAKEY(opaque);
747  /* Don't invalidate binary search bounds */
748  }
749  }
750 
751  /*
752  * If we are doing a recheck then we should have found the tuple we are
753  * checking. Otherwise there's something very wrong --- probably, the
754  * index is on a non-immutable expression.
755  */
756  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
757  ereport(ERROR,
758  (errcode(ERRCODE_INTERNAL_ERROR),
759  errmsg("failed to re-find tuple within index \"%s\"",
760  RelationGetRelationName(rel)),
761  errhint("This may be because of a non-immutable index expression."),
762  errtableconstraint(heapRel,
763  RelationGetRelationName(rel))));
764 
765  if (nbuf != InvalidBuffer)
766  _bt_relbuf(rel, nbuf);
767 
768  return InvalidTransactionId;
769 }
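/*
 * A minimal sketch (not part of nbtinsert.c itself) of how _bt_check_unique()
 * picks the transaction to wait for from a dirty snapshot.  The helper name
 * example_conflict_xwait is hypothetical; the rule matches the xwait
 * assignment above (an in-progress inserter is reported in xmin, an
 * in-progress updater/deleter in xmax).
 */
static inline TransactionId
example_conflict_xwait(SnapshotData *snapshot)
{
	return TransactionIdIsValid(snapshot->xmin) ?
		snapshot->xmin : snapshot->xmax;
}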
770 
771 
772 /*
773  * _bt_findinsertloc() -- Finds an insert location for a tuple
774  *
775  * On entry, insertstate buffer contains the page the new tuple belongs
776  * on. It is exclusive-locked and pinned by the caller.
777  *
778  * If 'checkingunique' is true, the buffer on entry is the first page
779  * that contains duplicates of the new key. If there are duplicates on
780  * multiple pages, the correct insertion position might be some page to
781  * the right, rather than the first page. In that case, this function
782  * moves right to the correct target page.
783  *
784  * (In a !heapkeyspace index, there can be multiple pages with the same
785  * high key, on which the new tuple could legitimately be placed. In
786  * that case, the caller passes the first page containing duplicates,
787  * just like when checkingunique=true. If that page doesn't have enough
788  * room for the new tuple, this function moves right, trying to find a
789  * legal page that does.)
790  *
791  * If 'indexUnchanged' is true, this is for an UPDATE that didn't
792  * logically change the indexed value, but must nevertheless have a new
793  * entry to point to a successor version. This hint from the executor
794  * will influence our behavior when the page might have to be split and
795  * we must consider our options. Bottom-up index deletion can avoid
796  * pathological version-driven page splits, but we only want to go to the
797  * trouble of trying it when we already have moderate confidence that
798  * it's appropriate. The hint should not significantly affect our
799  * behavior over time unless practically all inserts on to the leaf page
800  * get the hint.
801  *
802  * On exit, insertstate buffer contains the chosen insertion page, and
803  * the offset within that page is returned. If _bt_findinsertloc needed
804  * to move right, the lock and pin on the original page are released, and
805  * the new buffer is exclusively locked and pinned instead.
806  *
807  * If insertstate contains cached binary search bounds, we will take
808  * advantage of them. This avoids repeating comparisons that we made in
809  * _bt_check_unique() already.
810  *
811  * If there is not enough room on the page for the new tuple, we try to
812  * make room by removing any LP_DEAD tuples.
813  */
814 static OffsetNumber
815 _bt_findinsertloc(Relation rel,
816  BTInsertState insertstate,
817  bool checkingunique,
818  bool indexUnchanged,
819  BTStack stack,
820  Relation heapRel)
821 {
822  BTScanInsert itup_key = insertstate->itup_key;
823  Page page = BufferGetPage(insertstate->buf);
824  BTPageOpaque opaque;
825  OffsetNumber newitemoff;
826 
827  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
828 
829  /* Check 1/3 of a page restriction */
830  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
831  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
832  insertstate->itup);
833 
834  Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
835  Assert(!insertstate->bounds_valid || checkingunique);
836  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
837  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
838  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
839 
840  if (itup_key->heapkeyspace)
841  {
842  /* Keep track of whether checkingunique duplicate seen */
843  bool uniquedup = indexUnchanged;
844 
845  /*
846  * If we're inserting into a unique index, we may have to walk right
847  * through leaf pages to find the one leaf page that we must insert on
848  * to.
849  *
850  * This is needed for checkingunique callers because a scantid was not
851  * used when we called _bt_search(). scantid can only be set after
852  * _bt_check_unique() has checked for duplicates. The buffer
853  * initially stored in insertstate->buf has the page where the first
854  * duplicate key might be found, which isn't always the page that new
855  * tuple belongs on. The heap TID attribute for new tuple (scantid)
856  * could force us to insert on a sibling page, though that should be
857  * very rare in practice.
858  */
859  if (checkingunique)
860  {
861  if (insertstate->low < insertstate->stricthigh)
862  {
863  /* Encountered a duplicate in _bt_check_unique() */
864  Assert(insertstate->bounds_valid);
865  uniquedup = true;
866  }
867 
868  for (;;)
869  {
870  /*
871  * Does the new tuple belong on this page?
872  *
873  * The earlier _bt_check_unique() call may well have
874  * established a strict upper bound on the offset for the new
875  * item. If it's not the last item of the page (i.e. if there
876  * is at least one tuple on the page that goes after the tuple
877  * we're inserting) then we know that the tuple belongs on
878  * this page. We can skip the high key check.
879  */
880  if (insertstate->bounds_valid &&
881  insertstate->low <= insertstate->stricthigh &&
882  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
883  break;
884 
885  /* Test '<=', not '!=', since scantid is set now */
886  if (P_RIGHTMOST(opaque) ||
887  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
888  break;
889 
890  _bt_stepright(rel, insertstate, stack);
891  /* Update local state after stepping right */
892  page = BufferGetPage(insertstate->buf);
893  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
894  /* Assume duplicates (if checkingunique) */
895  uniquedup = true;
896  }
897  }
898 
899  /*
900  * If the target page cannot fit newitem, try to avoid splitting the
901  * page on insert by performing deletion or deduplication now
902  */
903  if (PageGetFreeSpace(page) < insertstate->itemsz)
904  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
905  checkingunique, uniquedup,
906  indexUnchanged);
907  }
908  else
909  {
910  /*----------
911  * This is a !heapkeyspace (version 2 or 3) index. The current page
912  * is the first page that we could insert the new tuple to, but there
913  * may be other pages to the right that we could opt to use instead.
914  *
915  * If the new key is equal to one or more existing keys, we can
916  * legitimately place it anywhere in the series of equal keys. In
917  * fact, if the new key is equal to the page's "high key" we can place
918  * it on the next page. If it is equal to the high key, and there's
919  * not room to insert the new tuple on the current page without
920  * splitting, then we move right hoping to find more free space and
921  * avoid a split.
922  *
923  * Keep scanning right until we
924  * (a) find a page with enough free space,
925  * (b) reach the last page where the tuple can legally go, or
926  * (c) get tired of searching.
927  * (c) is not flippant; it is important because if there are many
928  * pages' worth of equal keys, it's better to split one of the early
929  * pages than to scan all the way to the end of the run of equal keys
930  * on every insert. We implement "get tired" as a random choice,
931  * since stopping after scanning a fixed number of pages wouldn't work
932  * well (we'd never reach the right-hand side of previously split
933  * pages). The probability of moving right is set at 0.99, which may
934  * seem too high to change the behavior much, but it does an excellent
935  * job of preventing O(N^2) behavior with many equal keys.
936  *----------
937  */
938  while (PageGetFreeSpace(page) < insertstate->itemsz)
939  {
940  /*
941  * Before considering moving right, see if we can obtain enough
942  * space by erasing LP_DEAD items
943  */
944  if (P_HAS_GARBAGE(opaque))
945  {
946  /* Perform simple deletion */
947  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
948  false, false, false);
949 
950  if (PageGetFreeSpace(page) >= insertstate->itemsz)
951  break; /* OK, now we have enough space */
952  }
953 
954  /*
955  * Nope, so check conditions (b) and (c) enumerated above
956  *
957  * The earlier _bt_check_unique() call may well have established a
958  * strict upper bound on the offset for the new item. If it's not
959  * the last item of the page (i.e. if there is at least one tuple
960  * on the page that's greater than the tuple we're inserting to)
961  * then we know that the tuple belongs on this page. We can skip
962  * the high key check.
963  */
964  if (insertstate->bounds_valid &&
965  insertstate->low <= insertstate->stricthigh &&
966  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
967  break;
968 
969  if (P_RIGHTMOST(opaque) ||
970  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
971  random() <= (MAX_RANDOM_VALUE / 100))
972  break;
973 
974  _bt_stepright(rel, insertstate, stack);
975  /* Update local state after stepping right */
976  page = BufferGetPage(insertstate->buf);
977  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
978  }
979  }
980 
981  /*
982  * We should now be on the correct page. Find the offset within the page
983  * for the new tuple. (Possibly reusing earlier search bounds.)
984  */
985  Assert(P_RIGHTMOST(opaque) ||
986  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
987 
988  newitemoff = _bt_binsrch_insert(rel, insertstate);
989 
990  if (insertstate->postingoff == -1)
991  {
992  /*
993  * There is an overlapping posting list tuple with its LP_DEAD bit
994  * set. We don't want to unnecessarily unset its LP_DEAD bit while
995  * performing a posting list split, so perform simple index tuple
996  * deletion early.
997  */
998  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
999  false, false, false);
1000 
1001  /*
1002  * Do new binary search. New insert location cannot overlap with any
1003  * posting list now.
1004  */
1005  Assert(!insertstate->bounds_valid);
1006  insertstate->postingoff = 0;
1007  newitemoff = _bt_binsrch_insert(rel, insertstate);
1008  Assert(insertstate->postingoff == 0);
1009  }
1010 
1011  return newitemoff;
1012 }
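/*
 * A minimal sketch (not part of nbtinsert.c itself) of the "get tired"
 * stopping rule used for !heapkeyspace indexes above.  The helper name
 * example_get_tired is hypothetical; stopping roughly 1% of the time means
 * we keep moving right with probability ~0.99, which prevents O(N^2)
 * behavior with long runs of equal keys.
 */
static inline bool
example_get_tired(void)
{
	/* true about 1% of the time, matching the random() test above */
	return random() <= (MAX_RANDOM_VALUE / 100);
}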
1013 
1014 /*
1015  * Step right to next non-dead page, during insertion.
1016  *
1017  * This is a bit more complicated than moving right in a search. We must
1018  * write-lock the target page before releasing write lock on current page;
1019  * else someone else's _bt_check_unique scan could fail to see our insertion.
1020  * Write locks on intermediate dead pages won't do because we don't know when
1021  * they will get de-linked from the tree.
1022  *
1023  * This is more aggressive than it needs to be for non-unique !heapkeyspace
1024  * indexes.
1025  */
1026 static void
1027 _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
1028 {
1029  Page page;
1030  BTPageOpaque opaque;
1031  Buffer rbuf;
1032  BlockNumber rblkno;
1033 
1034  page = BufferGetPage(insertstate->buf);
1035  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1036 
1037  rbuf = InvalidBuffer;
1038  rblkno = opaque->btpo_next;
1039  for (;;)
1040  {
1041  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1042  page = BufferGetPage(rbuf);
1043  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1044 
1045  /*
1046  * If this page was incompletely split, finish the split now. We do
1047  * this while holding a lock on the left sibling, which is not good
1048  * because finishing the split could be a fairly lengthy operation.
1049  * But this should happen very seldom.
1050  */
1051  if (P_INCOMPLETE_SPLIT(opaque))
1052  {
1053  _bt_finish_split(rel, rbuf, stack);
1054  rbuf = InvalidBuffer;
1055  continue;
1056  }
1057 
1058  if (!P_IGNORE(opaque))
1059  break;
1060  if (P_RIGHTMOST(opaque))
1061  elog(ERROR, "fell off the end of index \"%s\"",
1062  RelationGetRelationName(rel));
1063 
1064  rblkno = opaque->btpo_next;
1065  }
1066  /* rbuf locked; unlock buf, update state for caller */
1067  _bt_relbuf(rel, insertstate->buf);
1068  insertstate->buf = rbuf;
1069  insertstate->bounds_valid = false;
1070 }
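/*
 * A minimal sketch (not part of nbtinsert.c itself) of the lock-coupling
 * order that _bt_stepright() relies on.  The helper name
 * example_lock_coupled_step is hypothetical.  The right sibling must be
 * write-locked before the current page's lock is released, so that a
 * concurrent _bt_check_unique() scan cannot miss our insertion.
 */
static void
example_lock_coupled_step(Relation rel, Buffer *buf, BlockNumber rblkno)
{
	Buffer		rbuf;

	/* acquire the right sibling first ... */
	rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);

	/* ... and only then give up the page we came from */
	_bt_relbuf(rel, *buf);
	*buf = rbuf;
}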
1071 
1072 /*----------
1073  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1074  *
1075  * This recursive procedure does the following things:
1076  *
1077  * + if postingoff != 0, splits existing posting list tuple
1078  * (since it overlaps with new 'itup' tuple).
1079  * + if necessary, splits the target page, using 'itup_key' for
1080  * suffix truncation on leaf pages (caller passes NULL for
1081  * non-leaf pages).
1082  * + inserts the new tuple (might be split from posting list).
1083  * + if the page was split, pops the parent stack, and finds the
1084  * right place to insert the new child pointer (by walking
1085  * right using information stored in the parent stack).
1086  * + invokes itself with the appropriate tuple for the right
1087  * child page on the parent.
1088  * + updates the metapage if a true root or fast root is split.
1089  *
1090  * On entry, we must have the correct buffer in which to do the
1091  * insertion, and the buffer must be pinned and write-locked. On return,
1092  * we will have dropped both the pin and the lock on the buffer.
1093  *
1094  * This routine only performs retail tuple insertions. 'itup' should
1095  * always be either a non-highkey leaf item, or a downlink (new high
1096  * key items are created indirectly, when a page is split). When
1097  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1098  * we're inserting the downlink for. This function will clear the
1099  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1100  *----------
1101  */
1102 static void
1103 _bt_insertonpg(Relation rel,
1104  BTScanInsert itup_key,
1105  Buffer buf,
1106  Buffer cbuf,
1107  BTStack stack,
1108  IndexTuple itup,
1109  Size itemsz,
1110  OffsetNumber newitemoff,
1111  int postingoff,
1112  bool split_only_page)
1113 {
1114  Page page;
1115  BTPageOpaque opaque;
1116  bool isleaf,
1117  isroot,
1118  isrightmost,
1119  isonly;
1120  IndexTuple oposting = NULL;
1121  IndexTuple origitup = NULL;
1122  IndexTuple nposting = NULL;
1123 
1124  page = BufferGetPage(buf);
1125  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1126  isleaf = P_ISLEAF(opaque);
1127  isroot = P_ISROOT(opaque);
1128  isrightmost = P_RIGHTMOST(opaque);
1129  isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1130 
1131  /* child buffer must be given iff inserting on an internal page */
1132  Assert(isleaf == !BufferIsValid(cbuf));
1133  /* tuple must have appropriate number of attributes */
1134  Assert(!isleaf ||
1135  BTreeTupleGetNAtts(itup, rel) ==
1136  IndexRelationGetNumberOfKeyAttributes(rel));
1137  Assert(isleaf ||
1138  BTreeTupleGetNAtts(itup, rel) <=
1139  IndexRelationGetNumberOfKeyAttributes(rel));
1140  Assert(!BTreeTupleIsPosting(itup));
1141  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1142  /* Caller must always finish incomplete split for us */
1143  Assert(!P_INCOMPLETE_SPLIT(opaque));
1144 
1145  /*
1146  * Every internal page should have exactly one negative infinity item at
1147  * all times. Only _bt_split() and _bt_newroot() should add items that
1148  * become negative infinity items through truncation, since they're the
1149  * only routines that allocate new internal pages.
1150  */
1151  Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1152 
1153  /*
1154  * Do we need to split an existing posting list item?
1155  */
1156  if (postingoff != 0)
1157  {
1158  ItemId itemid = PageGetItemId(page, newitemoff);
1159 
1160  /*
1161  * The new tuple is a duplicate with a heap TID that falls inside the
1162  * range of an existing posting list tuple on a leaf page. Prepare to
1163  * split an existing posting list. Overwriting the posting list with
1164  * its post-split version is treated as an extra step in either the
1165  * insert or page split critical section.
1166  */
1167  Assert(isleaf && !ItemIdIsDead(itemid));
1168  Assert(itup_key->heapkeyspace && itup_key->allequalimage);
1169  oposting = (IndexTuple) PageGetItem(page, itemid);
1170 
1171  /* use a mutable copy of itup as our itup from here on */
1172  origitup = itup;
1173  itup = CopyIndexTuple(origitup);
1174  nposting = _bt_swap_posting(itup, oposting, postingoff);
1175  /* itup now contains rightmost/max TID from oposting */
1176 
1177  /* Alter offset so that newitem goes after posting list */
1178  newitemoff = OffsetNumberNext(newitemoff);
1179  }
1180 
1181  /*
1182  * Do we need to split the page to fit the item on it?
1183  *
1184  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1185  * so this comparison is correct even though we appear to be accounting
1186  * only for the item and not for its line pointer.
1187  */
1188  if (PageGetFreeSpace(page) < itemsz)
1189  {
1190  Buffer rbuf;
1191 
1192  Assert(!split_only_page);
1193 
1194  /* split the buffer into left and right halves */
1195  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
1196  origitup, nposting, postingoff);
1197  PredicateLockPageSplit(rel,
1198  BufferGetBlockNumber(buf),
1199  BufferGetBlockNumber(rbuf));
1200 
1201  /*----------
1202  * By here,
1203  *
1204  * + our target page has been split;
1205  * + the original tuple has been inserted;
1206  * + we have write locks on both the old (left half)
1207  * and new (right half) buffers, after the split; and
1208  * + we know the key we want to insert into the parent
1209  * (it's the "high key" on the left child page).
1210  *
1211  * We're ready to do the parent insertion. We need to hold onto the
1212  * locks for the child pages until we locate the parent, but we can
1213  * at least release the lock on the right child before doing the
1214  * actual insertion. The lock on the left child will be released
1215  * last of all by parent insertion, where it is the 'cbuf' of parent
1216  * page.
1217  *----------
1218  */
1219  _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
1220  }
1221  else
1222  {
1223  Buffer metabuf = InvalidBuffer;
1224  Page metapg = NULL;
1225  BTMetaPageData *metad = NULL;
1226  BlockNumber blockcache;
1227 
1228  /*
1229  * If we are doing this insert because we split a page that was the
1230  * only one on its tree level, but was not the root, it may have been
1231  * the "fast root". We need to ensure that the fast root link points
1232  * at or above the current page. We can safely acquire a lock on the
1233  * metapage here --- see comments for _bt_newroot().
1234  */
1235  if (unlikely(split_only_page))
1236  {
1237  Assert(!isleaf);
1238  Assert(BufferIsValid(cbuf));
1239 
1240  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1241  metapg = BufferGetPage(metabuf);
1242  metad = BTPageGetMeta(metapg);
1243 
1244  if (metad->btm_fastlevel >= opaque->btpo_level)
1245  {
1246  /* no update wanted */
1247  _bt_relbuf(rel, metabuf);
1248  metabuf = InvalidBuffer;
1249  }
1250  }
1251 
1252  /* Do the update. No ereport(ERROR) until changes are logged */
1253  START_CRIT_SECTION();
1254 
1255  if (postingoff != 0)
1256  memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1257 
1258  if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1259  false) == InvalidOffsetNumber)
1260  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1261  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1262 
1263  MarkBufferDirty(buf);
1264 
1265  if (BufferIsValid(metabuf))
1266  {
1267  /* upgrade meta-page if needed */
1268  if (metad->btm_version < BTREE_NOVAC_VERSION)
1269  _bt_upgrademetapage(metapg);
1270  metad->btm_fastroot = BufferGetBlockNumber(buf);
1271  metad->btm_fastlevel = opaque->btpo_level;
1272  MarkBufferDirty(metabuf);
1273  }
1274 
1275  /*
1276  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1277  * finishes a split
1278  */
1279  if (!isleaf)
1280  {
1281  Page cpage = BufferGetPage(cbuf);
1282  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1283 
1284  Assert(P_INCOMPLETE_SPLIT(cpageop));
1285  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1286  MarkBufferDirty(cbuf);
1287  }
1288 
1289  /* XLOG stuff */
1290  if (RelationNeedsWAL(rel))
1291  {
1292  xl_btree_insert xlrec;
1293  xl_btree_metadata xlmeta;
1294  uint8 xlinfo;
1295  XLogRecPtr recptr;
1296  uint16 upostingoff;
1297 
1298  xlrec.offnum = newitemoff;
1299 
1300  XLogBeginInsert();
1301  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1302 
1303  if (isleaf && postingoff == 0)
1304  {
1305  /* Simple leaf insert */
1306  xlinfo = XLOG_BTREE_INSERT_LEAF;
1307  }
1308  else if (postingoff != 0)
1309  {
1310  /*
1311  * Leaf insert with posting list split. Must include
1312  * postingoff field before newitem/orignewitem.
1313  */
1314  Assert(isleaf);
1315  xlinfo = XLOG_BTREE_INSERT_POST;
1316  }
1317  else
1318  {
1319  /* Internal page insert, which finishes a split on cbuf */
1320  xlinfo = XLOG_BTREE_INSERT_UPPER;
1321  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1322 
1323  if (BufferIsValid(metabuf))
1324  {
1325  /* Actually, it's an internal page insert + meta update */
1326  xlinfo = XLOG_BTREE_INSERT_META;
1327 
1328  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1329  xlmeta.version = metad->btm_version;
1330  xlmeta.root = metad->btm_root;
1331  xlmeta.level = metad->btm_level;
1332  xlmeta.fastroot = metad->btm_fastroot;
1333  xlmeta.fastlevel = metad->btm_fastlevel;
1334  xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1335  xlmeta.last_cleanup_num_heap_tuples =
1336  metad->btm_last_cleanup_num_heap_tuples;
1337  xlmeta.allequalimage = metad->btm_allequalimage;
1338 
1339  XLogRegisterBuffer(2, metabuf,
1340  REGBUF_WILL_INIT | REGBUF_STANDARD);
1341  XLogRegisterBufData(2, (char *) &xlmeta,
1342  sizeof(xl_btree_metadata));
1343  }
1344  }
1345 
1346  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1347  if (postingoff == 0)
1348  {
1349  /* Just log itup from caller */
1350  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1351  }
1352  else
1353  {
1354  /*
1355  * Insert with posting list split (XLOG_BTREE_INSERT_POST
1356  * record) case.
1357  *
1358  * Log postingoff. Also log origitup, not itup. REDO routine
1359  * must reconstruct final itup (as well as nposting) using
1360  * _bt_swap_posting().
1361  */
1362  upostingoff = postingoff;
1363 
1364  XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1365  XLogRegisterBufData(0, (char *) origitup,
1366  IndexTupleSize(origitup));
1367  }
1368 
1369  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1370 
1371  if (BufferIsValid(metabuf))
1372  PageSetLSN(metapg, recptr);
1373  if (!isleaf)
1374  PageSetLSN(BufferGetPage(cbuf), recptr);
1375 
1376  PageSetLSN(page, recptr);
1377  }
1378 
1379  END_CRIT_SECTION();
1380 
1381  /* Release subsidiary buffers */
1382  if (BufferIsValid(metabuf))
1383  _bt_relbuf(rel, metabuf);
1384  if (!isleaf)
1385  _bt_relbuf(rel, cbuf);
1386 
1387  /*
1388  * Cache the block number if this is the rightmost leaf page. Cache
1389  * may be used by a future inserter within _bt_search_insert().
1390  */
1391  blockcache = InvalidBlockNumber;
1392  if (isrightmost && isleaf && !isroot)
1393  blockcache = BufferGetBlockNumber(buf);
1394 
1395  /* Release buffer for insertion target block */
1396  _bt_relbuf(rel, buf);
1397 
1398  /*
1399  * If we decided to cache the insertion target block before releasing
1400  * its buffer lock, then cache it now. Check the height of the tree
1401  * first, though. We don't go for the optimization with small
1402  * indexes. Defer final check to this point to ensure that we don't
1403  * call _bt_getrootheight while holding a buffer lock.
1404  */
1405  if (BlockNumberIsValid(blockcache) &&
1406  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1407  RelationSetTargetBlock(rel, blockcache);
1408  }
1409 
1410  /* be tidy */
1411  if (postingoff != 0)
1412  {
1413  /* itup is actually a modified copy of caller's original */
1414  pfree(nposting);
1415  pfree(itup);
1416  }
1417 }
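/*
 * A minimal sketch (not part of nbtinsert.c itself) of the WAL record info
 * bits chosen inside _bt_insertonpg() above.  The helper name
 * example_insert_xlinfo is hypothetical; the mapping mirrors the xlinfo
 * assignments in the XLOG block of that routine.
 */
static inline uint8
example_insert_xlinfo(bool isleaf, int postingoff, bool metaupdate)
{
	if (isleaf && postingoff == 0)
		return XLOG_BTREE_INSERT_LEAF;	/* simple leaf insert */
	if (postingoff != 0)
		return XLOG_BTREE_INSERT_POST;	/* leaf insert with posting split */
	/* internal page insert finishing a split, possibly with meta update */
	return metaupdate ? XLOG_BTREE_INSERT_META : XLOG_BTREE_INSERT_UPPER;
}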
1418 
1419 /*
1420  * _bt_split() -- split a page in the btree.
1421  *
1422  * On entry, buf is the page to split, and is pinned and write-locked.
1423  * newitemoff etc. tell us about the new item that must be inserted
1424  * along with the data from the original page.
1425  *
1426  * itup_key is used for suffix truncation on leaf pages (internal
1427  * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1428  * is the left-sibling of the page we're inserting the downlink for.
1429  * This function will clear the INCOMPLETE_SPLIT flag on it, and
1430  * release the buffer.
1431  *
1432  * orignewitem, nposting, and postingoff are needed when an insert of
1433  * orignewitem results in both a posting list split and a page split.
1434  * These extra posting list split details are used here in the same
1435  * way as they are used in the more common case where a posting list
1436  * split does not coincide with a page split. We need to deal with
1437  * posting list splits directly in order to ensure that everything
1438  * that follows from the insert of orignewitem is handled as a single
1439  * atomic operation (though caller's insert of a new pivot/downlink
1440  * into parent page will still be a separate operation). See
1441  * nbtree/README for details on the design of posting list splits.
1442  *
1443  * Returns the new right sibling of buf, pinned and write-locked.
1444  * The pin and lock on buf are maintained.
1445  */
1446 static Buffer
1447 _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
1448  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1449  IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1450 {
1451  Buffer rbuf;
1452  Page origpage;
1453  Page leftpage,
1454  rightpage;
1455  BlockNumber origpagenumber,
1456  rightpagenumber;
1457  BTPageOpaque ropaque,
1458  lopaque,
1459  oopaque;
1460  Buffer sbuf = InvalidBuffer;
1461  Page spage = NULL;
1462  BTPageOpaque sopaque = NULL;
1463  Size itemsz;
1464  ItemId itemid;
1465  IndexTuple firstright,
1466  lefthighkey;
1467  OffsetNumber firstrightoff;
1468  OffsetNumber afterleftoff,
1469  afterrightoff,
1470  minusinfoff;
1471  OffsetNumber origpagepostingoff;
1472  OffsetNumber maxoff;
1473  OffsetNumber i;
1474  bool newitemonleft,
1475  isleaf,
1476  isrightmost;
1477 
1478  /*
1479  * origpage is the original page to be split. leftpage is a temporary
1480  * buffer that receives the left-sibling data, which will be copied back
1481  * into origpage on success. rightpage is the new page that will receive
1482  * the right-sibling data.
1483  *
1484  * leftpage is allocated after choosing a split point. rightpage's new
1485  * buffer isn't acquired until after leftpage is initialized and has new
1486  * high key, the last point where splitting the page may fail (barring
1487  * corruption). Failing before acquiring new buffer won't have lasting
1488  * consequences, since origpage won't have been modified and leftpage is
1489  * only workspace.
1490  */
1491  origpage = BufferGetPage(buf);
1492  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1493  isleaf = P_ISLEAF(oopaque);
1494  isrightmost = P_RIGHTMOST(oopaque);
1495  maxoff = PageGetMaxOffsetNumber(origpage);
1496  origpagenumber = BufferGetBlockNumber(buf);
1497 
1498  /*
1499  * Choose a point to split origpage at.
1500  *
1501  * A split point can be thought of as a point _between_ two existing data
1502  * items on origpage (the lastleft and firstright tuples), provided you
1503  * pretend that the new item that didn't fit is already on origpage.
1504  *
1505  * Since origpage does not actually contain newitem, the representation of
1506  * split points needs to work with two boundary cases: splits where
1507  * newitem is lastleft, and splits where newitem is firstright.
1508  * newitemonleft resolves the ambiguity that would otherwise exist when
1509  * newitemoff == firstrightoff. In all other cases it's clear which side
1510  * of the split every tuple goes on from context. newitemonleft is
1511  * usually (but not always) redundant information.
1512  *
1513  * firstrightoff is supposed to be an origpage offset number, but it's
1514  * possible that its value will be maxoff+1, which is "past the end" of
1515  * origpage. This happens in the rare case where newitem goes after all
1516  * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1517  * origpage at the point that leaves newitem alone on new right page. Any
1518  * "!newitemonleft && newitemoff == firstrightoff" split point makes
1519  * newitem the firstright tuple, though, so this case isn't a special
1520  * case.
1521  */
1522  firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1523  newitem, &newitemonleft);
1524 
1525  /* Allocate temp buffer for leftpage */
1526  leftpage = PageGetTempPage(origpage);
1527  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1528  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1529 
1530  /*
1531  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1532  * and HAS_GARBAGE flags.
1533  */
1534  lopaque->btpo_flags = oopaque->btpo_flags;
1535  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1536  /* set flag in leftpage indicating that rightpage has no downlink yet */
1537  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1538  lopaque->btpo_prev = oopaque->btpo_prev;
1539  /* handle btpo_next after rightpage buffer acquired */
1540  lopaque->btpo_level = oopaque->btpo_level;
1541  /* handle btpo_cycleid after rightpage buffer acquired */
1542 
1543  /*
1544  * Copy the original page's LSN into leftpage, which will become the
1545  * updated version of the page. We need this because XLogInsert will
1546  * examine the LSN and possibly dump it in a page image.
1547  */
1548  PageSetLSN(leftpage, PageGetLSN(origpage));
1549 
1550  /*
1551  * Determine page offset number of existing overlapped-with-orignewitem
1552  * posting list when it is necessary to perform a posting list split in
1553  * passing. Note that newitem was already changed by caller (newitem no
1554  * longer has the orignewitem TID).
1555  *
1556  * This page offset number (origpagepostingoff) will be used to pretend
1557  * that the posting split has already taken place, even though the
1558  * required modifications to origpage won't occur until we reach the
1559  * critical section. The lastleft and firstright tuples of our page split
1560  * point should, in effect, come from an imaginary version of origpage
1561  * that has the nposting tuple instead of the original posting list tuple.
1562  *
1563  * Note: _bt_findsplitloc() should have compensated for coinciding posting
1564  * list splits in just the same way, at least in theory. It doesn't
1565  * bother with that, though. In practice it won't affect its choice of
1566  * split point.
1567  */
1568  origpagepostingoff = InvalidOffsetNumber;
1569  if (postingoff != 0)
1570  {
1571  Assert(isleaf);
1572  Assert(ItemPointerCompare(&orignewitem->t_tid,
1573  &newitem->t_tid) < 0);
1574  Assert(BTreeTupleIsPosting(nposting));
1575  origpagepostingoff = OffsetNumberPrev(newitemoff);
1576  }
1577 
1578  /*
1579  * The high key for the new left page is a possibly-truncated copy of
1580  * firstright on the leaf level (it's "firstright itself" on internal
1581  * pages; see !isleaf comments below). This may seem to be contrary to
1582  * Lehman & Yao's approach of using a copy of lastleft as the new high key
1583  * when splitting on the leaf level. It isn't, though.
1584  *
1585  * Suffix truncation will leave the left page's high key fully equal to
1586  * lastleft when lastleft and firstright are equal prior to heap TID (that
1587  * is, the tiebreaker TID value comes from lastleft). It isn't actually
1588  * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1589  * "subtree" invariant to hold. It's sufficient to make sure that the new
1590  * leaf high key is strictly less than firstright, and greater than or
1591  * equal to (not necessarily equal to) lastleft. In other words, when
1592  * suffix truncation isn't possible during a leaf page split, we take
1593  * L&Y's exact approach to generating a new high key for the left page.
1594  * (Actually, that is slightly inaccurate. We don't just use a copy of
1595  * lastleft. A tuple with all the keys from firstright but the max heap
1596  * TID from lastleft is used, to avoid introducing a special case.)
1597  */
1598  if (!newitemonleft && newitemoff == firstrightoff)
1599  {
1600  /* incoming tuple becomes firstright */
1601  itemsz = newitemsz;
1602  firstright = newitem;
1603  }
1604  else
1605  {
1606  /* existing item at firstrightoff becomes firstright */
1607  itemid = PageGetItemId(origpage, firstrightoff);
1608  itemsz = ItemIdGetLength(itemid);
1609  firstright = (IndexTuple) PageGetItem(origpage, itemid);
1610  if (firstrightoff == origpagepostingoff)
1611  firstright = nposting;
1612  }
1613 
1614  if (isleaf)
1615  {
1616  IndexTuple lastleft;
1617 
1618  /* Attempt suffix truncation for leaf page splits */
1619  if (newitemonleft && newitemoff == firstrightoff)
1620  {
1621  /* incoming tuple becomes lastleft */
1622  lastleft = newitem;
1623  }
1624  else
1625  {
1626  OffsetNumber lastleftoff;
1627 
1628  /* existing item before firstrightoff becomes lastleft */
1629  lastleftoff = OffsetNumberPrev(firstrightoff);
1630  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1631  itemid = PageGetItemId(origpage, lastleftoff);
1632  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1633  if (lastleftoff == origpagepostingoff)
1634  lastleft = nposting;
1635  }
1636 
1637  lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1638  itemsz = IndexTupleSize(lefthighkey);
1639  }
1640  else
1641  {
1642  /*
1643  * Don't perform suffix truncation on a copy of firstright to make
1644  * left page high key for internal page splits. Must use firstright
1645  * as new high key directly.
1646  *
1647  * Each distinct separator key value originates as a leaf level high
1648  * key; all other separator keys/pivot tuples are copied from one
1649  * level down. A separator key in a grandparent page must be
1650  * identical to high key in rightmost parent page of the subtree to
1651  * its left, which must itself be identical to high key in rightmost
1652  * child page of that same subtree (this even applies to separator
1653  * from grandparent's high key). There must always be an unbroken
1654  * "seam" of identical separator keys that guide index scans at every
1655  * level, starting from the grandparent. That's why suffix truncation
1656  * is unsafe here.
1657  *
1658  * Internal page splits will truncate firstright into a "negative
1659  * infinity" data item when it gets inserted on the new right page
1660  * below, though. This happens during the call to _bt_pgaddtup() for
1661  * the new first data item for right page. Do not confuse this
1662  * mechanism with suffix truncation. It is just a convenient way of
1663  * implementing page splits that split the internal page "inside"
1664  * firstright. The lefthighkey separator key cannot appear a second
1665  * time in the right page (only firstright's downlink goes in right
1666  * page).
1667  */
1668  lefthighkey = firstright;
1669  }
1670 
1671  /*
1672  * Add new high key to leftpage
1673  */
1674  afterleftoff = P_HIKEY;
1675 
1676  Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
 1677  Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
 1678  IndexRelationGetNumberOfKeyAttributes(rel));
1679  Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1680  if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1681  false) == InvalidOffsetNumber)
1682  elog(ERROR, "failed to add high key to the left sibling"
1683  " while splitting block %u of index \"%s\"",
1684  origpagenumber, RelationGetRelationName(rel));
1685  afterleftoff = OffsetNumberNext(afterleftoff);
1686 
1687  /*
1688  * Acquire a new right page to split into, now that left page has a new
1689  * high key. From here on, it's not okay to throw an error without
1690  * zeroing rightpage first. This coding rule ensures that we won't
1691  * confuse future VACUUM operations, which might otherwise try to re-find
1692  * a downlink to a leftover junk page as the page undergoes deletion.
1693  *
1694  * It would be reasonable to start the critical section just after the new
1695  * rightpage buffer is acquired instead; that would allow us to avoid
1696  * leftover junk pages without bothering to zero rightpage. We do it this
1697  * way because it avoids an unnecessary PANIC when either origpage or its
1698  * existing sibling page are corrupt.
1699  */
1700  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1701  rightpage = BufferGetPage(rbuf);
1702  rightpagenumber = BufferGetBlockNumber(rbuf);
1703  /* rightpage was initialized by _bt_getbuf */
1704  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1705 
1706  /*
1707  * Finish off remaining leftpage special area fields. They cannot be set
1708  * before both origpage (leftpage) and rightpage buffers are acquired and
1709  * locked.
1710  *
1711  * btpo_cycleid is only used with leaf pages, though we set it here in all
1712  * cases just to be consistent.
1713  */
1714  lopaque->btpo_next = rightpagenumber;
1715  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1716 
1717  /*
1718  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1719  * and HAS_GARBAGE flags.
1720  */
1721  ropaque->btpo_flags = oopaque->btpo_flags;
1722  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1723  ropaque->btpo_prev = origpagenumber;
1724  ropaque->btpo_next = oopaque->btpo_next;
1725  ropaque->btpo_level = oopaque->btpo_level;
1726  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1727 
1728  /*
1729  * Add new high key to rightpage where necessary.
1730  *
1731  * If the page we're splitting is not the rightmost page at its level in
1732  * the tree, then the first entry on the page is the high key from
1733  * origpage.
1734  */
1735  afterrightoff = P_HIKEY;
1736 
1737  if (!isrightmost)
1738  {
1739  IndexTuple righthighkey;
1740 
1741  itemid = PageGetItemId(origpage, P_HIKEY);
1742  itemsz = ItemIdGetLength(itemid);
1743  righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1744  Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
 1745  Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
 1746  IndexRelationGetNumberOfKeyAttributes(rel));
1747  if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1748  false, false) == InvalidOffsetNumber)
1749  {
1750  memset(rightpage, 0, BufferGetPageSize(rbuf));
1751  elog(ERROR, "failed to add high key to the right sibling"
1752  " while splitting block %u of index \"%s\"",
1753  origpagenumber, RelationGetRelationName(rel));
1754  }
1755  afterrightoff = OffsetNumberNext(afterrightoff);
1756  }
1757 
1758  /*
1759  * Internal page splits truncate first data item on right page -- it
1760  * becomes "minus infinity" item for the page. Set this up here.
1761  */
1762  minusinfoff = InvalidOffsetNumber;
1763  if (!isleaf)
1764  minusinfoff = afterrightoff;
1765 
1766  /*
1767  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1768  * additional pivot tuples in !isleaf case) to the appropriate page.
1769  *
1770  * Note: we *must* insert at least the right page's items in item-number
1771  * order, for the benefit of _bt_restore_page().
1772  */
1773  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1774  {
1775  IndexTuple dataitem;
1776 
1777  itemid = PageGetItemId(origpage, i);
1778  itemsz = ItemIdGetLength(itemid);
1779  dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1780 
1781  /* replace original item with nposting due to posting split? */
1782  if (i == origpagepostingoff)
1783  {
1784  Assert(BTreeTupleIsPosting(dataitem));
1785  Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1786  dataitem = nposting;
1787  }
1788 
1789  /* does new item belong before this one? */
1790  else if (i == newitemoff)
1791  {
1792  if (newitemonleft)
1793  {
1794  Assert(newitemoff <= firstrightoff);
1795  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1796  false))
1797  {
1798  memset(rightpage, 0, BufferGetPageSize(rbuf));
1799  elog(ERROR, "failed to add new item to the left sibling"
1800  " while splitting block %u of index \"%s\"",
1801  origpagenumber, RelationGetRelationName(rel));
1802  }
1803  afterleftoff = OffsetNumberNext(afterleftoff);
1804  }
1805  else
1806  {
1807  Assert(newitemoff >= firstrightoff);
1808  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1809  afterrightoff == minusinfoff))
1810  {
1811  memset(rightpage, 0, BufferGetPageSize(rbuf));
1812  elog(ERROR, "failed to add new item to the right sibling"
1813  " while splitting block %u of index \"%s\"",
1814  origpagenumber, RelationGetRelationName(rel));
1815  }
1816  afterrightoff = OffsetNumberNext(afterrightoff);
1817  }
1818  }
1819 
1820  /* decide which page to put it on */
1821  if (i < firstrightoff)
1822  {
1823  if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1824  {
1825  memset(rightpage, 0, BufferGetPageSize(rbuf));
1826  elog(ERROR, "failed to add old item to the left sibling"
1827  " while splitting block %u of index \"%s\"",
1828  origpagenumber, RelationGetRelationName(rel));
1829  }
1830  afterleftoff = OffsetNumberNext(afterleftoff);
1831  }
1832  else
1833  {
1834  if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1835  afterrightoff == minusinfoff))
1836  {
1837  memset(rightpage, 0, BufferGetPageSize(rbuf));
1838  elog(ERROR, "failed to add old item to the right sibling"
1839  " while splitting block %u of index \"%s\"",
1840  origpagenumber, RelationGetRelationName(rel));
1841  }
1842  afterrightoff = OffsetNumberNext(afterrightoff);
1843  }
1844  }
1845 
1846  /* Handle case where newitem goes at the end of rightpage */
1847  if (i <= newitemoff)
1848  {
1849  /*
1850  * Can't have newitemonleft here; that would imply we were told to put
1851  * *everything* on the left page, which cannot fit (if it could, we'd
1852  * not be splitting the page).
1853  */
1854  Assert(!newitemonleft && newitemoff == maxoff + 1);
1855  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1856  afterrightoff == minusinfoff))
1857  {
1858  memset(rightpage, 0, BufferGetPageSize(rbuf));
1859  elog(ERROR, "failed to add new item to the right sibling"
1860  " while splitting block %u of index \"%s\"",
1861  origpagenumber, RelationGetRelationName(rel));
1862  }
1863  afterrightoff = OffsetNumberNext(afterrightoff);
1864  }
1865 
1866  /*
1867  * We have to grab the original right sibling (if any) and update its prev
1868  * link. We are guaranteed that this is deadlock-free, since we couple
1869  * the locks in the standard order: left to right.
1870  */
1871  if (!isrightmost)
1872  {
1873  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1874  spage = BufferGetPage(sbuf);
1875  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1876  if (sopaque->btpo_prev != origpagenumber)
1877  {
1878  memset(rightpage, 0, BufferGetPageSize(rbuf));
1879  ereport(ERROR,
1880  (errcode(ERRCODE_INDEX_CORRUPTED),
1881  errmsg_internal("right sibling's left-link doesn't match: "
1882  "block %u links to %u instead of expected %u in index \"%s\"",
1883  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1884  RelationGetRelationName(rel))));
1885  }
1886 
1887  /*
1888  * Check to see if we can set the SPLIT_END flag in the right-hand
1889  * split page; this can save some I/O for vacuum since it need not
1890  * proceed to the right sibling. We can set the flag if the right
1891  * sibling has a different cycleid: that means it could not be part of
1892  * a group of pages that were all split off from the same ancestor
1893  * page. If you're confused, imagine that page A splits to A B and
1894  * then again, yielding A C B, while vacuum is in progress. Tuples
1895  * originally in A could now be in either B or C, hence vacuum must
1896  * examine both pages. But if D, our right sibling, has a different
1897  * cycleid then it could not contain any tuples that were in A when
1898  * the vacuum started.
1899  */
1900  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1901  ropaque->btpo_flags |= BTP_SPLIT_END;
1902  }
1903 
1904  /*
1905  * Right sibling is locked, new siblings are prepared, but original page
1906  * is not updated yet.
1907  *
1908  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1909  * not starting the critical section till here because we haven't been
1910  * scribbling on the original page yet; see comments above.
 1911  */
 1912  START_CRIT_SECTION();
1913 
1914  /*
1915  * By here, the original data page has been split into two new halves, and
1916  * these are correct. The algorithm requires that the left page never
1917  * move during a split, so we copy the new left page back on top of the
1918  * original. We need to do this before writing the WAL record, so that
1919  * XLogInsert can WAL log an image of the page if necessary.
1920  */
1921  PageRestoreTempPage(leftpage, origpage);
1922  /* leftpage, lopaque must not be used below here */
1923 
1924  MarkBufferDirty(buf);
1925  MarkBufferDirty(rbuf);
1926 
1927  if (!isrightmost)
1928  {
1929  sopaque->btpo_prev = rightpagenumber;
1930  MarkBufferDirty(sbuf);
1931  }
1932 
1933  /*
1934  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1935  * a split
1936  */
1937  if (!isleaf)
1938  {
1939  Page cpage = BufferGetPage(cbuf);
1940  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1941 
1942  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1943  MarkBufferDirty(cbuf);
1944  }
1945 
1946  /* XLOG stuff */
1947  if (RelationNeedsWAL(rel))
1948  {
1949  xl_btree_split xlrec;
1950  uint8 xlinfo;
1951  XLogRecPtr recptr;
1952 
1953  xlrec.level = ropaque->btpo_level;
1954  /* See comments below on newitem, orignewitem, and posting lists */
1955  xlrec.firstrightoff = firstrightoff;
1956  xlrec.newitemoff = newitemoff;
1957  xlrec.postingoff = 0;
1958  if (postingoff != 0 && origpagepostingoff < firstrightoff)
1959  xlrec.postingoff = postingoff;
1960 
1961  XLogBeginInsert();
1962  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
 1963 
 1964  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1965  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
 1966  /* Log original right sibling, since we've changed its prev-pointer */
 1967  if (!isrightmost)
 1968  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
 1969  if (!isleaf)
 1970  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
 1971 
1972  /*
1973  * Log the new item, if it was inserted on the left page. (If it was
1974  * put on the right page, we don't need to explicitly WAL log it
1975  * because it's included with all the other items on the right page.)
1976  * Show the new item as belonging to the left page buffer, so that it
1977  * is not stored if XLogInsert decides it needs a full-page image of
1978  * the left page. We always store newitemoff in the record, though.
1979  *
1980  * The details are sometimes slightly different for page splits that
1981  * coincide with a posting list split. If both the replacement
1982  * posting list and newitem go on the right page, then we don't need
1983  * to log anything extra, just like the simple !newitemonleft
1984  * no-posting-split case (postingoff is set to zero in the WAL record,
1985  * so recovery doesn't need to process a posting list split at all).
1986  * Otherwise, we set postingoff and log orignewitem instead of
1987  * newitem, despite having actually inserted newitem. REDO routine
1988  * must reconstruct nposting and newitem using _bt_swap_posting().
1989  *
1990  * Note: It's possible that our page split point is the point that
1991  * makes the posting list lastleft and newitem firstright. This is
1992  * the only case where we log orignewitem/newitem despite newitem
1993  * going on the right page. If XLogInsert decides that it can omit
1994  * orignewitem due to logging a full-page image of the left page,
1995  * everything still works out, since recovery only needs to log
1996  * orignewitem for items on the left page (just like the regular
1997  * newitem-logged case).
1998  */
1999  if (newitemonleft && xlrec.postingoff == 0)
2000  XLogRegisterBufData(0, (char *) newitem, newitemsz);
2001  else if (xlrec.postingoff != 0)
2002  {
2003  Assert(isleaf);
2004  Assert(newitemonleft || firstrightoff == newitemoff);
2005  Assert(newitemsz == IndexTupleSize(orignewitem));
2006  XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2007  }
2008 
2009  /* Log the left page's new high key */
2010  if (!isleaf)
2011  {
2012  /* lefthighkey isn't local copy, get current pointer */
2013  itemid = PageGetItemId(origpage, P_HIKEY);
2014  lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2015  }
2016  XLogRegisterBufData(0, (char *) lefthighkey,
2017  MAXALIGN(IndexTupleSize(lefthighkey)));
2018 
2019  /*
2020  * Log the contents of the right page in the format understood by
2021  * _bt_restore_page(). The whole right page will be recreated.
2022  *
2023  * Direct access to page is not good but faster - we should implement
2024  * some new func in page API. Note we only store the tuples
2025  * themselves, knowing that they were inserted in item-number order
2026  * and so the line pointers can be reconstructed. See comments for
2027  * _bt_restore_page().
2028  */
 2029  XLogRegisterBufData(1,
 2030  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2031  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2032 
2033  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2034  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2035 
2036  PageSetLSN(origpage, recptr);
2037  PageSetLSN(rightpage, recptr);
2038  if (!isrightmost)
2039  PageSetLSN(spage, recptr);
2040  if (!isleaf)
2041  PageSetLSN(BufferGetPage(cbuf), recptr);
2042  }
2043 
2044  END_CRIT_SECTION();
2045 
2046  /* release the old right sibling */
2047  if (!isrightmost)
2048  _bt_relbuf(rel, sbuf);
2049 
2050  /* release the child */
2051  if (!isleaf)
2052  _bt_relbuf(rel, cbuf);
2053 
2054  /* be tidy */
2055  if (isleaf)
2056  pfree(lefthighkey);
2057 
2058  /* split's done */
2059  return rbuf;
2060 }
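/*
 * [Editor's illustrative aside -- not part of nbtinsert.c.]  The block below
 * is a minimal, standalone sketch of how the split point chosen in
 * _bt_split() classifies tuples: existing items strictly before firstrightoff
 * stay on the left page, everything else goes right, and newitemonleft
 * resolves the newitemoff == firstrightoff boundary case.  The helper names
 * (side_for_existing, side_for_newitem) are hypothetical stand-ins, not
 * PostgreSQL APIs, and plain integers stand in for page offset numbers.
 */
#include <stdbool.h>
#include <stdio.h>

static const char *
side_for_existing(unsigned int off, unsigned int firstrightoff)
{
	/* mirrors the "i < firstrightoff" test in _bt_split()'s transfer loop */
	return (off < firstrightoff) ? "left" : "right";
}

static const char *
side_for_newitem(unsigned int newitemoff, unsigned int firstrightoff,
				 bool newitemonleft)
{
	/* only the boundary case is ambiguous; newitemonleft settles it */
	if (newitemoff == firstrightoff)
		return newitemonleft ? "left" : "right";
	return (newitemoff < firstrightoff) ? "left" : "right";
}

int
main(void)
{
	unsigned int firstrightoff = 3;

	for (unsigned int off = 1; off <= 4; off++)
		printf("existing item at offset %u -> %s page\n",
			   off, side_for_existing(off, firstrightoff));

	/* new item lands exactly at firstrightoff: it becomes firstright */
	printf("new item at offset 3 -> %s page\n",
		   side_for_newitem(3, firstrightoff, false));
	return 0;
}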
2061 
2062 /*
2063  * _bt_insert_parent() -- Insert downlink into parent, completing split.
2064  *
2065  * On entry, buf and rbuf are the left and right split pages, which we
2066  * still hold write locks on. Both locks will be released here. We
2067  * release the rbuf lock once we have a write lock on the page that we
2068  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2069  * The lock on buf is released at the same point as the lock on the parent
2070  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2071  * atomic operation that completes the split by inserting a new downlink.
2072  *
2073  * stack - stack showing how we got here. Will be NULL when splitting true
2074  * root, or during concurrent root split, where we can be inefficient
2075  * isroot - we split the true root
2076  * isonly - we split a page alone on its level (might have been fast root)
2077  */
 2078 static void
 2079 _bt_insert_parent(Relation rel,
2080  Buffer buf,
2081  Buffer rbuf,
2082  BTStack stack,
2083  bool isroot,
2084  bool isonly)
2085 {
2086  /*
2087  * Here we have to do something Lehman and Yao don't talk about: deal with
2088  * a root split and construction of a new root. If our stack is empty
2089  * then we have just split a node on what had been the root level when we
2090  * descended the tree. If it was still the root then we perform a
2091  * new-root construction. If it *wasn't* the root anymore, search to find
2092  * the next higher level that someone constructed meanwhile, and find the
2093  * right place to insert as for the normal case.
2094  *
2095  * If we have to search for the parent level, we do so by re-descending
2096  * from the root. This is not super-efficient, but it's rare enough not
2097  * to matter.
2098  */
2099  if (isroot)
2100  {
2101  Buffer rootbuf;
2102 
2103  Assert(stack == NULL);
2104  Assert(isonly);
2105  /* create a new root node and update the metapage */
2106  rootbuf = _bt_newroot(rel, buf, rbuf);
2107  /* release the split buffers */
2108  _bt_relbuf(rel, rootbuf);
2109  _bt_relbuf(rel, rbuf);
2110  _bt_relbuf(rel, buf);
2111  }
2112  else
2113  {
2114  BlockNumber bknum = BufferGetBlockNumber(buf);
2115  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2116  Page page = BufferGetPage(buf);
2117  IndexTuple new_item;
2118  BTStackData fakestack;
2119  IndexTuple ritem;
2120  Buffer pbuf;
2121 
2122  if (stack == NULL)
2123  {
2124  BTPageOpaque opaque;
2125 
2126  elog(DEBUG2, "concurrent ROOT page split");
2127  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2128 
2129  /*
2130  * We should never reach here when a leaf page split takes place
2131  * despite the insert of newitem being able to apply the fastpath
2132  * optimization. Make sure of that with an assertion.
2133  *
2134  * This is more of a performance issue than a correctness issue.
2135  * The fastpath won't have a descent stack. Using a phony stack
2136  * here works, but never rely on that. The fastpath should be
2137  * rejected within _bt_search_insert() when the rightmost leaf
2138  * page will split, since it's faster to go through _bt_search()
2139  * and get a stack in the usual way.
2140  */
 2141  Assert(!(P_ISLEAF(opaque) &&
 2142  BlockNumberIsValid(RelationGetTargetBlock(rel))));
2143 
2144  /* Find the leftmost page at the next level up */
2145  pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
2146  /* Set up a phony stack entry pointing there */
2147  stack = &fakestack;
 2148  stack->bts_blkno = BufferGetBlockNumber(pbuf);
 2149  stack->bts_offset = InvalidOffsetNumber;
2150  stack->bts_parent = NULL;
2151  _bt_relbuf(rel, pbuf);
2152  }
2153 
2154  /* get high key from left, a strict lower bound for new right page */
2155  ritem = (IndexTuple) PageGetItem(page,
2156  PageGetItemId(page, P_HIKEY));
2157 
2158  /* form an index tuple that points at the new right page */
2159  new_item = CopyIndexTuple(ritem);
2160  BTreeTupleSetDownLink(new_item, rbknum);
2161 
2162  /*
2163  * Re-find and write lock the parent of buf.
2164  *
2165  * It's possible that the location of buf's downlink has changed since
2166  * our initial _bt_search() descent. _bt_getstackbuf() will detect
2167  * and recover from this, updating the stack, which ensures that the
2168  * new downlink will be inserted at the correct offset. Even buf's
2169  * parent may have changed.
2170  */
2171  pbuf = _bt_getstackbuf(rel, stack, bknum);
2172 
2173  /*
2174  * Unlock the right child. The left child will be unlocked in
2175  * _bt_insertonpg().
2176  *
2177  * Unlocking the right child must be delayed until here to ensure that
2178  * no concurrent VACUUM operation can become confused. Page deletion
2179  * cannot be allowed to fail to re-find a downlink for the rbuf page.
2180  * (Actually, this is just a vestige of how things used to work. The
2181  * page deletion code is expected to check for the INCOMPLETE_SPLIT
2182  * flag on the left child. It won't attempt deletion of the right
2183  * child until the split is complete. Despite all this, we opt to
2184  * conservatively delay unlocking the right child until here.)
2185  */
2186  _bt_relbuf(rel, rbuf);
2187 
2188  if (pbuf == InvalidBuffer)
2189  ereport(ERROR,
2190  (errcode(ERRCODE_INDEX_CORRUPTED),
2191  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2192  RelationGetRelationName(rel), bknum, rbknum)));
2193 
2194  /* Recursively insert into the parent */
2195  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
2196  new_item, MAXALIGN(IndexTupleSize(new_item)),
2197  stack->bts_offset + 1, 0, isonly);
2198 
2199  /* be tidy */
2200  pfree(new_item);
2201  }
2202 }
2203 
2204 /*
2205  * _bt_finish_split() -- Finish an incomplete split
2206  *
2207  * A crash or other failure can leave a split incomplete. The insertion
 2208  * routines won't allow inserting on a page that is incompletely split.
2209  * Before inserting on such a page, call _bt_finish_split().
2210  *
2211  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2212  * and unpinned.
2213  */
 2214 void
 2215 _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
2216 {
2217  Page lpage = BufferGetPage(lbuf);
2218  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2219  Buffer rbuf;
2220  Page rpage;
2221  BTPageOpaque rpageop;
2222  bool wasroot;
2223  bool wasonly;
2224 
2225  Assert(P_INCOMPLETE_SPLIT(lpageop));
2226 
2227  /* Lock right sibling, the one missing the downlink */
2228  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2229  rpage = BufferGetPage(rbuf);
2230  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2231 
2232  /* Could this be a root split? */
2233  if (!stack)
2234  {
2235  Buffer metabuf;
2236  Page metapg;
2237  BTMetaPageData *metad;
2238 
2239  /* acquire lock on the metapage */
2240  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2241  metapg = BufferGetPage(metabuf);
2242  metad = BTPageGetMeta(metapg);
2243 
2244  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2245 
2246  _bt_relbuf(rel, metabuf);
2247  }
2248  else
2249  wasroot = false;
2250 
2251  /* Was this the only page on the level before split? */
2252  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2253 
 2254  elog(DEBUG1, "finishing incomplete split of %u/%u",
 2255  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2256 
2257  _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
2258 }
2259 
2260 /*
2261  * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2262  * tuple whose downlink points to child page.
2263  *
2264  * Caller passes child's block number, which is used to identify
2265  * associated pivot tuple in parent page using a linear search that
2266  * matches on pivot's downlink/block number. The expected location of
2267  * the pivot tuple is taken from the stack one level above the child
2268  * page. This is used as a starting point. Insertions into the
2269  * parent level could cause the pivot tuple to move right; deletions
2270  * could cause it to move left, but not left of the page we previously
2271  * found it on.
2272  *
2273  * Caller can use its stack to relocate the pivot tuple/downlink for
2274  * any same-level page to the right of the page found by its initial
2275  * descent. This is necessary because of the possibility that caller
2276  * moved right to recover from a concurrent page split. It's also
2277  * convenient for certain callers to be able to step right when there
2278  * wasn't a concurrent page split, while still using their original
2279  * stack. For example, the checkingunique _bt_doinsert() case may
2280  * have to step right when there are many physical duplicates, and its
2281  * scantid forces an insertion to the right of the "first page the
2282  * value could be on". (This is also relied on by all of our callers
2283  * when dealing with !heapkeyspace indexes.)
2284  *
2285  * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2286  * tuple not found (should not happen). Adjusts bts_blkno &
2287  * bts_offset if changed. Page split caller should insert its new
2288  * pivot tuple for its new right sibling page on parent page, at the
2289  * offset number bts_offset + 1.
2290  */
 2291 Buffer
 2292 _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
2293 {
2294  BlockNumber blkno;
2295  OffsetNumber start;
2296 
2297  blkno = stack->bts_blkno;
2298  start = stack->bts_offset;
2299 
2300  for (;;)
2301  {
2302  Buffer buf;
2303  Page page;
2304  BTPageOpaque opaque;
2305 
2306  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2307  page = BufferGetPage(buf);
2308  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2309 
2310  if (P_INCOMPLETE_SPLIT(opaque))
2311  {
2312  _bt_finish_split(rel, buf, stack->bts_parent);
2313  continue;
2314  }
2315 
2316  if (!P_IGNORE(opaque))
2317  {
2318  OffsetNumber offnum,
2319  minoff,
2320  maxoff;
2321  ItemId itemid;
2322  IndexTuple item;
2323 
2324  minoff = P_FIRSTDATAKEY(opaque);
2325  maxoff = PageGetMaxOffsetNumber(page);
2326 
2327  /*
2328  * start = InvalidOffsetNumber means "search the whole page". We
2329  * need this test anyway due to possibility that page has a high
2330  * key now when it didn't before.
2331  */
2332  if (start < minoff)
2333  start = minoff;
2334 
2335  /*
2336  * Need this check too, to guard against possibility that page
2337  * split since we visited it originally.
2338  */
2339  if (start > maxoff)
2340  start = OffsetNumberNext(maxoff);
2341 
2342  /*
2343  * These loops will check every item on the page --- but in an
2344  * order that's attuned to the probability of where it actually
2345  * is. Scan to the right first, then to the left.
2346  */
2347  for (offnum = start;
2348  offnum <= maxoff;
2349  offnum = OffsetNumberNext(offnum))
2350  {
2351  itemid = PageGetItemId(page, offnum);
2352  item = (IndexTuple) PageGetItem(page, itemid);
2353 
2354  if (BTreeTupleGetDownLink(item) == child)
2355  {
2356  /* Return accurate pointer to where link is now */
2357  stack->bts_blkno = blkno;
2358  stack->bts_offset = offnum;
2359  return buf;
2360  }
2361  }
2362 
2363  for (offnum = OffsetNumberPrev(start);
2364  offnum >= minoff;
2365  offnum = OffsetNumberPrev(offnum))
2366  {
2367  itemid = PageGetItemId(page, offnum);
2368  item = (IndexTuple) PageGetItem(page, itemid);
2369 
2370  if (BTreeTupleGetDownLink(item) == child)
2371  {
2372  /* Return accurate pointer to where link is now */
2373  stack->bts_blkno = blkno;
2374  stack->bts_offset = offnum;
2375  return buf;
2376  }
2377  }
2378  }
2379 
2380  /*
2381  * The item we're looking for moved right at least one page.
2382  *
2383  * Lehman and Yao couple/chain locks when moving right here, which we
2384  * can avoid. See nbtree/README.
2385  */
2386  if (P_RIGHTMOST(opaque))
2387  {
2388  _bt_relbuf(rel, buf);
2389  return InvalidBuffer;
2390  }
2391  blkno = opaque->btpo_next;
2392  start = InvalidOffsetNumber;
2393  _bt_relbuf(rel, buf);
2394  }
2395 }
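/*
 * [Editor's illustrative aside -- not part of nbtinsert.c.]  A standalone
 * sketch of the search order _bt_getstackbuf() uses within one parent page:
 * start at the remembered offset, scan to the right first (the more likely
 * direction after concurrent inserts), then scan back to the left.  A plain
 * array of downlink block numbers stands in for the page; find_downlink() is
 * a hypothetical name, not a PostgreSQL function.
 */
#include <stdio.h>

static int
find_downlink(const unsigned int *downlinks, int nitems, int start,
			  unsigned int child)
{
	/* scan right from the expected position first */
	for (int i = start; i < nitems; i++)
		if (downlinks[i] == child)
			return i;

	/* then scan left, back toward the start of the page */
	for (int i = start - 1; i >= 0; i--)
		if (downlinks[i] == child)
			return i;

	return -1;					/* caller would step right to the next page */
}

int
main(void)
{
	unsigned int downlinks[] = {10, 11, 12, 13, 14};

	/* expected position was index 3, but the downlink sits at index 1 */
	printf("child 11 found at index %d\n", find_downlink(downlinks, 5, 3, 11));
	return 0;
}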
2396 
2397 /*
2398  * _bt_newroot() -- Create a new root page for the index.
2399  *
2400  * We've just split the old root page and need to create a new one.
2401  * In order to do this, we add a new root page to the file, then lock
2402  * the metadata page and update it. This is guaranteed to be deadlock-
2403  * free, because all readers release their locks on the metadata page
2404  * before trying to lock the root, and all writers lock the root before
2405  * trying to lock the metadata page. We have a write lock on the old
2406  * root page, so we have not introduced any cycles into the waits-for
2407  * graph.
2408  *
2409  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2410  * locked. On exit, a new root page exists with entries for the
2411  * two new children, metapage is updated and unlocked/unpinned.
2412  * The new root buffer is returned to caller which has to unlock/unpin
2413  * lbuf, rbuf & rootbuf.
2414  */
 2415 static Buffer
 2416 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
2417 {
2418  Buffer rootbuf;
2419  Page lpage,
2420  rootpage;
2421  BlockNumber lbkno,
2422  rbkno;
2423  BlockNumber rootblknum;
2424  BTPageOpaque rootopaque;
2425  BTPageOpaque lopaque;
2426  ItemId itemid;
2427  IndexTuple item;
2428  IndexTuple left_item;
2429  Size left_item_sz;
2430  IndexTuple right_item;
2431  Size right_item_sz;
2432  Buffer metabuf;
2433  Page metapg;
2434  BTMetaPageData *metad;
2435 
2436  lbkno = BufferGetBlockNumber(lbuf);
2437  rbkno = BufferGetBlockNumber(rbuf);
2438  lpage = BufferGetPage(lbuf);
2439  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2440 
2441  /* get a new root page */
2442  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2443  rootpage = BufferGetPage(rootbuf);
2444  rootblknum = BufferGetBlockNumber(rootbuf);
2445 
2446  /* acquire lock on the metapage */
2447  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2448  metapg = BufferGetPage(metabuf);
2449  metad = BTPageGetMeta(metapg);
2450 
2451  /*
2452  * Create downlink item for left page (old root). The key value used is
2453  * "minus infinity", a sentinel value that's reliably less than any real
2454  * key value that could appear in the left page.
2455  */
2456  left_item_sz = sizeof(IndexTupleData);
2457  left_item = (IndexTuple) palloc(left_item_sz);
2458  left_item->t_info = left_item_sz;
2459  BTreeTupleSetDownLink(left_item, lbkno);
2460  BTreeTupleSetNAtts(left_item, 0, false);
2461 
2462  /*
2463  * Create downlink item for right page. The key for it is obtained from
2464  * the "high key" position in the left page.
2465  */
2466  itemid = PageGetItemId(lpage, P_HIKEY);
2467  right_item_sz = ItemIdGetLength(itemid);
2468  item = (IndexTuple) PageGetItem(lpage, itemid);
2469  right_item = CopyIndexTuple(item);
2470  BTreeTupleSetDownLink(right_item, rbkno);
2471 
 2472  /* NO EREPORT(ERROR) from here till newroot op is logged */
 2473  START_CRIT_SECTION();
2474 
2475  /* upgrade metapage if needed */
2476  if (metad->btm_version < BTREE_NOVAC_VERSION)
2477  _bt_upgrademetapage(metapg);
2478 
2479  /* set btree special data */
2480  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2481  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2482  rootopaque->btpo_flags = BTP_ROOT;
2483  rootopaque->btpo_level =
2484  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
2485  rootopaque->btpo_cycleid = 0;
2486 
2487  /* update metapage data */
2488  metad->btm_root = rootblknum;
2489  metad->btm_level = rootopaque->btpo_level;
2490  metad->btm_fastroot = rootblknum;
2491  metad->btm_fastlevel = rootopaque->btpo_level;
2492 
2493  /*
2494  * Insert the left page pointer into the new root page. The root page is
2495  * the rightmost page on its level so there is no "high key" in it; the
2496  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2497  *
2498  * Note: we *must* insert the two items in item-number order, for the
2499  * benefit of _bt_restore_page().
2500  */
2501  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2502  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2503  false, false) == InvalidOffsetNumber)
2504  elog(PANIC, "failed to add leftkey to new root page"
 2505  " while splitting block %u of index \"%s\"",
 2506  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2507 
2508  /*
2509  * insert the right page pointer into the new root page.
2510  */
2511  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
 2512  Assert(BTreeTupleGetNAtts(right_item, rel) <=
 2513  IndexRelationGetNumberOfKeyAttributes(rel));
2514  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2515  false, false) == InvalidOffsetNumber)
2516  elog(PANIC, "failed to add rightkey to new root page"
 2517  " while splitting block %u of index \"%s\"",
 2518  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2519 
2520  /* Clear the incomplete-split flag in the left child */
2521  Assert(P_INCOMPLETE_SPLIT(lopaque));
2522  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2523  MarkBufferDirty(lbuf);
2524 
2525  MarkBufferDirty(rootbuf);
2526  MarkBufferDirty(metabuf);
2527 
2528  /* XLOG stuff */
2529  if (RelationNeedsWAL(rel))
2530  {
2531  xl_btree_newroot xlrec;
2532  XLogRecPtr recptr;
2533  xl_btree_metadata md;
2534 
2535  xlrec.rootblk = rootblknum;
2536  xlrec.level = metad->btm_level;
2537 
2538  XLogBeginInsert();
2539  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2540 
 2541  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
 2542  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
 2543  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 2544 
 2545  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2546  md.version = metad->btm_version;
2547  md.root = rootblknum;
2548  md.level = metad->btm_level;
2549  md.fastroot = rootblknum;
 2550  md.fastlevel = metad->btm_level;
 2551  md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
 2552  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2553  md.allequalimage = metad->btm_allequalimage;
2554 
2555  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2556 
2557  /*
2558  * Direct access to page is not good but faster - we should implement
2559  * some new func in page API.
2560  */
 2561  XLogRegisterBufData(0,
 2562  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2563  ((PageHeader) rootpage)->pd_special -
2564  ((PageHeader) rootpage)->pd_upper);
2565 
2566  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2567 
2568  PageSetLSN(lpage, recptr);
2569  PageSetLSN(rootpage, recptr);
2570  PageSetLSN(metapg, recptr);
2571  }
2572 
2573  END_CRIT_SECTION();
2574 
2575  /* done with metapage */
2576  _bt_relbuf(rel, metabuf);
2577 
2578  pfree(left_item);
2579  pfree(right_item);
2580 
2581  return rootbuf;
2582 }
2583 
2584 /*
2585  * _bt_pgaddtup() -- add a data item to a particular page during split.
2586  *
2587  * The difference between this routine and a bare PageAddItem call is
2588  * that this code can deal with the first data item on an internal btree
2589  * page in passing. This data item (which is called "firstright" within
2590  * _bt_split()) has a key that must be treated as minus infinity after
2591  * the split. Therefore, we truncate away all attributes when caller
2592  * specifies it's the first data item on page (downlink is not changed,
2593  * though). This extra step is only needed for the right page of an
2594  * internal page split. There is no need to do this for the first data
2595  * item on the existing/left page, since that will already have been
2596  * truncated during an earlier page split.
2597  *
2598  * See _bt_split() for a high level explanation of why we truncate here.
2599  * Note that this routine has nothing to do with suffix truncation,
2600  * despite using some of the same infrastructure.
2601  */
 2602 static inline bool
 2603 _bt_pgaddtup(Page page,
2604  Size itemsize,
2605  IndexTuple itup,
2606  OffsetNumber itup_off,
2607  bool newfirstdataitem)
2608 {
2609  IndexTupleData trunctuple;
2610 
2611  if (newfirstdataitem)
2612  {
2613  trunctuple = *itup;
2614  trunctuple.t_info = sizeof(IndexTupleData);
2615  BTreeTupleSetNAtts(&trunctuple, 0, false);
2616  itup = &trunctuple;
2617  itemsize = sizeof(IndexTupleData);
2618  }
2619 
2620  if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2621  false) == InvalidOffsetNumber))
2622  return false;
2623 
2624  return true;
2625 }
2626 
2627 /*
2628  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2629  *
2630  * There are three operations performed here: simple index deletion, bottom-up
2631  * index deletion, and deduplication. If all three operations fail to free
2632  * enough space for the incoming item then caller will go on to split the
2633  * page. We always consider simple deletion first. If that doesn't work out
2634  * we consider alternatives. Callers that only want us to consider simple
2635  * deletion (without any fallback) ask for that using the 'simpleonly'
2636  * argument.
2637  *
2638  * We usually pick only one alternative "complex" operation when simple
2639  * deletion alone won't prevent a page split. The 'checkingunique',
2640  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2641  *
2642  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2643  * level flag was found set. The flag was useful back when there wasn't
2644  * necessarily one single page for a duplicate tuple to go on (before heap TID
2645  * became a part of the key space in version 4 indexes). But we don't
2646  * actually look at the flag anymore (it's not a gating condition for our
2647  * caller). That would cause us to miss tuples that are safe to delete,
2648  * without getting any benefit in return. We know that the alternative is to
2649  * split the page; scanning the line pointer array in passing won't have
2650  * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2651  * all this because !heapkeyspace indexes must still do a "getting tired"
2652  * linear search, and so are likely to get some benefit from using it as a
2653  * gating condition.)
2654  */
 2655 static void
 2656 _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2657  BTInsertState insertstate,
2658  bool simpleonly, bool checkingunique,
2659  bool uniquedup, bool indexUnchanged)
 2660 {
 2661  OffsetNumber deletable[MaxIndexTuplesPerPage];
2662  int ndeletable = 0;
2663  OffsetNumber offnum,
2664  minoff,
2665  maxoff;
2666  Buffer buffer = insertstate->buf;
2667  BTScanInsert itup_key = insertstate->itup_key;
 2668  Page page = BufferGetPage(buffer);
 2669  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2670 
2671  Assert(P_ISLEAF(opaque));
2672  Assert(simpleonly || itup_key->heapkeyspace);
2673  Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2674 
2675  /*
2676  * Scan over all items to see which ones need to be deleted according to
2677  * LP_DEAD flags. We'll usually manage to delete a few extra items that
2678  * are not marked LP_DEAD in passing. Often the extra items that actually
2679  * end up getting deleted are items that would have had their LP_DEAD bit
2680  * set before long anyway (if we opted not to include them as extras).
2681  */
2682  minoff = P_FIRSTDATAKEY(opaque);
2683  maxoff = PageGetMaxOffsetNumber(page);
2684  for (offnum = minoff;
2685  offnum <= maxoff;
2686  offnum = OffsetNumberNext(offnum))
2687  {
2688  ItemId itemId = PageGetItemId(page, offnum);
2689 
2690  if (ItemIdIsDead(itemId))
2691  deletable[ndeletable++] = offnum;
2692  }
2693 
2694  if (ndeletable > 0)
2695  {
2696  _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2697  insertstate->itup, minoff, maxoff);
2698  insertstate->bounds_valid = false;
2699 
2700  /* Return when a page split has already been avoided */
2701  if (PageGetFreeSpace(page) >= insertstate->itemsz)
2702  return;
2703 
2704  /* Might as well assume duplicates (if checkingunique) */
2705  uniquedup = true;
2706  }
2707 
2708  /*
 2709  * We're done with simple deletion. Return early for callers that only
 2710  * want simple deletion to be considered here. This includes
2711  * callers that explicitly ask for this and checkingunique callers that
2712  * probably don't have any version churn duplicates on the page.
2713  *
2714  * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
 2715  * return at this point (or when we go on to try either or both of our
2716  * other strategies and they also fail). We do not bother expending a
2717  * separate write to clear it, however. Caller will definitely clear it
2718  * when it goes on to split the page (note also that the deduplication
2719  * process will clear the flag in passing, just to keep things tidy).
2720  */
2721  if (simpleonly || (checkingunique && !uniquedup))
2722  {
2723  Assert(!indexUnchanged);
2724  return;
2725  }
2726 
2727  /* Assume bounds about to be invalidated (this is almost certain now) */
2728  insertstate->bounds_valid = false;
2729 
2730  /*
2731  * Perform bottom-up index deletion pass when executor hint indicated that
2732  * incoming item is logically unchanged, or for a unique index that is
2733  * known to have physical duplicates for some other reason. (There is a
2734  * large overlap between these two cases for a unique index. It's worth
2735  * having both triggering conditions in order to apply the optimization in
2736  * the event of successive related INSERT and DELETE statements.)
2737  *
2738  * We'll go on to do a deduplication pass when a bottom-up pass fails to
2739  * delete an acceptable amount of free space (a significant fraction of
2740  * the page, or space for the new item, whichever is greater).
2741  *
2742  * Note: Bottom-up index deletion uses the same equality/equivalence
2743  * routines as deduplication internally. However, it does not merge
2744  * together index tuples, so the same correctness considerations do not
2745  * apply. We deliberately omit an index-is-allequalimage test here.
2746  */
2747  if ((indexUnchanged || uniquedup) &&
2748  _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2749  return;
2750 
2751  /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2752  if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2753  _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
2754  insertstate->itemsz, checkingunique);
2755 }
2756 
2757 /*
2758  * _bt_simpledel_pass - Simple index tuple deletion pass.
2759  *
2760  * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2761  * of all such tuples are determined by caller (caller passes these to us as
2762  * its 'deletable' argument).
2763  *
2764  * We might also delete extra index tuples that turn out to be safe to delete
2765  * in passing (though they must be cheap to check in passing to begin with).
2766  * There is no certainty that any extra tuples will be deleted, though. The
2767  * high level goal of the approach we take is to get the most out of each call
2768  * here (without noticeably increasing the per-call overhead compared to what
2769  * we need to do just to be able to delete the page's LP_DEAD-marked index
2770  * tuples).
2771  *
2772  * The number of extra index tuples that turn out to be deletable might
2773  * greatly exceed the number of LP_DEAD-marked index tuples due to various
2774  * locality related effects. For example, it's possible that the total number
2775  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2776  * quite low, in which case we might end up checking if it's possible to
2777  * delete _most_ index tuples on the page (without the tableam needing to
2778  * access additional table blocks). The tableam will sometimes stumble upon
2779  * _many_ extra deletable index tuples in indexes where this pattern is
2780  * common.
2781  *
2782  * See nbtree/README for further details on simple index tuple deletion.
2783  */
 2784 static void
 2785 _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2786  OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2787  OffsetNumber minoff, OffsetNumber maxoff)
2788 {
2789  Page page = BufferGetPage(buffer);
2790  BlockNumber *deadblocks;
2791  int ndeadblocks;
2792  TM_IndexDeleteOp delstate;
2793  OffsetNumber offnum;
2794 
2795  /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2796  deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2797  &ndeadblocks);
2798 
2799  /* Initialize tableam state that describes index deletion operation */
2800  delstate.bottomup = false;
2801  delstate.bottomupfreespace = 0;
2802  delstate.ndeltids = 0;
2803  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2804  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2805 
2806  for (offnum = minoff;
2807  offnum <= maxoff;
2808  offnum = OffsetNumberNext(offnum))
2809  {
2810  ItemId itemid = PageGetItemId(page, offnum);
2811  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2812  TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2813  TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2814  BlockNumber tidblock;
2815  void *match;
2816 
2817  if (!BTreeTupleIsPosting(itup))
2818  {
2819  tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2820  match = bsearch(&tidblock, deadblocks, ndeadblocks,
2821  sizeof(BlockNumber), _bt_blk_cmp);
2822 
2823  if (!match)
2824  {
2825  Assert(!ItemIdIsDead(itemid));
2826  continue;
2827  }
2828 
2829  /*
2830  * TID's table block is among those pointed to by the TIDs from
2831  * LP_DEAD-bit set tuples on page -- add TID to deltids
2832  */
2833  odeltid->tid = itup->t_tid;
2834  odeltid->id = delstate.ndeltids;
2835  ostatus->idxoffnum = offnum;
2836  ostatus->knowndeletable = ItemIdIsDead(itemid);
2837  ostatus->promising = false; /* unused */
2838  ostatus->freespace = 0; /* unused */
2839 
2840  delstate.ndeltids++;
2841  }
2842  else
2843  {
2844  int nitem = BTreeTupleGetNPosting(itup);
2845 
2846  for (int p = 0; p < nitem; p++)
2847  {
2848  ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2849 
2850  tidblock = ItemPointerGetBlockNumber(tid);
2851  match = bsearch(&tidblock, deadblocks, ndeadblocks,
2852  sizeof(BlockNumber), _bt_blk_cmp);
2853 
2854  if (!match)
2855  {
2856  Assert(!ItemIdIsDead(itemid));
2857  continue;
2858  }
2859 
2860  /*
2861  * TID's table block is among those pointed to by the TIDs
2862  * from LP_DEAD-bit set tuples on page -- add TID to deltids
2863  */
2864  odeltid->tid = *tid;
2865  odeltid->id = delstate.ndeltids;
2866  ostatus->idxoffnum = offnum;
2867  ostatus->knowndeletable = ItemIdIsDead(itemid);
2868  ostatus->promising = false; /* unused */
2869  ostatus->freespace = 0; /* unused */
2870 
2871  odeltid++;
2872  ostatus++;
2873  delstate.ndeltids++;
2874  }
2875  }
2876  }
2877 
2878  pfree(deadblocks);
2879 
2880  Assert(delstate.ndeltids >= ndeletable);
2881 
2882  /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2883  _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2884 
2885  pfree(delstate.deltids);
2886  pfree(delstate.status);
2887 }
2888 
2889 /*
2890  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2891  *
2892  * Builds sorted and unique-ified array of table block numbers from index
2893  * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2894  * block from incoming newitem just in case it isn't among the LP_DEAD-related
2895  * table blocks.
2896  *
2897  * Always counting the newitem's table block as an LP_DEAD related block makes
2898  * sense because the cost is consistently low; it is practically certain that
2899  * the table block will not incur a buffer miss in tableam. On the other hand
2900  * the benefit is often quite high. There is a decent chance that there will
2901  * be some deletable items from this block, since in general most garbage
2902  * tuples became garbage in the recent past (in many cases this won't be the
2903  * first logical row that core code added to/modified in table block
2904  * recently).
2905  *
2906  * Returns final array, and sets *nblocks to its final size for caller.
2907  */
2908 static BlockNumber *
2909 _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2910  IndexTuple newitem, int *nblocks)
2911 {
2912  int spacentids,
2913  ntids;
2914  BlockNumber *tidblocks;
2915 
2916  /*
2917  * Accumulate each TID's block in array whose initial size has space for
2918  * one table block per LP_DEAD-set tuple (plus space for the newitem table
2919  * block). Array will only need to grow when there are LP_DEAD-marked
2920  * posting list tuples (which is not that common).
2921  */
2922  spacentids = ndeletable + 1;
2923  ntids = 0;
2924  tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2925 
2926  /*
2927  * First add the table block for the incoming newitem. This is the one
2928  * case where simple deletion can visit a table block that doesn't have
2929  * any known deletable items.
2930  */
2931  Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2932  tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2933 
2934  for (int i = 0; i < ndeletable; i++)
2935  {
2936  ItemId itemid = PageGetItemId(page, deletable[i]);
2937  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2938 
2939  Assert(ItemIdIsDead(itemid));
2940 
2941  if (!BTreeTupleIsPosting(itup))
2942  {
2943  if (ntids + 1 > spacentids)
2944  {
2945  spacentids *= 2;
2946  tidblocks = (BlockNumber *)
2947  repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2948  }
2949 
2950  tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2951  }
2952  else
2953  {
2954  int nposting = BTreeTupleGetNPosting(itup);
2955 
2956  if (ntids + nposting > spacentids)
2957  {
2958  spacentids = Max(spacentids * 2, ntids + nposting);
2959  tidblocks = (BlockNumber *)
2960  repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2961  }
2962 
2963  for (int j = 0; j < nposting; j++)
2964  {
2965  ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2966 
2967  tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
2968  }
2969  }
2970  }
2971 
2972  qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
2973  *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
2974 
2975  return tidblocks;
2976 }
2977 
2978 /*
2979  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
2980  */
2981 static inline int
2982 _bt_blk_cmp(const void *arg1, const void *arg2)
2983 {
2984  BlockNumber b1 = *((BlockNumber *) arg1);
2985  BlockNumber b2 = *((BlockNumber *) arg2);
2986 
2987  if (b1 < b2)
2988  return -1;
2989  else if (b1 > b2)
2990  return 1;
2991 
2992  return 0;
2993 }
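/*
 * [Editor's illustrative aside -- not part of nbtinsert.c.]  A standalone
 * sketch of the sort-then-unique-ify step that _bt_deadblocks() performs on
 * its array of table block numbers, followed by the bsearch() lookup that
 * _bt_simpledel_pass() does against the result.  The comparator has the same
 * shape as _bt_blk_cmp(); the local unique_blocks() helper merely stands in
 * for PostgreSQL's qunique() from lib/qunique.h.  All names below are
 * hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned int FakeBlockNumber;

static int
blk_cmp(const void *arg1, const void *arg2)
{
	FakeBlockNumber b1 = *((const FakeBlockNumber *) arg1);
	FakeBlockNumber b2 = *((const FakeBlockNumber *) arg2);

	if (b1 < b2)
		return -1;
	if (b1 > b2)
		return 1;
	return 0;
}

/* Collapse adjacent duplicates in a sorted array; return the new length. */
static int
unique_blocks(FakeBlockNumber *blocks, int n)
{
	int			w = 0;

	for (int r = 0; r < n; r++)
		if (w == 0 || blocks[r] != blocks[w - 1])
			blocks[w++] = blocks[r];
	return w;
}

int
main(void)
{
	FakeBlockNumber blocks[] = {7, 3, 7, 1, 3, 3};
	int			n = 6;
	FakeBlockNumber key = 3;

	qsort(blocks, n, sizeof(FakeBlockNumber), blk_cmp);
	n = unique_blocks(blocks, n);	/* leaves 1 3 7 */

	if (bsearch(&key, blocks, n, sizeof(FakeBlockNumber), blk_cmp) != NULL)
		printf("block %u is among the dead-TID blocks\n", key);
	return 0;
}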
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:27
OffsetNumber newitemoff
Definition: nbtxlog.h:158
BTCycleId btpo_cycleid
Definition: nbtree.h:68
#define DEBUG2
Definition: elog.h:24
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:796
#define BTPageGetMeta(p)
Definition: nbtree.h:119
bool btm_allequalimage
Definition: nbtree.h:116
BlockNumber btpo_prev
Definition: nbtree.h:64
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:510
#define SnapshotSelf
Definition: snapmgr.h:67
OffsetNumber bts_offset
Definition: nbtree.h:709
uint32 btpo_level
Definition: nbtree.h:66
static char * buf
Definition: pg_test_fsync.c:68
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:447
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
int errdetail(const char *fmt,...)
Definition: elog.c:1038
#define MAX_RANDOM_VALUE
#define P_FIRSTKEY
Definition: nbtree.h:349
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:469
#define InvalidTransactionId
Definition: transam.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:491
#define P_LEFTMOST(opaque)
Definition: nbtree.h:217
unsigned int uint32
Definition: c.h:441
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:434
TransactionId xmax
Definition: snapshot.h:158
TransactionId xmin
Definition: snapshot.h:157
IndexTuple itup
Definition: nbtree.h:790
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:536
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:476
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:150
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:644
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:314
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:219
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:112
bool promising
Definition: tableam.h:199
#define BTREE_METAPAGE
Definition: nbtree.h:146
#define P_ISROOT(opaque)
Definition: nbtree.h:220
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:29
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:339
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:130
uint32 version
Definition: nbtxlog.h:51
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 btm_fastlevel
Definition: nbtree.h:108
bool anynullkeys
Definition: nbtree.h:767
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:330
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:422
Page PageGetTempPage(Page page)
Definition: bufpage.c:352
uintptr_t Datum
Definition: postgres.h:367
uint32 level
Definition: nbtxlog.h:156
OffsetNumber offnum
Definition: nbtxlog.h:82
struct IndexTupleData IndexTupleData
TM_IndexStatus * status
Definition: tableam.h:229
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:156
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:99
BlockNumber btm_root
Definition: nbtree.h:105
ItemPointerData tid
Definition: tableam.h:189
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4446
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:708
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:996
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:31
void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool checkingunique)
Definition: nbtdedup.c:54
#define ereport(elevel,...)
Definition: elog.h:155
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:518
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:466
int errmsg_internal(const char *fmt,...)
Definition: elog.c:992
#define Max(x, y)
Definition: c.h:980
bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
Definition: nbtdedup.c:299
uint32 speculativeToken
Definition: snapshot.h:193
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition: nbtinsert.c:2656
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:639
uint64 XLogRecPtr
Definition: xlogdefs.h:21
BTScanInsert itup_key
Definition: nbtree.h:792
#define Assert(condition)
Definition: c.h:804
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition: nbtinsert.c:2909
bool heapkeyspace
Definition: nbtree.h:765
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:32
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:619
#define INDEX_MAX_KEYS
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:549
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:540
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:114
#define MAXALIGN(LEN)
Definition: c.h:757
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
struct BTStackData * bts_parent
Definition: nbtree.h:710
#define RelationNeedsWAL(relation)
Definition: rel.h:563
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1070
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:28
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:2634
#define PageGetLSN(page)
Definition: bufpage.h:366
#define BTMaxItemSize(page)
Definition: nbtree.h:162
#define P_HIKEY
Definition: nbtree.h:348
static Datum values[MAXATTR]
Definition: bootstrap.c:165
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2674
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:405
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:905
uint32 fastlevel
Definition: nbtxlog.h:55
uint32 btm_level
Definition: nbtree.h:106
#define elog(elevel,...)
Definition: elog.h:227
uint32 level
Definition: nbtxlog.h:53
int16 freespace
Definition: tableam.h:200
int i
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1103
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2603
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:30
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:492
#define unlikely(x)
Definition: c.h:273
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1066
BlockNumber fastroot
Definition: nbtxlog.h:54
int bottomupfreespace
Definition: tableam.h:224
char * BuildIndexValueDescription(Relation indexRelation, Datum *values, bool *isnull)
Definition: genam.c:177
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
unsigned short t_info
Definition: itup.h:49
#define qsort(a, b, c, d)
Definition: port.h:504
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define BT_WRITE
Definition: nbtree.h:694
uint16 postingoff
Definition: nbtxlog.h:159
void XLogBeginInsert(void)
Definition: xloginsert.c:123
uint16 btpo_flags
Definition: nbtree.h:67
OffsetNumber firstrightoff
Definition: nbtxlog.h:157
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:218
#define SizeOfBtreeInsert
Definition: nbtxlog.h:88
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3167
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:101
Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2292
#define BTP_HAS_GARBAGE
Definition: nbtree.h:80
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:219