/*
 * Excerpt of PostgreSQL source code (git master): nbtinsert.c.
 */
/*-------------------------------------------------------------------------
 *
 * nbtinsert.c
 *	  Item insertion in Lehman and Yao btrees for Postgres.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtinsert.c
 *
 *-------------------------------------------------------------------------
 */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/transam.h"
21 #include "access/xloginsert.h"
22 #include "common/pg_prng.h"
23 #include "lib/qunique.h"
24 #include "miscadmin.h"
25 #include "storage/lmgr.h"
26 #include "storage/predicate.h"
27 #include "storage/smgr.h"
28 
/* Minimum tree height for application of fastpath optimization */
#define BTREE_FASTPATH_MIN_LEVEL	2
31 
32 
33 static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
35  Relation heapRel,
36  IndexUniqueCheck checkUnique, bool *is_unique,
37  uint32 *speculativeToken);
39  BTInsertState insertstate,
40  bool checkingunique,
41  bool indexUnchanged,
42  BTStack stack,
43  Relation heapRel);
44 static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
45 static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
46  Buffer buf,
47  Buffer cbuf,
48  BTStack stack,
49  IndexTuple itup,
50  Size itemsz,
51  OffsetNumber newitemoff,
52  int postingoff,
53  bool split_only_page);
54 static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
55  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
56  IndexTuple newitem, IndexTuple orignewitem,
57  IndexTuple nposting, uint16 postingoff);
58 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
59  BTStack stack, bool isroot, bool isonly);
60 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
61 static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
62  OffsetNumber itup_off, bool newfirstdataitem);
63 static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
64  BTInsertState insertstate,
65  bool simpleonly, bool checkingunique,
66  bool uniquedup, bool indexUnchanged);
67 static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
68  OffsetNumber *deletable, int ndeletable,
69  IndexTuple newitem, OffsetNumber minoff,
70  OffsetNumber maxoff);
71 static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
72  int ndeletable, IndexTuple newitem,
73  int *nblocks);
74 static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
75 
76 /*
77  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
78  *
79  * This routine is called by the public interface routine, btinsert.
80  * By here, itup is filled in, including the TID.
81  *
82  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
83  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
84  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
85  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
86  * don't actually insert.
87  *
88  * indexUnchanged executor hint indicates if itup is from an
89  * UPDATE that didn't logically change the indexed value, but
90  * must nevertheless have a new entry to point to a successor
91  * version.
92  *
93  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
94  * it must be true if the entry is known unique, else false.
95  * (In the current implementation we'll also return true after a
96  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
97  * that's just a coding artifact.)
98  */
99 bool
101  IndexUniqueCheck checkUnique, bool indexUnchanged,
102  Relation heapRel)
103 {
104  bool is_unique = false;
105  BTInsertStateData insertstate;
106  BTScanInsert itup_key;
107  BTStack stack;
108  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
109 
110  /* we need an insertion scan key to do our search, so build one */
111  itup_key = _bt_mkscankey(rel, itup);
112 
113  if (checkingunique)
114  {
115  if (!itup_key->anynullkeys)
116  {
117  /* No (heapkeyspace) scantid until uniqueness established */
118  itup_key->scantid = NULL;
119  }
120  else
121  {
122  /*
123  * Scan key for new tuple contains NULL key values. Bypass
124  * checkingunique steps. They are unnecessary because core code
125  * considers NULL unequal to every value, including NULL.
126  *
127  * This optimization avoids O(N^2) behavior within the
128  * _bt_findinsertloc() heapkeyspace path when a unique index has a
129  * large number of "duplicates" with NULL key values.
130  */
131  checkingunique = false;
132  /* Tuple is unique in the sense that core code cares about */
133  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
134  is_unique = true;
135  }
136  }
137 
138  /*
139  * Fill in the BTInsertState working area, to track the current page and
140  * position within the page to insert on.
141  *
142  * Note that itemsz is passed down to lower level code that deals with
143  * inserting the item. It must be MAXALIGN()'d. This ensures that space
144  * accounting code consistently considers the alignment overhead that we
145  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
146  * already conservative about alignment, but we don't rely on that from
147  * this distance. Besides, preserving the "true" tuple size in index
148  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
149  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
150  */
151  insertstate.itup = itup;
152  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
153  insertstate.itup_key = itup_key;
154  insertstate.bounds_valid = false;
155  insertstate.buf = InvalidBuffer;
156  insertstate.postingoff = 0;
157 
158 search:
159 
160  /*
161  * Find and lock the leaf page that the tuple should be added to by
162  * searching from the root page. insertstate.buf will hold a buffer that
163  * is locked in exclusive mode afterwards.
164  */
165  stack = _bt_search_insert(rel, &insertstate);
166 
167  /*
168  * checkingunique inserts are not allowed to go ahead when two tuples with
169  * equal key attribute values would be visible to new MVCC snapshots once
170  * the xact commits. Check for conflicts in the locked page/buffer (if
171  * needed) here.
172  *
173  * It might be necessary to check a page to the right in _bt_check_unique,
174  * though that should be very rare. In practice the first page the value
175  * could be on (with scantid omitted) is almost always also the only page
176  * that a matching tuple might be found on. This is due to the behavior
177  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
178  * only be allowed to cross a page boundary when there is no candidate
179  * leaf page split point that avoids it. Also, _bt_check_unique can use
180  * the leaf page high key to determine that there will be no duplicates on
181  * the right sibling without actually visiting it (it uses the high key in
182  * cases where the new item happens to belong at the far right of the leaf
183  * page).
184  *
185  * NOTE: obviously, _bt_check_unique can only detect keys that are already
186  * in the index; so it cannot defend against concurrent insertions of the
187  * same key. We protect against that by means of holding a write lock on
188  * the first page the value could be on, with omitted/-inf value for the
189  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
190  * the same key must acquire a write lock on the same page, so only one
191  * would-be inserter can be making the check at one time. Furthermore,
192  * once we are past the check we hold write locks continuously until we
193  * have performed our insertion, so no later inserter can fail to see our
194  * insertion. (This requires some care in _bt_findinsertloc.)
195  *
196  * If we must wait for another xact, we release the lock while waiting,
197  * and then must perform a new search.
198  *
199  * For a partial uniqueness check, we don't wait for the other xact. Just
200  * let the tuple in and return false for possibly non-unique, or true for
201  * definitely unique.
202  */
203  if (checkingunique)
204  {
205  TransactionId xwait;
206  uint32 speculativeToken;
207 
208  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
209  &is_unique, &speculativeToken);
210 
211  if (unlikely(TransactionIdIsValid(xwait)))
212  {
213  /* Have to wait for the other guy ... */
214  _bt_relbuf(rel, insertstate.buf);
215  insertstate.buf = InvalidBuffer;
216 
217  /*
218  * If it's a speculative insertion, wait for it to finish (ie. to
219  * go ahead with the insertion, or kill the tuple). Otherwise
220  * wait for the transaction to finish as usual.
221  */
222  if (speculativeToken)
223  SpeculativeInsertionWait(xwait, speculativeToken);
224  else
225  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
226 
227  /* start over... */
228  if (stack)
229  _bt_freestack(stack);
230  goto search;
231  }
232 
233  /* Uniqueness is established -- restore heap tid as scantid */
234  if (itup_key->heapkeyspace)
235  itup_key->scantid = &itup->t_tid;
236  }
237 
238  if (checkUnique != UNIQUE_CHECK_EXISTING)
239  {
240  OffsetNumber newitemoff;
241 
242  /*
243  * The only conflict predicate locking cares about for indexes is when
244  * an index tuple insert conflicts with an existing lock. We don't
245  * know the actual page we're going to insert on for sure just yet in
246  * checkingunique and !heapkeyspace cases, but it's okay to use the
247  * first page the value could be on (with scantid omitted) instead.
248  */
250 
251  /*
252  * Do the insertion. Note that insertstate contains cached binary
253  * search bounds established within _bt_check_unique when insertion is
254  * checkingunique.
255  */
256  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
257  indexUnchanged, stack, heapRel);
258  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
259  itup, insertstate.itemsz, newitemoff,
260  insertstate.postingoff, false);
261  }
262  else
263  {
264  /* just release the buffer */
265  _bt_relbuf(rel, insertstate.buf);
266  }
267 
268  /* be tidy */
269  if (stack)
270  _bt_freestack(stack);
271  pfree(itup_key);
272 
273  return is_unique;
274 }
275 
276 /*
277  * _bt_search_insert() -- _bt_search() wrapper for inserts
278  *
279  * Search the tree for a particular scankey, or more precisely for the first
280  * leaf page it could be on. Try to make use of the fastpath optimization's
281  * rightmost leaf page cache before actually searching the tree from the root
282  * page, though.
283  *
284  * Return value is a stack of parent-page pointers (though see notes about
285  * fastpath optimization and page splits below). insertstate->buf is set to
286  * the address of the leaf-page buffer, which is write-locked and pinned in
287  * all cases (if necessary by creating a new empty root page for caller).
288  *
289  * The fastpath optimization avoids most of the work of searching the tree
290  * repeatedly when a single backend inserts successive new tuples on the
291  * rightmost leaf page of an index. A backend cache of the rightmost leaf
292  * page is maintained within _bt_insertonpg(), and used here. The cache is
293  * invalidated here when an insert of a non-pivot tuple must take place on a
294  * non-rightmost leaf page.
295  *
296  * The optimization helps with indexes on an auto-incremented field. It also
297  * helps with indexes on datetime columns, as well as indexes with lots of
298  * NULL values. (NULLs usually get inserted in the rightmost page for single
299  * column indexes, since they usually get treated as coming after everything
300  * else in the key space. Individual NULL tuples will generally be placed on
301  * the rightmost leaf page due to the influence of the heap TID column.)
302  *
303  * Note that we avoid applying the optimization when there is insufficient
304  * space on the rightmost page to fit caller's new item. This is necessary
305  * because we'll need to return a real descent stack when a page split is
306  * expected (actually, caller can cope with a leaf page split that uses a NULL
307  * stack, but that's very slow and so must be avoided). Note also that the
308  * fastpath optimization acquires the lock on the page conditionally as a way
309  * of reducing extra contention when there are concurrent insertions into the
310  * rightmost page (we give up if we'd have to wait for the lock). We assume
311  * that it isn't useful to apply the optimization when there is contention,
312  * since each per-backend cache won't stay valid for long.
313  */
314 static BTStack
316 {
317  Assert(insertstate->buf == InvalidBuffer);
318  Assert(!insertstate->bounds_valid);
319  Assert(insertstate->postingoff == 0);
320 
322  {
323  /* Simulate a _bt_getbuf() call with conditional locking */
324  insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
325  if (_bt_conditionallockbuf(rel, insertstate->buf))
326  {
327  Page page;
328  BTPageOpaque opaque;
329 
330  _bt_checkpage(rel, insertstate->buf);
331  page = BufferGetPage(insertstate->buf);
332  opaque = BTPageGetOpaque(page);
333 
334  /*
335  * Check if the page is still the rightmost leaf page and has
336  * enough free space to accommodate the new tuple. Also check
337  * that the insertion scan key is strictly greater than the first
338  * non-pivot tuple on the page. (Note that we expect itup_key's
339  * scantid to be unset when our caller is a checkingunique
340  * inserter.)
341  */
342  if (P_RIGHTMOST(opaque) &&
343  P_ISLEAF(opaque) &&
344  !P_IGNORE(opaque) &&
345  PageGetFreeSpace(page) > insertstate->itemsz &&
346  PageGetMaxOffsetNumber(page) >= P_HIKEY &&
347  _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
348  {
349  /*
350  * Caller can use the fastpath optimization because cached
351  * block is still rightmost leaf page, which can fit caller's
352  * new tuple without splitting. Keep block in local cache for
353  * next insert, and have caller use NULL stack.
354  *
355  * Note that _bt_insert_parent() has an assertion that catches
356  * leaf page splits that somehow follow from a fastpath insert
357  * (it should only be passed a NULL stack when it must deal
358  * with a concurrent root page split, and never because a NULL
359  * stack was returned here).
360  */
361  return NULL;
362  }
363 
364  /* Page unsuitable for caller, drop lock and pin */
365  _bt_relbuf(rel, insertstate->buf);
366  }
367  else
368  {
369  /* Lock unavailable, drop pin */
370  ReleaseBuffer(insertstate->buf);
371  }
372 
373  /* Forget block, since cache doesn't appear to be useful */
375  }
376 
377  /* Cannot use optimization -- descend tree, return proper descent stack */
378  return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
379  NULL);
380 }
381 
382 /*
383  * _bt_check_unique() -- Check for violation of unique index constraint
384  *
385  * Returns InvalidTransactionId if there is no conflict, else an xact ID
386  * we must wait for to see if it commits a conflicting tuple. If an actual
387  * conflict is detected, no return --- just ereport(). If an xact ID is
388  * returned, and the conflicting tuple still has a speculative insertion in
389  * progress, *speculativeToken is set to non-zero, and the caller can wait for
390  * the verdict on the insertion using SpeculativeInsertionWait().
391  *
392  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
393  * InvalidTransactionId because we don't want to wait. In this case we
394  * set *is_unique to false if there is a potential conflict, and the
395  * core code must redo the uniqueness check later.
396  *
397  * As a side-effect, sets state in insertstate that can later be used by
398  * _bt_findinsertloc() to reuse most of the binary search work we do
399  * here.
400  *
401  * This code treats NULLs as equal, unlike the default semantics for unique
402  * indexes. So do not call here when there are NULL values in scan key and
403  * the index uses the default NULLS DISTINCT mode.
404  */
405 static TransactionId
407  IndexUniqueCheck checkUnique, bool *is_unique,
408  uint32 *speculativeToken)
409 {
410  IndexTuple itup = insertstate->itup;
411  IndexTuple curitup = NULL;
412  ItemId curitemid = NULL;
413  BTScanInsert itup_key = insertstate->itup_key;
414  SnapshotData SnapshotDirty;
415  OffsetNumber offset;
416  OffsetNumber maxoff;
417  Page page;
418  BTPageOpaque opaque;
419  Buffer nbuf = InvalidBuffer;
420  bool found = false;
421  bool inposting = false;
422  bool prevalldead = true;
423  int curposti = 0;
424 
425  /* Assume unique until we find a duplicate */
426  *is_unique = true;
427 
428  InitDirtySnapshot(SnapshotDirty);
429 
430  page = BufferGetPage(insertstate->buf);
431  opaque = BTPageGetOpaque(page);
432  maxoff = PageGetMaxOffsetNumber(page);
433 
434  /*
435  * Find the first tuple with the same key.
436  *
437  * This also saves the binary search bounds in insertstate. We use them
438  * in the fastpath below, but also in the _bt_findinsertloc() call later.
439  */
440  Assert(!insertstate->bounds_valid);
441  offset = _bt_binsrch_insert(rel, insertstate);
442 
443  /*
444  * Scan over all equal tuples, looking for live conflicts.
445  */
446  Assert(!insertstate->bounds_valid || insertstate->low == offset);
447  Assert(!itup_key->anynullkeys);
448  Assert(itup_key->scantid == NULL);
449  for (;;)
450  {
451  /*
452  * Each iteration of the loop processes one heap TID, not one index
453  * tuple. Current offset number for page isn't usually advanced on
454  * iterations that process heap TIDs from posting list tuples.
455  *
456  * "inposting" state is set when _inside_ a posting list --- not when
457  * we're at the start (or end) of a posting list. We advance curposti
458  * at the end of the iteration when inside a posting list tuple. In
459  * general, every loop iteration either advances the page offset or
460  * advances curposti --- an iteration that handles the rightmost/max
461  * heap TID in a posting list finally advances the page offset (and
462  * unsets "inposting").
463  *
464  * Make sure the offset points to an actual index tuple before trying
465  * to examine it...
466  */
467  if (offset <= maxoff)
468  {
469  /*
470  * Fastpath: In most cases, we can use cached search bounds to
471  * limit our consideration to items that are definitely
472  * duplicates. This fastpath doesn't apply when the original page
473  * is empty, or when initial offset is past the end of the
474  * original page, which may indicate that we need to examine a
475  * second or subsequent page.
476  *
477  * Note that this optimization allows us to avoid calling
478  * _bt_compare() directly when there are no duplicates, as long as
479  * the offset where the key will go is not at the end of the page.
480  */
481  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
482  {
483  Assert(insertstate->bounds_valid);
484  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
485  Assert(insertstate->low <= insertstate->stricthigh);
486  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
487  break;
488  }
489 
490  /*
491  * We can skip items that are already marked killed.
492  *
493  * In the presence of heavy update activity an index may contain
494  * many killed items with the same key; running _bt_compare() on
495  * each killed item gets expensive. Just advance over killed
496  * items as quickly as we can. We only apply _bt_compare() when
497  * we get to a non-killed item. We could reuse the bounds to
498  * avoid _bt_compare() calls for known equal tuples, but it
499  * doesn't seem worth it.
500  */
501  if (!inposting)
502  curitemid = PageGetItemId(page, offset);
503  if (inposting || !ItemIdIsDead(curitemid))
504  {
505  ItemPointerData htid;
506  bool all_dead = false;
507 
508  if (!inposting)
509  {
510  /* Plain tuple, or first TID in posting list tuple */
511  if (_bt_compare(rel, itup_key, page, offset) != 0)
512  break; /* we're past all the equal tuples */
513 
514  /* Advanced curitup */
515  curitup = (IndexTuple) PageGetItem(page, curitemid);
516  Assert(!BTreeTupleIsPivot(curitup));
517  }
518 
519  /* okay, we gotta fetch the heap tuple using htid ... */
520  if (!BTreeTupleIsPosting(curitup))
521  {
522  /* ... htid is from simple non-pivot tuple */
523  Assert(!inposting);
524  htid = curitup->t_tid;
525  }
526  else if (!inposting)
527  {
528  /* ... htid is first TID in new posting list */
529  inposting = true;
530  prevalldead = true;
531  curposti = 0;
532  htid = *BTreeTupleGetPostingN(curitup, 0);
533  }
534  else
535  {
536  /* ... htid is second or subsequent TID in posting list */
537  Assert(curposti > 0);
538  htid = *BTreeTupleGetPostingN(curitup, curposti);
539  }
540 
541  /*
542  * If we are doing a recheck, we expect to find the tuple we
543  * are rechecking. It's not a duplicate, but we have to keep
544  * scanning.
545  */
546  if (checkUnique == UNIQUE_CHECK_EXISTING &&
547  ItemPointerCompare(&htid, &itup->t_tid) == 0)
548  {
549  found = true;
550  }
551 
552  /*
553  * Check if there's any table tuples for this index entry
554  * satisfying SnapshotDirty. This is necessary because for AMs
555  * with optimizations like heap's HOT, we have just a single
556  * index entry for the entire chain.
557  */
558  else if (table_index_fetch_tuple_check(heapRel, &htid,
559  &SnapshotDirty,
560  &all_dead))
561  {
562  TransactionId xwait;
563 
564  /*
565  * It is a duplicate. If we are only doing a partial
566  * check, then don't bother checking if the tuple is being
567  * updated in another transaction. Just return the fact
568  * that it is a potential conflict and leave the full
569  * check till later. Don't invalidate binary search
570  * bounds.
571  */
572  if (checkUnique == UNIQUE_CHECK_PARTIAL)
573  {
574  if (nbuf != InvalidBuffer)
575  _bt_relbuf(rel, nbuf);
576  *is_unique = false;
577  return InvalidTransactionId;
578  }
579 
580  /*
581  * If this tuple is being updated by other transaction
582  * then we have to wait for its commit/abort.
583  */
584  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
585  SnapshotDirty.xmin : SnapshotDirty.xmax;
586 
587  if (TransactionIdIsValid(xwait))
588  {
589  if (nbuf != InvalidBuffer)
590  _bt_relbuf(rel, nbuf);
591  /* Tell _bt_doinsert to wait... */
592  *speculativeToken = SnapshotDirty.speculativeToken;
593  /* Caller releases lock on buf immediately */
594  insertstate->bounds_valid = false;
595  return xwait;
596  }
597 
598  /*
599  * Otherwise we have a definite conflict. But before
600  * complaining, look to see if the tuple we want to insert
601  * is itself now committed dead --- if so, don't complain.
602  * This is a waste of time in normal scenarios but we must
603  * do it to support CREATE INDEX CONCURRENTLY.
604  *
605  * We must follow HOT-chains here because during
606  * concurrent index build, we insert the root TID though
607  * the actual tuple may be somewhere in the HOT-chain.
608  * While following the chain we might not stop at the
609  * exact tuple which triggered the insert, but that's OK
610  * because if we find a live tuple anywhere in this chain,
611  * we have a unique key conflict. The other live tuple is
612  * not part of this chain because it had a different index
613  * entry.
614  */
615  htid = itup->t_tid;
616  if (table_index_fetch_tuple_check(heapRel, &htid,
617  SnapshotSelf, NULL))
618  {
619  /* Normal case --- it's still live */
620  }
621  else
622  {
623  /*
624  * It's been deleted, so no error, and no need to
625  * continue searching
626  */
627  break;
628  }
629 
630  /*
631  * Check for a conflict-in as we would if we were going to
632  * write to this page. We aren't actually going to write,
633  * but we want a chance to report SSI conflicts that would
634  * otherwise be masked by this unique constraint
635  * violation.
636  */
637  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
638 
639  /*
640  * This is a definite conflict. Break the tuple down into
641  * datums and report the error. But first, make sure we
642  * release the buffer locks we're holding ---
643  * BuildIndexValueDescription could make catalog accesses,
644  * which in the worst case might touch this same index and
645  * cause deadlocks.
646  */
647  if (nbuf != InvalidBuffer)
648  _bt_relbuf(rel, nbuf);
649  _bt_relbuf(rel, insertstate->buf);
650  insertstate->buf = InvalidBuffer;
651  insertstate->bounds_valid = false;
652 
653  {
655  bool isnull[INDEX_MAX_KEYS];
656  char *key_desc;
657 
659  values, isnull);
660 
661  key_desc = BuildIndexValueDescription(rel, values,
662  isnull);
663 
664  ereport(ERROR,
665  (errcode(ERRCODE_UNIQUE_VIOLATION),
666  errmsg("duplicate key value violates unique constraint \"%s\"",
668  key_desc ? errdetail("Key %s already exists.",
669  key_desc) : 0,
670  errtableconstraint(heapRel,
671  RelationGetRelationName(rel))));
672  }
673  }
674  else if (all_dead && (!inposting ||
675  (prevalldead &&
676  curposti == BTreeTupleGetNPosting(curitup) - 1)))
677  {
678  /*
679  * The conflicting tuple (or all HOT chains pointed to by
680  * all posting list TIDs) is dead to everyone, so mark the
681  * index entry killed.
682  */
683  ItemIdMarkDead(curitemid);
684  opaque->btpo_flags |= BTP_HAS_GARBAGE;
685 
686  /*
687  * Mark buffer with a dirty hint, since state is not
688  * crucial. Be sure to mark the proper buffer dirty.
689  */
690  if (nbuf != InvalidBuffer)
691  MarkBufferDirtyHint(nbuf, true);
692  else
693  MarkBufferDirtyHint(insertstate->buf, true);
694  }
695 
696  /*
697  * Remember if posting list tuple has even a single HOT chain
698  * whose members are not all dead
699  */
700  if (!all_dead && inposting)
701  prevalldead = false;
702  }
703  }
704 
705  if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
706  {
707  /* Advance to next TID in same posting list */
708  curposti++;
709  continue;
710  }
711  else if (offset < maxoff)
712  {
713  /* Advance to next tuple */
714  curposti = 0;
715  inposting = false;
716  offset = OffsetNumberNext(offset);
717  }
718  else
719  {
720  int highkeycmp;
721 
722  /* If scankey == hikey we gotta check the next page too */
723  if (P_RIGHTMOST(opaque))
724  break;
725  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
726  Assert(highkeycmp <= 0);
727  if (highkeycmp != 0)
728  break;
729  /* Advance to next non-dead page --- there must be one */
730  for (;;)
731  {
732  BlockNumber nblkno = opaque->btpo_next;
733 
734  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
735  page = BufferGetPage(nbuf);
736  opaque = BTPageGetOpaque(page);
737  if (!P_IGNORE(opaque))
738  break;
739  if (P_RIGHTMOST(opaque))
740  elog(ERROR, "fell off the end of index \"%s\"",
742  }
743  /* Will also advance to next tuple */
744  curposti = 0;
745  inposting = false;
746  maxoff = PageGetMaxOffsetNumber(page);
747  offset = P_FIRSTDATAKEY(opaque);
748  /* Don't invalidate binary search bounds */
749  }
750  }
751 
752  /*
753  * If we are doing a recheck then we should have found the tuple we are
754  * checking. Otherwise there's something very wrong --- probably, the
755  * index is on a non-immutable expression.
756  */
757  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
758  ereport(ERROR,
759  (errcode(ERRCODE_INTERNAL_ERROR),
760  errmsg("failed to re-find tuple within index \"%s\"",
762  errhint("This may be because of a non-immutable index expression."),
763  errtableconstraint(heapRel,
764  RelationGetRelationName(rel))));
765 
766  if (nbuf != InvalidBuffer)
767  _bt_relbuf(rel, nbuf);
768 
769  return InvalidTransactionId;
770 }
771 
772 
773 /*
774  * _bt_findinsertloc() -- Finds an insert location for a tuple
775  *
776  * On entry, insertstate buffer contains the page the new tuple belongs
777  * on. It is exclusive-locked and pinned by the caller.
778  *
779  * If 'checkingunique' is true, the buffer on entry is the first page
780  * that contains duplicates of the new key. If there are duplicates on
781  * multiple pages, the correct insertion position might be some page to
782  * the right, rather than the first page. In that case, this function
783  * moves right to the correct target page.
784  *
785  * (In a !heapkeyspace index, there can be multiple pages with the same
786  * high key, where the new tuple could legitimately be placed on. In
787  * that case, the caller passes the first page containing duplicates,
788  * just like when checkingunique=true. If that page doesn't have enough
789  * room for the new tuple, this function moves right, trying to find a
790  * legal page that does.)
791  *
792  * If 'indexUnchanged' is true, this is for an UPDATE that didn't
793  * logically change the indexed value, but must nevertheless have a new
794  * entry to point to a successor version. This hint from the executor
795  * will influence our behavior when the page might have to be split and
796  * we must consider our options. Bottom-up index deletion can avoid
797  * pathological version-driven page splits, but we only want to go to the
798  * trouble of trying it when we already have moderate confidence that
799  * it's appropriate. The hint should not significantly affect our
800  * behavior over time unless practically all inserts on to the leaf page
801  * get the hint.
802  *
803  * On exit, insertstate buffer contains the chosen insertion page, and
804  * the offset within that page is returned. If _bt_findinsertloc needed
805  * to move right, the lock and pin on the original page are released, and
806  * the new buffer is exclusively locked and pinned instead.
807  *
808  * If insertstate contains cached binary search bounds, we will take
809  * advantage of them. This avoids repeating comparisons that we made in
810  * _bt_check_unique() already.
811  */
812 static OffsetNumber
814  BTInsertState insertstate,
815  bool checkingunique,
816  bool indexUnchanged,
817  BTStack stack,
818  Relation heapRel)
819 {
820  BTScanInsert itup_key = insertstate->itup_key;
821  Page page = BufferGetPage(insertstate->buf);
822  BTPageOpaque opaque;
823  OffsetNumber newitemoff;
824 
825  opaque = BTPageGetOpaque(page);
826 
827  /* Check 1/3 of a page restriction */
828  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
829  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
830  insertstate->itup);
831 
832  Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
833  Assert(!insertstate->bounds_valid || checkingunique);
834  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
835  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
836  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
837 
838  if (itup_key->heapkeyspace)
839  {
840  /* Keep track of whether checkingunique duplicate seen */
841  bool uniquedup = indexUnchanged;
842 
843  /*
844  * If we're inserting into a unique index, we may have to walk right
845  * through leaf pages to find the one leaf page that we must insert on
846  * to.
847  *
848  * This is needed for checkingunique callers because a scantid was not
849  * used when we called _bt_search(). scantid can only be set after
850  * _bt_check_unique() has checked for duplicates. The buffer
851  * initially stored in insertstate->buf has the page where the first
852  * duplicate key might be found, which isn't always the page that new
853  * tuple belongs on. The heap TID attribute for new tuple (scantid)
854  * could force us to insert on a sibling page, though that should be
855  * very rare in practice.
856  */
857  if (checkingunique)
858  {
859  if (insertstate->low < insertstate->stricthigh)
860  {
861  /* Encountered a duplicate in _bt_check_unique() */
862  Assert(insertstate->bounds_valid);
863  uniquedup = true;
864  }
865 
866  for (;;)
867  {
868  /*
869  * Does the new tuple belong on this page?
870  *
871  * The earlier _bt_check_unique() call may well have
872  * established a strict upper bound on the offset for the new
873  * item. If it's not the last item of the page (i.e. if there
874  * is at least one tuple on the page that goes after the tuple
875  * we're inserting) then we know that the tuple belongs on
876  * this page. We can skip the high key check.
877  */
878  if (insertstate->bounds_valid &&
879  insertstate->low <= insertstate->stricthigh &&
880  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
881  break;
882 
883  /* Test '<=', not '!=', since scantid is set now */
884  if (P_RIGHTMOST(opaque) ||
885  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
886  break;
887 
888  _bt_stepright(rel, insertstate, stack);
889  /* Update local state after stepping right */
890  page = BufferGetPage(insertstate->buf);
891  opaque = BTPageGetOpaque(page);
892  /* Assume duplicates (if checkingunique) */
893  uniquedup = true;
894  }
895  }
896 
897  /*
898  * If the target page cannot fit newitem, try to avoid splitting the
899  * page on insert by performing deletion or deduplication now
900  */
901  if (PageGetFreeSpace(page) < insertstate->itemsz)
902  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
903  checkingunique, uniquedup,
904  indexUnchanged);
905  }
906  else
907  {
908  /*----------
909  * This is a !heapkeyspace (version 2 or 3) index. The current page
910  * is the first page that we could insert the new tuple to, but there
911  * may be other pages to the right that we could opt to use instead.
912  *
913  * If the new key is equal to one or more existing keys, we can
914  * legitimately place it anywhere in the series of equal keys. In
915  * fact, if the new key is equal to the page's "high key" we can place
916  * it on the next page. If it is equal to the high key, and there's
917  * not room to insert the new tuple on the current page without
918  * splitting, then we move right hoping to find more free space and
919  * avoid a split.
920  *
921  * Keep scanning right until we
922  * (a) find a page with enough free space,
923  * (b) reach the last page where the tuple can legally go, or
924  * (c) get tired of searching.
925  * (c) is not flippant; it is important because if there are many
926  * pages' worth of equal keys, it's better to split one of the early
927  * pages than to scan all the way to the end of the run of equal keys
928  * on every insert. We implement "get tired" as a random choice,
929  * since stopping after scanning a fixed number of pages wouldn't work
930  * well (we'd never reach the right-hand side of previously split
931  * pages). The probability of moving right is set at 0.99, which may
932  * seem too high to change the behavior much, but it does an excellent
933  * job of preventing O(N^2) behavior with many equal keys.
934  *----------
935  */
936  while (PageGetFreeSpace(page) < insertstate->itemsz)
937  {
938  /*
939  * Before considering moving right, see if we can obtain enough
940  * space by erasing LP_DEAD items
941  */
942  if (P_HAS_GARBAGE(opaque))
943  {
944  /* Perform simple deletion */
945  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
946  false, false, false);
947 
948  if (PageGetFreeSpace(page) >= insertstate->itemsz)
949  break; /* OK, now we have enough space */
950  }
951 
952  /*
953  * Nope, so check conditions (b) and (c) enumerated above
954  *
955  * The earlier _bt_check_unique() call may well have established a
956  * strict upper bound on the offset for the new item. If it's not
957  * the last item of the page (i.e. if there is at least one tuple
958  * on the page that's greater than the tuple we're inserting to)
959  * then we know that the tuple belongs on this page. We can skip
960  * the high key check.
961  */
962  if (insertstate->bounds_valid &&
963  insertstate->low <= insertstate->stricthigh &&
964  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
965  break;
966 
967  if (P_RIGHTMOST(opaque) ||
968  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
970  break;
971 
972  _bt_stepright(rel, insertstate, stack);
973  /* Update local state after stepping right */
974  page = BufferGetPage(insertstate->buf);
975  opaque = BTPageGetOpaque(page);
976  }
977  }
978 
979  /*
980  * We should now be on the correct page. Find the offset within the page
981  * for the new tuple. (Possibly reusing earlier search bounds.)
982  */
983  Assert(P_RIGHTMOST(opaque) ||
984  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
985 
986  newitemoff = _bt_binsrch_insert(rel, insertstate);
987 
988  if (insertstate->postingoff == -1)
989  {
990  /*
991  * There is an overlapping posting list tuple with its LP_DEAD bit
992  * set. We don't want to unnecessarily unset its LP_DEAD bit while
993  * performing a posting list split, so perform simple index tuple
994  * deletion early.
995  */
996  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
997  false, false, false);
998 
999  /*
1000  * Do new binary search. New insert location cannot overlap with any
1001  * posting list now.
1002  */
1003  Assert(!insertstate->bounds_valid);
1004  insertstate->postingoff = 0;
1005  newitemoff = _bt_binsrch_insert(rel, insertstate);
1006  Assert(insertstate->postingoff == 0);
1007  }
1008 
1009  return newitemoff;
1010 }
1011 
1012 /*
1013  * Step right to next non-dead page, during insertion.
1014  *
1015  * This is a bit more complicated than moving right in a search. We must
1016  * write-lock the target page before releasing write lock on current page;
1017  * else someone else's _bt_check_unique scan could fail to see our insertion.
1018  * Write locks on intermediate dead pages won't do because we don't know when
1019  * they will get de-linked from the tree.
1020  *
1021  * This is more aggressive than it needs to be for non-unique !heapkeyspace
1022  * indexes.
1023  */
1024 static void
1026 {
1027  Page page;
1028  BTPageOpaque opaque;
1029  Buffer rbuf;
1030  BlockNumber rblkno;
1031 
1032  page = BufferGetPage(insertstate->buf);
1033  opaque = BTPageGetOpaque(page);
1034 
1035  rbuf = InvalidBuffer;
1036  rblkno = opaque->btpo_next;
1037  for (;;)
1038  {
1039  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1040  page = BufferGetPage(rbuf);
1041  opaque = BTPageGetOpaque(page);
1042 
1043  /*
1044  * If this page was incompletely split, finish the split now. We do
1045  * this while holding a lock on the left sibling, which is not good
1046  * because finishing the split could be a fairly lengthy operation.
1047  * But this should happen very seldom.
1048  */
1049  if (P_INCOMPLETE_SPLIT(opaque))
1050  {
1051  _bt_finish_split(rel, rbuf, stack);
1052  rbuf = InvalidBuffer;
1053  continue;
1054  }
1055 
1056  if (!P_IGNORE(opaque))
1057  break;
1058  if (P_RIGHTMOST(opaque))
1059  elog(ERROR, "fell off the end of index \"%s\"",
1061 
1062  rblkno = opaque->btpo_next;
1063  }
1064  /* rbuf locked; unlock buf, update state for caller */
1065  _bt_relbuf(rel, insertstate->buf);
1066  insertstate->buf = rbuf;
1067  insertstate->bounds_valid = false;
1068 }
1069 
1070 /*----------
1071  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1072  *
1073  * This recursive procedure does the following things:
1074  *
1075  * + if postingoff != 0, splits existing posting list tuple
1076  * (since it overlaps with new 'itup' tuple).
1077  * + if necessary, splits the target page, using 'itup_key' for
1078  * suffix truncation on leaf pages (caller passes NULL for
1079  * non-leaf pages).
1080  * + inserts the new tuple (might be split from posting list).
1081  * + if the page was split, pops the parent stack, and finds the
1082  * right place to insert the new child pointer (by walking
1083  * right using information stored in the parent stack).
1084  * + invokes itself with the appropriate tuple for the right
1085  * child page on the parent.
1086  * + updates the metapage if a true root or fast root is split.
1087  *
1088  * On entry, we must have the correct buffer in which to do the
1089  * insertion, and the buffer must be pinned and write-locked. On return,
1090  * we will have dropped both the pin and the lock on the buffer.
1091  *
1092  * This routine only performs retail tuple insertions. 'itup' should
1093  * always be either a non-highkey leaf item, or a downlink (new high
1094  * key items are created indirectly, when a page is split). When
1095  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1096  * we're inserting the downlink for. This function will clear the
1097  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1098  *----------
1099  */
1100 static void
1102  BTScanInsert itup_key,
1103  Buffer buf,
1104  Buffer cbuf,
1105  BTStack stack,
1106  IndexTuple itup,
1107  Size itemsz,
1108  OffsetNumber newitemoff,
1109  int postingoff,
1110  bool split_only_page)
1111 {
1112  Page page;
1113  BTPageOpaque opaque;
1114  bool isleaf,
1115  isroot,
1116  isrightmost,
1117  isonly;
1118  IndexTuple oposting = NULL;
1119  IndexTuple origitup = NULL;
1120  IndexTuple nposting = NULL;
1121 
1122  page = BufferGetPage(buf);
1123  opaque = BTPageGetOpaque(page);
1124  isleaf = P_ISLEAF(opaque);
1125  isroot = P_ISROOT(opaque);
1126  isrightmost = P_RIGHTMOST(opaque);
1127  isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1128 
1129  /* child buffer must be given iff inserting on an internal page */
1130  Assert(isleaf == !BufferIsValid(cbuf));
1131  /* tuple must have appropriate number of attributes */
1132  Assert(!isleaf ||
1133  BTreeTupleGetNAtts(itup, rel) ==
1135  Assert(isleaf ||
1136  BTreeTupleGetNAtts(itup, rel) <=
1138  Assert(!BTreeTupleIsPosting(itup));
1139  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1140  /* Caller must always finish incomplete split for us */
1141  Assert(!P_INCOMPLETE_SPLIT(opaque));
1142 
1143  /*
1144  * Every internal page should have exactly one negative infinity item at
1145  * all times. Only _bt_split() and _bt_newroot() should add items that
1146  * become negative infinity items through truncation, since they're the
1147  * only routines that allocate new internal pages.
1148  */
1149  Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1150 
1151  /*
1152  * Do we need to split an existing posting list item?
1153  */
1154  if (postingoff != 0)
1155  {
1156  ItemId itemid = PageGetItemId(page, newitemoff);
1157 
1158  /*
1159  * The new tuple is a duplicate with a heap TID that falls inside the
1160  * range of an existing posting list tuple on a leaf page. Prepare to
1161  * split an existing posting list. Overwriting the posting list with
1162  * its post-split version is treated as an extra step in either the
1163  * insert or page split critical section.
1164  */
1165  Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1166  oposting = (IndexTuple) PageGetItem(page, itemid);
1167 
1168  /*
1169  * postingoff value comes from earlier call to _bt_binsrch_posting().
1170  * Its binary search might think that a plain tuple must be a posting
1171  * list tuple that needs to be split. This can happen with corruption
1172  * involving an existing plain tuple that is a duplicate of the new
1173  * item, up to and including its table TID. Check for that here in
1174  * passing.
1175  *
1176  * Also verify that our caller has made sure that the existing posting
1177  * list tuple does not have its LP_DEAD bit set.
1178  */
1179  if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1180  ereport(ERROR,
1181  (errcode(ERRCODE_INDEX_CORRUPTED),
1182  errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1185  newitemoff, BufferGetBlockNumber(buf),
1186  RelationGetRelationName(rel))));
1187 
1188  /* use a mutable copy of itup as our itup from here on */
1189  origitup = itup;
1190  itup = CopyIndexTuple(origitup);
1191  nposting = _bt_swap_posting(itup, oposting, postingoff);
1192  /* itup now contains rightmost/max TID from oposting */
1193 
1194  /* Alter offset so that newitem goes after posting list */
1195  newitemoff = OffsetNumberNext(newitemoff);
1196  }
1197 
1198  /*
1199  * Do we need to split the page to fit the item on it?
1200  *
1201  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1202  * so this comparison is correct even though we appear to be accounting
1203  * only for the item and not for its line pointer.
1204  */
1205  if (PageGetFreeSpace(page) < itemsz)
1206  {
1207  Buffer rbuf;
1208 
1209  Assert(!split_only_page);
1210 
1211  /* split the buffer into left and right halves */
1212  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
1213  origitup, nposting, postingoff);
1216  BufferGetBlockNumber(rbuf));
1217 
1218  /*----------
1219  * By here,
1220  *
1221  * + our target page has been split;
1222  * + the original tuple has been inserted;
1223  * + we have write locks on both the old (left half)
1224  * and new (right half) buffers, after the split; and
1225  * + we know the key we want to insert into the parent
1226  * (it's the "high key" on the left child page).
1227  *
1228  * We're ready to do the parent insertion. We need to hold onto the
1229  * locks for the child pages until we locate the parent, but we can
1230  * at least release the lock on the right child before doing the
1231  * actual insertion. The lock on the left child will be released
1232  * last of all by parent insertion, where it is the 'cbuf' of parent
1233  * page.
1234  *----------
1235  */
1236  _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
1237  }
1238  else
1239  {
1240  Buffer metabuf = InvalidBuffer;
1241  Page metapg = NULL;
1242  BTMetaPageData *metad = NULL;
1243  BlockNumber blockcache;
1244 
1245  /*
1246  * If we are doing this insert because we split a page that was the
1247  * only one on its tree level, but was not the root, it may have been
1248  * the "fast root". We need to ensure that the fast root link points
1249  * at or above the current page. We can safely acquire a lock on the
1250  * metapage here --- see comments for _bt_newroot().
1251  */
1252  if (unlikely(split_only_page))
1253  {
1254  Assert(!isleaf);
1255  Assert(BufferIsValid(cbuf));
1256 
1257  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1258  metapg = BufferGetPage(metabuf);
1259  metad = BTPageGetMeta(metapg);
1260 
1261  if (metad->btm_fastlevel >= opaque->btpo_level)
1262  {
1263  /* no update wanted */
1264  _bt_relbuf(rel, metabuf);
1265  metabuf = InvalidBuffer;
1266  }
1267  }
1268 
1269  /* Do the update. No ereport(ERROR) until changes are logged */
1271 
1272  if (postingoff != 0)
1273  memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1274 
1275  if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1276  false) == InvalidOffsetNumber)
1277  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1279 
1281 
1282  if (BufferIsValid(metabuf))
1283  {
1284  /* upgrade meta-page if needed */
1285  if (metad->btm_version < BTREE_NOVAC_VERSION)
1286  _bt_upgrademetapage(metapg);
1288  metad->btm_fastlevel = opaque->btpo_level;
1289  MarkBufferDirty(metabuf);
1290  }
1291 
1292  /*
1293  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1294  * finishes a split
1295  */
1296  if (!isleaf)
1297  {
1298  Page cpage = BufferGetPage(cbuf);
1299  BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1300 
1301  Assert(P_INCOMPLETE_SPLIT(cpageop));
1302  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1303  MarkBufferDirty(cbuf);
1304  }
1305 
1306  /* XLOG stuff */
1307  if (RelationNeedsWAL(rel))
1308  {
1309  xl_btree_insert xlrec;
1310  xl_btree_metadata xlmeta;
1311  uint8 xlinfo;
1312  XLogRecPtr recptr;
1313  uint16 upostingoff;
1314 
1315  xlrec.offnum = newitemoff;
1316 
1317  XLogBeginInsert();
1318  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1319 
1320  if (isleaf && postingoff == 0)
1321  {
1322  /* Simple leaf insert */
1323  xlinfo = XLOG_BTREE_INSERT_LEAF;
1324  }
1325  else if (postingoff != 0)
1326  {
1327  /*
1328  * Leaf insert with posting list split. Must include
1329  * postingoff field before newitem/orignewitem.
1330  */
1331  Assert(isleaf);
1332  xlinfo = XLOG_BTREE_INSERT_POST;
1333  }
1334  else
1335  {
1336  /* Internal page insert, which finishes a split on cbuf */
1337  xlinfo = XLOG_BTREE_INSERT_UPPER;
1339 
1340  if (BufferIsValid(metabuf))
1341  {
1342  /* Actually, it's an internal page insert + meta update */
1343  xlinfo = XLOG_BTREE_INSERT_META;
1344 
1346  xlmeta.version = metad->btm_version;
1347  xlmeta.root = metad->btm_root;
1348  xlmeta.level = metad->btm_level;
1349  xlmeta.fastroot = metad->btm_fastroot;
1350  xlmeta.fastlevel = metad->btm_fastlevel;
1352  xlmeta.allequalimage = metad->btm_allequalimage;
1353 
1354  XLogRegisterBuffer(2, metabuf,
1356  XLogRegisterBufData(2, (char *) &xlmeta,
1357  sizeof(xl_btree_metadata));
1358  }
1359  }
1360 
1362  if (postingoff == 0)
1363  {
1364  /* Just log itup from caller */
1365  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1366  }
1367  else
1368  {
1369  /*
1370  * Insert with posting list split (XLOG_BTREE_INSERT_POST
1371  * record) case.
1372  *
1373  * Log postingoff. Also log origitup, not itup. REDO routine
1374  * must reconstruct final itup (as well as nposting) using
1375  * _bt_swap_posting().
1376  */
1377  upostingoff = postingoff;
1378 
1379  XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1380  XLogRegisterBufData(0, (char *) origitup,
1381  IndexTupleSize(origitup));
1382  }
1383 
1384  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1385 
1386  if (BufferIsValid(metabuf))
1387  PageSetLSN(metapg, recptr);
1388  if (!isleaf)
1389  PageSetLSN(BufferGetPage(cbuf), recptr);
1390 
1391  PageSetLSN(page, recptr);
1392  }
1393 
1394  END_CRIT_SECTION();
1395 
1396  /* Release subsidiary buffers */
1397  if (BufferIsValid(metabuf))
1398  _bt_relbuf(rel, metabuf);
1399  if (!isleaf)
1400  _bt_relbuf(rel, cbuf);
1401 
1402  /*
1403  * Cache the block number if this is the rightmost leaf page. Cache
1404  * may be used by a future inserter within _bt_search_insert().
1405  */
1406  blockcache = InvalidBlockNumber;
1407  if (isrightmost && isleaf && !isroot)
1408  blockcache = BufferGetBlockNumber(buf);
1409 
1410  /* Release buffer for insertion target block */
1411  _bt_relbuf(rel, buf);
1412 
1413  /*
1414  * If we decided to cache the insertion target block before releasing
1415  * its buffer lock, then cache it now. Check the height of the tree
1416  * first, though. We don't go for the optimization with small
1417  * indexes. Defer final check to this point to ensure that we don't
1418  * call _bt_getrootheight while holding a buffer lock.
1419  */
1420  if (BlockNumberIsValid(blockcache) &&
1422  RelationSetTargetBlock(rel, blockcache);
1423  }
1424 
1425  /* be tidy */
1426  if (postingoff != 0)
1427  {
1428  /* itup is actually a modified copy of caller's original */
1429  pfree(nposting);
1430  pfree(itup);
1431  }
1432 }
1433 
1434 /*
1435  * _bt_split() -- split a page in the btree.
1436  *
1437  * On entry, buf is the page to split, and is pinned and write-locked.
1438  * newitemoff etc. tell us about the new item that must be inserted
1439  * along with the data from the original page.
1440  *
1441  * itup_key is used for suffix truncation on leaf pages (internal
1442  * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1443  * is the left-sibling of the page we're inserting the downlink for.
1444  * This function will clear the INCOMPLETE_SPLIT flag on it, and
1445  * release the buffer.
1446  *
1447  * orignewitem, nposting, and postingoff are needed when an insert of
1448  * orignewitem results in both a posting list split and a page split.
1449  * These extra posting list split details are used here in the same
1450  * way as they are used in the more common case where a posting list
1451  * split does not coincide with a page split. We need to deal with
1452  * posting list splits directly in order to ensure that everything
1453  * that follows from the insert of orignewitem is handled as a single
1454  * atomic operation (though caller's insert of a new pivot/downlink
1455  * into parent page will still be a separate operation). See
1456  * nbtree/README for details on the design of posting list splits.
1457  *
1458  * Returns the new right sibling of buf, pinned and write-locked.
1459  * The pin and lock on buf are maintained.
1460  */
1461 static Buffer
1463  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1464  IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1465 {
1466  Buffer rbuf;
1467  Page origpage;
1468  Page leftpage,
1469  rightpage;
1470  BlockNumber origpagenumber,
1471  rightpagenumber;
1472  BTPageOpaque ropaque,
1473  lopaque,
1474  oopaque;
1475  Buffer sbuf = InvalidBuffer;
1476  Page spage = NULL;
1477  BTPageOpaque sopaque = NULL;
1478  Size itemsz;
1479  ItemId itemid;
1480  IndexTuple firstright,
1481  lefthighkey;
1482  OffsetNumber firstrightoff;
1483  OffsetNumber afterleftoff,
1484  afterrightoff,
1485  minusinfoff;
1486  OffsetNumber origpagepostingoff;
1487  OffsetNumber maxoff;
1488  OffsetNumber i;
1489  bool newitemonleft,
1490  isleaf,
1491  isrightmost;
1492 
1493  /*
1494  * origpage is the original page to be split. leftpage is a temporary
1495  * buffer that receives the left-sibling data, which will be copied back
1496  * into origpage on success. rightpage is the new page that will receive
1497  * the right-sibling data.
1498  *
1499  * leftpage is allocated after choosing a split point. rightpage's new
1500  * buffer isn't acquired until after leftpage is initialized and has new
1501  * high key, the last point where splitting the page may fail (barring
1502  * corruption). Failing before acquiring new buffer won't have lasting
1503  * consequences, since origpage won't have been modified and leftpage is
1504  * only workspace.
1505  */
1506  origpage = BufferGetPage(buf);
1507  oopaque = BTPageGetOpaque(origpage);
1508  isleaf = P_ISLEAF(oopaque);
1509  isrightmost = P_RIGHTMOST(oopaque);
1510  maxoff = PageGetMaxOffsetNumber(origpage);
1511  origpagenumber = BufferGetBlockNumber(buf);
1512 
1513  /*
1514  * Choose a point to split origpage at.
1515  *
1516  * A split point can be thought of as a point _between_ two existing data
1517  * items on origpage (the lastleft and firstright tuples), provided you
1518  * pretend that the new item that didn't fit is already on origpage.
1519  *
1520  * Since origpage does not actually contain newitem, the representation of
1521  * split points needs to work with two boundary cases: splits where
1522  * newitem is lastleft, and splits where newitem is firstright.
1523  * newitemonleft resolves the ambiguity that would otherwise exist when
1524  * newitemoff == firstrightoff. In all other cases it's clear which side
1525  * of the split every tuple goes on from context. newitemonleft is
1526  * usually (but not always) redundant information.
1527  *
1528  * firstrightoff is supposed to be an origpage offset number, but it's
1529  * possible that its value will be maxoff+1, which is "past the end" of
1530  * origpage. This happens in the rare case where newitem goes after all
1531  * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1532  * origpage at the point that leaves newitem alone on new right page. Any
1533  * "!newitemonleft && newitemoff == firstrightoff" split point makes
1534  * newitem the firstright tuple, though, so this case isn't a special
1535  * case.
1536  */
1537  firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1538  newitem, &newitemonleft);
1539 
1540  /* Allocate temp buffer for leftpage */
1541  leftpage = PageGetTempPage(origpage);
1542  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1543  lopaque = BTPageGetOpaque(leftpage);
1544 
1545  /*
1546  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1547  * and HAS_GARBAGE flags.
1548  */
1549  lopaque->btpo_flags = oopaque->btpo_flags;
1550  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1551  /* set flag in leftpage indicating that rightpage has no downlink yet */
1552  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1553  lopaque->btpo_prev = oopaque->btpo_prev;
1554  /* handle btpo_next after rightpage buffer acquired */
1555  lopaque->btpo_level = oopaque->btpo_level;
1556  /* handle btpo_cycleid after rightpage buffer acquired */
1557 
1558  /*
1559  * Copy the original page's LSN into leftpage, which will become the
1560  * updated version of the page. We need this because XLogInsert will
1561  * examine the LSN and possibly dump it in a page image.
1562  */
1563  PageSetLSN(leftpage, PageGetLSN(origpage));
1564 
1565  /*
1566  * Determine page offset number of existing overlapped-with-orignewitem
1567  * posting list when it is necessary to perform a posting list split in
1568  * passing. Note that newitem was already changed by caller (newitem no
1569  * longer has the orignewitem TID).
1570  *
1571  * This page offset number (origpagepostingoff) will be used to pretend
1572  * that the posting split has already taken place, even though the
1573  * required modifications to origpage won't occur until we reach the
1574  * critical section. The lastleft and firstright tuples of our page split
1575  * point should, in effect, come from an imaginary version of origpage
1576  * that has the nposting tuple instead of the original posting list tuple.
1577  *
1578  * Note: _bt_findsplitloc() should have compensated for coinciding posting
1579  * list splits in just the same way, at least in theory. It doesn't
1580  * bother with that, though. In practice it won't affect its choice of
1581  * split point.
1582  */
1583  origpagepostingoff = InvalidOffsetNumber;
1584  if (postingoff != 0)
1585  {
1586  Assert(isleaf);
1587  Assert(ItemPointerCompare(&orignewitem->t_tid,
1588  &newitem->t_tid) < 0);
1589  Assert(BTreeTupleIsPosting(nposting));
1590  origpagepostingoff = OffsetNumberPrev(newitemoff);
1591  }
1592 
1593  /*
1594  * The high key for the new left page is a possibly-truncated copy of
1595  * firstright on the leaf level (it's "firstright itself" on internal
1596  * pages; see !isleaf comments below). This may seem to be contrary to
1597  * Lehman & Yao's approach of using a copy of lastleft as the new high key
1598  * when splitting on the leaf level. It isn't, though.
1599  *
1600  * Suffix truncation will leave the left page's high key fully equal to
1601  * lastleft when lastleft and firstright are equal prior to heap TID (that
1602  * is, the tiebreaker TID value comes from lastleft). It isn't actually
1603  * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1604  * "subtree" invariant to hold. It's sufficient to make sure that the new
1605  * leaf high key is strictly less than firstright, and greater than or
1606  * equal to (not necessarily equal to) lastleft. In other words, when
1607  * suffix truncation isn't possible during a leaf page split, we take
1608  * L&Y's exact approach to generating a new high key for the left page.
1609  * (Actually, that is slightly inaccurate. We don't just use a copy of
1610  * lastleft. A tuple with all the keys from firstright but the max heap
1611  * TID from lastleft is used, to avoid introducing a special case.)
1612  */
1613  if (!newitemonleft && newitemoff == firstrightoff)
1614  {
1615  /* incoming tuple becomes firstright */
1616  itemsz = newitemsz;
1617  firstright = newitem;
1618  }
1619  else
1620  {
1621  /* existing item at firstrightoff becomes firstright */
1622  itemid = PageGetItemId(origpage, firstrightoff);
1623  itemsz = ItemIdGetLength(itemid);
1624  firstright = (IndexTuple) PageGetItem(origpage, itemid);
1625  if (firstrightoff == origpagepostingoff)
1626  firstright = nposting;
1627  }
1628 
1629  if (isleaf)
1630  {
1631  IndexTuple lastleft;
1632 
1633  /* Attempt suffix truncation for leaf page splits */
1634  if (newitemonleft && newitemoff == firstrightoff)
1635  {
1636  /* incoming tuple becomes lastleft */
1637  lastleft = newitem;
1638  }
1639  else
1640  {
1641  OffsetNumber lastleftoff;
1642 
1643  /* existing item before firstrightoff becomes lastleft */
1644  lastleftoff = OffsetNumberPrev(firstrightoff);
1645  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1646  itemid = PageGetItemId(origpage, lastleftoff);
1647  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1648  if (lastleftoff == origpagepostingoff)
1649  lastleft = nposting;
1650  }
1651 
1652  lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1653  itemsz = IndexTupleSize(lefthighkey);
1654  }
1655  else
1656  {
1657  /*
1658  * Don't perform suffix truncation on a copy of firstright to make
1659  * left page high key for internal page splits. Must use firstright
1660  * as new high key directly.
1661  *
1662  * Each distinct separator key value originates as a leaf level high
1663  * key; all other separator keys/pivot tuples are copied from one
1664  * level down. A separator key in a grandparent page must be
1665  * identical to high key in rightmost parent page of the subtree to
1666  * its left, which must itself be identical to high key in rightmost
1667  * child page of that same subtree (this even applies to separator
1668  * from grandparent's high key). There must always be an unbroken
1669  * "seam" of identical separator keys that guide index scans at every
1670  * level, starting from the grandparent. That's why suffix truncation
1671  * is unsafe here.
1672  *
1673  * Internal page splits will truncate firstright into a "negative
1674  * infinity" data item when it gets inserted on the new right page
1675  * below, though. This happens during the call to _bt_pgaddtup() for
1676  * the new first data item for right page. Do not confuse this
1677  * mechanism with suffix truncation. It is just a convenient way of
1678  * implementing page splits that split the internal page "inside"
1679  * firstright. The lefthighkey separator key cannot appear a second
1680  * time in the right page (only firstright's downlink goes in right
1681  * page).
1682  */
1683  lefthighkey = firstright;
1684  }
1685 
1686  /*
1687  * Add new high key to leftpage
1688  */
1689  afterleftoff = P_HIKEY;
1690 
1691  Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1692  Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1694  Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1695  if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1696  false) == InvalidOffsetNumber)
1697  elog(ERROR, "failed to add high key to the left sibling"
1698  " while splitting block %u of index \"%s\"",
1699  origpagenumber, RelationGetRelationName(rel));
1700  afterleftoff = OffsetNumberNext(afterleftoff);
1701 
1702  /*
1703  * Acquire a new right page to split into, now that left page has a new
1704  * high key. From here on, it's not okay to throw an error without
1705  * zeroing rightpage first. This coding rule ensures that we won't
1706  * confuse future VACUUM operations, which might otherwise try to re-find
1707  * a downlink to a leftover junk page as the page undergoes deletion.
1708  *
1709  * It would be reasonable to start the critical section just after the new
1710  * rightpage buffer is acquired instead; that would allow us to avoid
1711  * leftover junk pages without bothering to zero rightpage. We do it this
1712  * way because it avoids an unnecessary PANIC when either origpage or its
1713  * existing sibling page are corrupt.
1714  */
1715  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1716  rightpage = BufferGetPage(rbuf);
1717  rightpagenumber = BufferGetBlockNumber(rbuf);
1718  /* rightpage was initialized by _bt_getbuf */
1719  ropaque = BTPageGetOpaque(rightpage);
1720 
1721  /*
1722  * Finish off remaining leftpage special area fields. They cannot be set
1723  * before both origpage (leftpage) and rightpage buffers are acquired and
1724  * locked.
1725  *
1726  * btpo_cycleid is only used with leaf pages, though we set it here in all
1727  * cases just to be consistent.
1728  */
1729  lopaque->btpo_next = rightpagenumber;
1730  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1731 
1732  /*
1733  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1734  * and HAS_GARBAGE flags.
1735  */
1736  ropaque->btpo_flags = oopaque->btpo_flags;
1737  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1738  ropaque->btpo_prev = origpagenumber;
1739  ropaque->btpo_next = oopaque->btpo_next;
1740  ropaque->btpo_level = oopaque->btpo_level;
1741  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1742 
1743  /*
1744  * Add new high key to rightpage where necessary.
1745  *
1746  * If the page we're splitting is not the rightmost page at its level in
1747  * the tree, then the first entry on the page is the high key from
1748  * origpage.
1749  */
1750  afterrightoff = P_HIKEY;
1751 
1752  if (!isrightmost)
1753  {
1754  IndexTuple righthighkey;
1755 
1756  itemid = PageGetItemId(origpage, P_HIKEY);
1757  itemsz = ItemIdGetLength(itemid);
1758  righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1759  Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1760  Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1762  if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1763  false, false) == InvalidOffsetNumber)
1764  {
1765  memset(rightpage, 0, BufferGetPageSize(rbuf));
1766  elog(ERROR, "failed to add high key to the right sibling"
1767  " while splitting block %u of index \"%s\"",
1768  origpagenumber, RelationGetRelationName(rel));
1769  }
1770  afterrightoff = OffsetNumberNext(afterrightoff);
1771  }
1772 
1773  /*
1774  * Internal page splits truncate first data item on right page -- it
1775  * becomes "minus infinity" item for the page. Set this up here.
1776  */
1777  minusinfoff = InvalidOffsetNumber;
1778  if (!isleaf)
1779  minusinfoff = afterrightoff;
1780 
1781  /*
1782  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1783  * additional pivot tuples in !isleaf case) to the appropriate page.
1784  *
1785  * Note: we *must* insert at least the right page's items in item-number
1786  * order, for the benefit of _bt_restore_page().
1787  */
1788  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1789  {
1790  IndexTuple dataitem;
1791 
1792  itemid = PageGetItemId(origpage, i);
1793  itemsz = ItemIdGetLength(itemid);
1794  dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1795 
1796  /* replace original item with nposting due to posting split? */
1797  if (i == origpagepostingoff)
1798  {
1799  Assert(BTreeTupleIsPosting(dataitem));
1800  Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1801  dataitem = nposting;
1802  }
1803 
1804  /* does new item belong before this one? */
1805  else if (i == newitemoff)
1806  {
1807  if (newitemonleft)
1808  {
1809  Assert(newitemoff <= firstrightoff);
1810  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1811  false))
1812  {
1813  memset(rightpage, 0, BufferGetPageSize(rbuf));
1814  elog(ERROR, "failed to add new item to the left sibling"
1815  " while splitting block %u of index \"%s\"",
1816  origpagenumber, RelationGetRelationName(rel));
1817  }
1818  afterleftoff = OffsetNumberNext(afterleftoff);
1819  }
1820  else
1821  {
1822  Assert(newitemoff >= firstrightoff);
1823  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1824  afterrightoff == minusinfoff))
1825  {
1826  memset(rightpage, 0, BufferGetPageSize(rbuf));
1827  elog(ERROR, "failed to add new item to the right sibling"
1828  " while splitting block %u of index \"%s\"",
1829  origpagenumber, RelationGetRelationName(rel));
1830  }
1831  afterrightoff = OffsetNumberNext(afterrightoff);
1832  }
1833  }
1834 
1835  /* decide which page to put it on */
1836  if (i < firstrightoff)
1837  {
1838  if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1839  {
1840  memset(rightpage, 0, BufferGetPageSize(rbuf));
1841  elog(ERROR, "failed to add old item to the left sibling"
1842  " while splitting block %u of index \"%s\"",
1843  origpagenumber, RelationGetRelationName(rel));
1844  }
1845  afterleftoff = OffsetNumberNext(afterleftoff);
1846  }
1847  else
1848  {
1849  if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1850  afterrightoff == minusinfoff))
1851  {
1852  memset(rightpage, 0, BufferGetPageSize(rbuf));
1853  elog(ERROR, "failed to add old item to the right sibling"
1854  " while splitting block %u of index \"%s\"",
1855  origpagenumber, RelationGetRelationName(rel));
1856  }
1857  afterrightoff = OffsetNumberNext(afterrightoff);
1858  }
1859  }
1860 
1861  /* Handle case where newitem goes at the end of rightpage */
1862  if (i <= newitemoff)
1863  {
1864  /*
1865  * Can't have newitemonleft here; that would imply we were told to put
1866  * *everything* on the left page, which cannot fit (if it could, we'd
1867  * not be splitting the page).
1868  */
1869  Assert(!newitemonleft && newitemoff == maxoff + 1);
1870  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1871  afterrightoff == minusinfoff))
1872  {
1873  memset(rightpage, 0, BufferGetPageSize(rbuf));
1874  elog(ERROR, "failed to add new item to the right sibling"
1875  " while splitting block %u of index \"%s\"",
1876  origpagenumber, RelationGetRelationName(rel));
1877  }
1878  afterrightoff = OffsetNumberNext(afterrightoff);
1879  }
1880 
1881  /*
1882  * We have to grab the original right sibling (if any) and update its prev
1883  * link. We are guaranteed that this is deadlock-free, since we couple
1884  * the locks in the standard order: left to right.
1885  */
1886  if (!isrightmost)
1887  {
1888  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1889  spage = BufferGetPage(sbuf);
1890  sopaque = BTPageGetOpaque(spage);
1891  if (sopaque->btpo_prev != origpagenumber)
1892  {
1893  memset(rightpage, 0, BufferGetPageSize(rbuf));
1894  ereport(ERROR,
1895  (errcode(ERRCODE_INDEX_CORRUPTED),
1896  errmsg_internal("right sibling's left-link doesn't match: "
1897  "block %u links to %u instead of expected %u in index \"%s\"",
1898  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1899  RelationGetRelationName(rel))));
1900  }
1901 
1902  /*
1903  * Check to see if we can set the SPLIT_END flag in the right-hand
1904  * split page; this can save some I/O for vacuum since it need not
1905  * proceed to the right sibling. We can set the flag if the right
1906  * sibling has a different cycleid: that means it could not be part of
1907  * a group of pages that were all split off from the same ancestor
1908  * page. If you're confused, imagine that page A splits to A B and
1909  * then again, yielding A C B, while vacuum is in progress. Tuples
1910  * originally in A could now be in either B or C, hence vacuum must
1911  * examine both pages. But if D, our right sibling, has a different
1912  * cycleid then it could not contain any tuples that were in A when
1913  * the vacuum started.
1914  */
1915  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1916  ropaque->btpo_flags |= BTP_SPLIT_END;
1917  }
1918 
1919  /*
1920  * Right sibling is locked, new siblings are prepared, but original page
1921  * is not updated yet.
1922  *
1923  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1924  * not starting the critical section till here because we haven't been
1925  * scribbling on the original page yet; see comments above.
1926  */
1928 
1929  /*
1930  * By here, the original data page has been split into two new halves, and
1931  * these are correct. The algorithm requires that the left page never
1932  * move during a split, so we copy the new left page back on top of the
1933  * original. We need to do this before writing the WAL record, so that
1934  * XLogInsert can WAL log an image of the page if necessary.
1935  */
1936  PageRestoreTempPage(leftpage, origpage);
1937  /* leftpage, lopaque must not be used below here */
1938 
1940  MarkBufferDirty(rbuf);
1941 
1942  if (!isrightmost)
1943  {
1944  sopaque->btpo_prev = rightpagenumber;
1945  MarkBufferDirty(sbuf);
1946  }
1947 
1948  /*
1949  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1950  * a split
1951  */
1952  if (!isleaf)
1953  {
1954  Page cpage = BufferGetPage(cbuf);
1955  BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1956 
1957  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1958  MarkBufferDirty(cbuf);
1959  }
1960 
1961  /* XLOG stuff */
1962  if (RelationNeedsWAL(rel))
1963  {
1964  xl_btree_split xlrec;
1965  uint8 xlinfo;
1966  XLogRecPtr recptr;
1967 
1968  xlrec.level = ropaque->btpo_level;
1969  /* See comments below on newitem, orignewitem, and posting lists */
1970  xlrec.firstrightoff = firstrightoff;
1971  xlrec.newitemoff = newitemoff;
1972  xlrec.postingoff = 0;
1973  if (postingoff != 0 && origpagepostingoff < firstrightoff)
1974  xlrec.postingoff = postingoff;
1975 
1976  XLogBeginInsert();
1977  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1978 
1981  /* Log original right sibling, since we've changed its prev-pointer */
1982  if (!isrightmost)
1984  if (!isleaf)
1986 
1987  /*
1988  * Log the new item, if it was inserted on the left page. (If it was
1989  * put on the right page, we don't need to explicitly WAL log it
1990  * because it's included with all the other items on the right page.)
1991  * Show the new item as belonging to the left page buffer, so that it
1992  * is not stored if XLogInsert decides it needs a full-page image of
1993  * the left page. We always store newitemoff in the record, though.
1994  *
1995  * The details are sometimes slightly different for page splits that
1996  * coincide with a posting list split. If both the replacement
1997  * posting list and newitem go on the right page, then we don't need
1998  * to log anything extra, just like the simple !newitemonleft
1999  * no-posting-split case (postingoff is set to zero in the WAL record,
2000  * so recovery doesn't need to process a posting list split at all).
2001  * Otherwise, we set postingoff and log orignewitem instead of
2002  * newitem, despite having actually inserted newitem. REDO routine
2003  * must reconstruct nposting and newitem using _bt_swap_posting().
2004  *
2005  * Note: It's possible that our page split point is the point that
2006  * makes the posting list lastleft and newitem firstright. This is
2007  * the only case where we log orignewitem/newitem despite newitem
2008  * going on the right page. If XLogInsert decides that it can omit
2009  * orignewitem due to logging a full-page image of the left page,
2010  * everything still works out, since recovery only needs to log
2011  * orignewitem for items on the left page (just like the regular
2012  * newitem-logged case).
2013  */
2014  if (newitemonleft && xlrec.postingoff == 0)
2015  XLogRegisterBufData(0, (char *) newitem, newitemsz);
2016  else if (xlrec.postingoff != 0)
2017  {
2018  Assert(isleaf);
2019  Assert(newitemonleft || firstrightoff == newitemoff);
2020  Assert(newitemsz == IndexTupleSize(orignewitem));
2021  XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2022  }
2023 
2024  /* Log the left page's new high key */
2025  if (!isleaf)
2026  {
2027  /* lefthighkey isn't local copy, get current pointer */
2028  itemid = PageGetItemId(origpage, P_HIKEY);
2029  lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2030  }
2031  XLogRegisterBufData(0, (char *) lefthighkey,
2032  MAXALIGN(IndexTupleSize(lefthighkey)));
2033 
2034  /*
2035  * Log the contents of the right page in the format understood by
2036  * _bt_restore_page(). The whole right page will be recreated.
2037  *
2038  * Direct access to page is not good but faster - we should implement
2039  * some new func in page API. Note we only store the tuples
2040  * themselves, knowing that they were inserted in item-number order
2041  * and so the line pointers can be reconstructed. See comments for
2042  * _bt_restore_page().
2043  */
2045  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2046  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2047 
2048  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2049  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2050 
2051  PageSetLSN(origpage, recptr);
2052  PageSetLSN(rightpage, recptr);
2053  if (!isrightmost)
2054  PageSetLSN(spage, recptr);
2055  if (!isleaf)
2056  PageSetLSN(BufferGetPage(cbuf), recptr);
2057  }
2058 
2059  END_CRIT_SECTION();
2060 
2061  /* release the old right sibling */
2062  if (!isrightmost)
2063  _bt_relbuf(rel, sbuf);
2064 
2065  /* release the child */
2066  if (!isleaf)
2067  _bt_relbuf(rel, cbuf);
2068 
2069  /* be tidy */
2070  if (isleaf)
2071  pfree(lefthighkey);
2072 
2073  /* split's done */
2074  return rbuf;
2075 }
2076 
2077 /*
2078  * _bt_insert_parent() -- Insert downlink into parent, completing split.
2079  *
2080  * On entry, buf and rbuf are the left and right split pages, which we
2081  * still hold write locks on. Both locks will be released here. We
2082  * release the rbuf lock once we have a write lock on the page that we
2083  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2084  * The lock on buf is released at the same point as the lock on the parent
2085  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2086  * atomic operation that completes the split by inserting a new downlink.
2087  *
2088  * stack - stack showing how we got here. Will be NULL when splitting true
2089  * root, or during concurrent root split, where we can be inefficient
2090  * isroot - we split the true root
2091  * isonly - we split a page alone on its level (might have been fast root)
2092  */
2093 static void
2095  Buffer buf,
2096  Buffer rbuf,
2097  BTStack stack,
2098  bool isroot,
2099  bool isonly)
2100 {
2101  /*
2102  * Here we have to do something Lehman and Yao don't talk about: deal with
2103  * a root split and construction of a new root. If our stack is empty
2104  * then we have just split a node on what had been the root level when we
2105  * descended the tree. If it was still the root then we perform a
2106  * new-root construction. If it *wasn't* the root anymore, search to find
2107  * the next higher level that someone constructed meanwhile, and find the
2108  * right place to insert as for the normal case.
2109  *
2110  * If we have to search for the parent level, we do so by re-descending
2111  * from the root. This is not super-efficient, but it's rare enough not
2112  * to matter.
2113  */
2114  if (isroot)
2115  {
2116  Buffer rootbuf;
2117 
2118  Assert(stack == NULL);
2119  Assert(isonly);
2120  /* create a new root node and update the metapage */
2121  rootbuf = _bt_newroot(rel, buf, rbuf);
2122  /* release the split buffers */
2123  _bt_relbuf(rel, rootbuf);
2124  _bt_relbuf(rel, rbuf);
2125  _bt_relbuf(rel, buf);
2126  }
2127  else
2128  {
 /*
  * NOTE(review): upstream declares bknum (left page's block number,
  * read below by _bt_getstackbuf and the ereport) just before rbknum;
  * that declaration line appears elided in this extraction -- confirm
  * against upstream before relying on this text.
  */
2130  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2131  Page page = BufferGetPage(buf);
2132  IndexTuple new_item;
2133  BTStackData fakestack;
2134  IndexTuple ritem;
2135  Buffer pbuf;
2136 
2137  if (stack == NULL)
2138  {
2139  BTPageOpaque opaque;
2140 
2141  elog(DEBUG2, "concurrent ROOT page split");
2142  opaque = BTPageGetOpaque(page);
2143 
2144  /*
2145  * We should never reach here when a leaf page split takes place
2146  * despite the insert of newitem being able to apply the fastpath
2147  * optimization. Make sure of that with an assertion.
2148  *
2149  * This is more of a performance issue than a correctness issue.
2150  * The fastpath won't have a descent stack. Using a phony stack
2151  * here works, but never rely on that. The fastpath should be
2152  * rejected within _bt_search_insert() when the rightmost leaf
2153  * page will split, since it's faster to go through _bt_search()
2154  * and get a stack in the usual way.
2155  */
2156  Assert(!(P_ISLEAF(opaque) &&
2158 
2159  /* Find the leftmost page at the next level up */
2160  pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
2161  /* Set up a phony stack entry pointing there */
2162  stack = &fakestack;
2163  stack->bts_blkno = BufferGetBlockNumber(pbuf);
 /*
  * NOTE(review): bts_offset is read below (bts_offset + 1); upstream
  * initializes it here (stack->bts_offset = InvalidOffsetNumber, line
  * elided in this extraction) -- confirm against upstream.
  */
2165  stack->bts_parent = NULL;
2166  _bt_relbuf(rel, pbuf);
2167  }
2168 
2169  /* get high key from left, a strict lower bound for new right page */
2170  ritem = (IndexTuple) PageGetItem(page,
2171  PageGetItemId(page, P_HIKEY));
2172 
2173  /* form an index tuple that points at the new right page */
2174  new_item = CopyIndexTuple(ritem);
2175  BTreeTupleSetDownLink(new_item, rbknum);
2176 
2177  /*
2178  * Re-find and write lock the parent of buf.
2179  *
2180  * It's possible that the location of buf's downlink has changed since
2181  * our initial _bt_search() descent. _bt_getstackbuf() will detect
2182  * and recover from this, updating the stack, which ensures that the
2183  * new downlink will be inserted at the correct offset. Even buf's
2184  * parent may have changed.
2185  */
2186  pbuf = _bt_getstackbuf(rel, stack, bknum);
2187 
2188  /*
2189  * Unlock the right child. The left child will be unlocked in
2190  * _bt_insertonpg().
2191  *
2192  * Unlocking the right child must be delayed until here to ensure that
2193  * no concurrent VACUUM operation can become confused. Page deletion
2194  * cannot be allowed to fail to re-find a downlink for the rbuf page.
2195  * (Actually, this is just a vestige of how things used to work. The
2196  * page deletion code is expected to check for the INCOMPLETE_SPLIT
2197  * flag on the left child. It won't attempt deletion of the right
2198  * child until the split is complete. Despite all this, we opt to
2199  * conservatively delay unlocking the right child until here.)
2200  */
2201  _bt_relbuf(rel, rbuf);
2202 
2203  if (pbuf == InvalidBuffer)
2204  ereport(ERROR,
2205  (errcode(ERRCODE_INDEX_CORRUPTED),
2206  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2207  RelationGetRelationName(rel), bknum, rbknum)));
2208 
2209  /* Recursively insert into the parent */
2210  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
2211  new_item, MAXALIGN(IndexTupleSize(new_item)),
2212  stack->bts_offset + 1, 0, isonly);
2213 
2214  /* be tidy */
2215  pfree(new_item);
2216  }
2217 }
2218 
2219 /*
2220  * _bt_finish_split() -- Finish an incomplete split
2221  *
2222  * A crash or other failure can leave a split incomplete. The insertion
2223  * routines won't allow an insertion on a page that is incompletely split.
2224  * Before inserting on such a page, call _bt_finish_split().
2225  *
2226  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2227  * and unpinned.
2228  */
2229 void
2231 {
2232  Page lpage = BufferGetPage(lbuf);
2233  BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2234  Buffer rbuf;
2235  Page rpage;
2236  BTPageOpaque rpageop;
2237  bool wasroot;
2238  bool wasonly;
2239 
2240  Assert(P_INCOMPLETE_SPLIT(lpageop));
2241 
2242  /* Lock right sibling, the one missing the downlink */
2243  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2244  rpage = BufferGetPage(rbuf);
2245  rpageop = BTPageGetOpaque(rpage);
2246 
2247  /* Could this be a root split? */
2248  if (!stack)
2249  {
2250  Buffer metabuf;
2251  Page metapg;
2252  BTMetaPageData *metad;
2253 
2254  /* acquire lock on the metapage */
2255  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2256  metapg = BufferGetPage(metabuf);
2257  metad = BTPageGetMeta(metapg);
2258 
 /* lbuf was the root at split time iff the metapage still points at it */
2259  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2260 
2261  _bt_relbuf(rel, metabuf);
2262  }
2263  else
2264  wasroot = false;
2265 
2266  /* Was this the only page on the level before split? */
2267  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2268 
2269  elog(DEBUG1, "finishing incomplete split of %u/%u",
2271 
 /*
  * _bt_insert_parent() releases the locks on both lbuf and rbuf, matching
  * this function's contract that lbuf is unlocked and unpinned on exit.
  */
2272  _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
2273 }
2274 
2275 /*
2276  * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2277  * tuple whose downlink points to child page.
2278  *
2279  * Caller passes child's block number, which is used to identify
2280  * associated pivot tuple in parent page using a linear search that
2281  * matches on pivot's downlink/block number. The expected location of
2282  * the pivot tuple is taken from the stack one level above the child
2283  * page. This is used as a starting point. Insertions into the
2284  * parent level could cause the pivot tuple to move right; deletions
2285  * could cause it to move left, but not left of the page we previously
2286  * found it on.
2287  *
2288  * Caller can use its stack to relocate the pivot tuple/downlink for
2289  * any same-level page to the right of the page found by its initial
2290  * descent. This is necessary because of the possibility that caller
2291  * moved right to recover from a concurrent page split. It's also
2292  * convenient for certain callers to be able to step right when there
2293  * wasn't a concurrent page split, while still using their original
2294  * stack. For example, the checkingunique _bt_doinsert() case may
2295  * have to step right when there are many physical duplicates, and its
2296  * scantid forces an insertion to the right of the "first page the
2297  * value could be on". (This is also relied on by all of our callers
2298  * when dealing with !heapkeyspace indexes.)
2299  *
2300  * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2301  * tuple not found (should not happen). Adjusts bts_blkno &
2302  * bts_offset if changed. Page split caller should insert its new
2303  * pivot tuple for its new right sibling page on parent page, at the
2304  * offset number bts_offset + 1.
2305  */
2306 Buffer
2308 {
2309  BlockNumber blkno;
2310  OffsetNumber start;
2311 
 /* begin the search at the location remembered in the descent stack */
2312  blkno = stack->bts_blkno;
2313  start = stack->bts_offset;
2314 
2315  for (;;)
2316  {
2317  Buffer buf;
2318  Page page;
2319  BTPageOpaque opaque;
2320 
2321  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2322  page = BufferGetPage(buf);
2323  opaque = BTPageGetOpaque(page);
2324 
2325  if (P_INCOMPLETE_SPLIT(opaque))
2326  {
2327  _bt_finish_split(rel, buf, stack->bts_parent);
 /*
  * _bt_finish_split() unlocked and unpinned buf; loop back to
  * re-acquire the lock and re-examine the page.
  */
2328  continue;
2329  }
2330 
2331  if (!P_IGNORE(opaque))
2332  {
2333  OffsetNumber offnum,
2334  minoff,
2335  maxoff;
2336  ItemId itemid;
2337  IndexTuple item;
2338 
2339  minoff = P_FIRSTDATAKEY(opaque);
2340  maxoff = PageGetMaxOffsetNumber(page);
2341 
2342  /*
2343  * start = InvalidOffsetNumber means "search the whole page". We
2344  * need this test anyway due to possibility that page has a high
2345  * key now when it didn't before.
2346  */
2347  if (start < minoff)
2348  start = minoff;
2349 
2350  /*
2351  * Need this check too, to guard against possibility that page
2352  * split since we visited it originally.
2353  */
2354  if (start > maxoff)
2355  start = OffsetNumberNext(maxoff);
2356 
2357  /*
2358  * These loops will check every item on the page --- but in an
2359  * order that's attuned to the probability of where it actually
2360  * is. Scan to the right first, then to the left.
2361  */
2362  for (offnum = start;
2363  offnum <= maxoff;
2364  offnum = OffsetNumberNext(offnum))
2365  {
2366  itemid = PageGetItemId(page, offnum);
2367  item = (IndexTuple) PageGetItem(page, itemid);
2368 
2369  if (BTreeTupleGetDownLink(item) == child)
2370  {
2371  /* Return accurate pointer to where link is now */
2372  stack->bts_blkno = blkno;
2373  stack->bts_offset = offnum;
2374  return buf;
2375  }
2376  }
2377 
2378  for (offnum = OffsetNumberPrev(start);
2379  offnum >= minoff;
2380  offnum = OffsetNumberPrev(offnum))
2381  {
2382  itemid = PageGetItemId(page, offnum);
2383  item = (IndexTuple) PageGetItem(page, itemid);
2384 
2385  if (BTreeTupleGetDownLink(item) == child)
2386  {
2387  /* Return accurate pointer to where link is now */
2388  stack->bts_blkno = blkno;
2389  stack->bts_offset = offnum;
2390  return buf;
2391  }
2392  }
2393  }
2394 
2395  /*
2396  * The item we're looking for moved right at least one page.
2397  *
2398  * Lehman and Yao couple/chain locks when moving right here, which we
2399  * can avoid. See nbtree/README.
2400  */
2401  if (P_RIGHTMOST(opaque))
2402  {
2403  _bt_relbuf(rel, buf);
2404  return InvalidBuffer;
2405  }
 /* step right; next iteration searches that whole sibling page */
2406  blkno = opaque->btpo_next;
2407  start = InvalidOffsetNumber;
2408  _bt_relbuf(rel, buf);
2409  }
2410 }
2411 
2412 /*
2413  * _bt_newroot() -- Create a new root page for the index.
2414  *
2415  * We've just split the old root page and need to create a new one.
2416  * In order to do this, we add a new root page to the file, then lock
2417  * the metadata page and update it. This is guaranteed to be deadlock-
2418  * free, because all readers release their locks on the metadata page
2419  * before trying to lock the root, and all writers lock the root before
2420  * trying to lock the metadata page. We have a write lock on the old
2421  * root page, so we have not introduced any cycles into the waits-for
2422  * graph.
2423  *
2424  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2425  * locked. On exit, a new root page exists with entries for the
2426  * two new children, metapage is updated and unlocked/unpinned.
2427  * The new root buffer is returned to caller which has to unlock/unpin
2428  * lbuf, rbuf & rootbuf.
2429  */
2430 static Buffer
2432 {
2433  Buffer rootbuf;
2434  Page lpage,
2435  rootpage;
2436  BlockNumber lbkno,
2437  rbkno;
2438  BlockNumber rootblknum;
2439  BTPageOpaque rootopaque;
2440  BTPageOpaque lopaque;
2441  ItemId itemid;
2442  IndexTuple item;
2443  IndexTuple left_item;
2444  Size left_item_sz;
2445  IndexTuple right_item;
2446  Size right_item_sz;
2447  Buffer metabuf;
2448  Page metapg;
2449  BTMetaPageData *metad;
2450 
2451  lbkno = BufferGetBlockNumber(lbuf);
2452  rbkno = BufferGetBlockNumber(rbuf);
2453  lpage = BufferGetPage(lbuf);
2454  lopaque = BTPageGetOpaque(lpage);
2455 
2456  /* get a new root page */
2457  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2458  rootpage = BufferGetPage(rootbuf);
2459  rootblknum = BufferGetBlockNumber(rootbuf);
2460 
2461  /* acquire lock on the metapage */
2462  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2463  metapg = BufferGetPage(metabuf);
2464  metad = BTPageGetMeta(metapg);
2465 
2466  /*
2467  * Create downlink item for left page (old root). The key value used is
2468  * "minus infinity", a sentinel value that's reliably less than any real
2469  * key value that could appear in the left page.
2470  */
2471  left_item_sz = sizeof(IndexTupleData);
2472  left_item = (IndexTuple) palloc(left_item_sz);
2473  left_item->t_info = left_item_sz;
2474  BTreeTupleSetDownLink(left_item, lbkno);
2475  BTreeTupleSetNAtts(left_item, 0, false);
2476 
2477  /*
2478  * Create downlink item for right page. The key for it is obtained from
2479  * the "high key" position in the left page.
2480  */
2481  itemid = PageGetItemId(lpage, P_HIKEY);
2482  right_item_sz = ItemIdGetLength(itemid);
2483  item = (IndexTuple) PageGetItem(lpage, itemid);
2484  right_item = CopyIndexTuple(item);
2485  BTreeTupleSetDownLink(right_item, rbkno);
2486 
2487  /* NO EREPORT(ERROR) from here till newroot op is logged */
 /*
  * NOTE(review): the matching END_CRIT_SECTION() appears below, so
  * upstream starts the critical section here (START_CRIT_SECTION() line
  * elided in this extraction) -- confirm against upstream.
  */
2489 
2490  /* upgrade metapage if needed */
2491  if (metad->btm_version < BTREE_NOVAC_VERSION)
2492  _bt_upgrademetapage(metapg);
2493 
2494  /* set btree special data */
2495  rootopaque = BTPageGetOpaque(rootpage);
2496  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2497  rootopaque->btpo_flags = BTP_ROOT;
2498  rootopaque->btpo_level =
2499  (BTPageGetOpaque(lpage))->btpo_level + 1;
2500  rootopaque->btpo_cycleid = 0;
2501 
2502  /* update metapage data */
2503  metad->btm_root = rootblknum;
2504  metad->btm_level = rootopaque->btpo_level;
2505  metad->btm_fastroot = rootblknum;
2506  metad->btm_fastlevel = rootopaque->btpo_level;
2507 
2508  /*
2509  * Insert the left page pointer into the new root page. The root page is
2510  * the rightmost page on its level so there is no "high key" in it; the
2511  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2512  *
2513  * Note: we *must* insert the two items in item-number order, for the
2514  * benefit of _bt_restore_page().
2515  */
2516  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2517  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2518  false, false) == InvalidOffsetNumber)
2519  elog(PANIC, "failed to add leftkey to new root page"
2520  " while splitting block %u of index \"%s\"",
2522 
2523  /*
2524  * insert the right page pointer into the new root page.
2525  */
2526  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2527  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2529  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2530  false, false) == InvalidOffsetNumber)
2531  elog(PANIC, "failed to add rightkey to new root page"
2532  " while splitting block %u of index \"%s\"",
2534 
2535  /* Clear the incomplete-split flag in the left child */
2536  Assert(P_INCOMPLETE_SPLIT(lopaque));
2537  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2538  MarkBufferDirty(lbuf);
2539 
2540  MarkBufferDirty(rootbuf);
2541  MarkBufferDirty(metabuf);
2542 
2543  /* XLOG stuff */
2544  if (RelationNeedsWAL(rel))
2545  {
2546  xl_btree_newroot xlrec;
2547  XLogRecPtr recptr;
2548  xl_btree_metadata md;
2549 
2550  xlrec.rootblk = rootblknum;
2551  xlrec.level = metad->btm_level;
2552 
2553  XLogBeginInsert();
2554  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2555 
2556  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2559 
 /* copy out the metapage fields that are logged for registered buffer 2 */
2561  md.version = metad->btm_version;
2562  md.root = rootblknum;
2563  md.level = metad->btm_level;
2564  md.fastroot = rootblknum;
2565  md.fastlevel = metad->btm_level;
2567  md.allequalimage = metad->btm_allequalimage;
2568 
2569  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2570 
2571  /*
2572  * Direct access to page is not good but faster - we should implement
2573  * some new func in page API.
2574  */
2576  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2577  ((PageHeader) rootpage)->pd_special -
2578  ((PageHeader) rootpage)->pd_upper);
2579 
2580  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2581 
2582  PageSetLSN(lpage, recptr);
2583  PageSetLSN(rootpage, recptr);
2584  PageSetLSN(metapg, recptr);
2585  }
2586 
2587  END_CRIT_SECTION();
2588 
2589  /* done with metapage */
2590  _bt_relbuf(rel, metabuf);
2591 
2592  pfree(left_item);
2593  pfree(right_item);
2594 
 /* caller is responsible for unlocking/unpinning lbuf, rbuf and rootbuf */
2595  return rootbuf;
2596 }
2597 
2598 /*
2599  * _bt_pgaddtup() -- add a data item to a particular page during split.
2600  *
2601  * The difference between this routine and a bare PageAddItem call is
2602  * that this code can deal with the first data item on an internal btree
2603  * page in passing. This data item (which is called "firstright" within
2604  * _bt_split()) has a key that must be treated as minus infinity after
2605  * the split. Therefore, we truncate away all attributes when caller
2606  * specifies it's the first data item on page (downlink is not changed,
2607  * though). This extra step is only needed for the right page of an
2608  * internal page split. There is no need to do this for the first data
2609  * item on the existing/left page, since that will already have been
2610  * truncated during an earlier page split.
2611  *
2612  * See _bt_split() for a high level explanation of why we truncate here.
2613  * Note that this routine has nothing to do with suffix truncation,
2614  * despite using some of the same infrastructure.
2615  */
2616 static inline bool
2618  Size itemsize,
2619  IndexTuple itup,
2620  OffsetNumber itup_off,
2621  bool newfirstdataitem)
2622 {
2623  IndexTupleData trunctuple;
2624 
2625  if (newfirstdataitem)
2626  {
2627  trunctuple = *itup;
2628  trunctuple.t_info = sizeof(IndexTupleData);
2629  BTreeTupleSetNAtts(&trunctuple, 0, false);
2630  itup = &trunctuple;
2631  itemsize = sizeof(IndexTupleData);
2632  }
2633 
2634  if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2635  false) == InvalidOffsetNumber))
2636  return false;
2637 
2638  return true;
2639 }
2640 
2641 /*
2642  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2643  *
2644  * There are three operations performed here: simple index deletion, bottom-up
2645  * index deletion, and deduplication. If all three operations fail to free
2646  * enough space for the incoming item then caller will go on to split the
2647  * page. We always consider simple deletion first. If that doesn't work out
2648  * we consider alternatives. Callers that only want us to consider simple
2649  * deletion (without any fallback) ask for that using the 'simpleonly'
2650  * argument.
2651  *
2652  * We usually pick only one alternative "complex" operation when simple
2653  * deletion alone won't prevent a page split. The 'checkingunique',
2654  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2655  *
2656  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2657  * level flag was found set. The flag was useful back when there wasn't
2658  * necessarily one single page for a duplicate tuple to go on (before heap TID
2659  * became a part of the key space in version 4 indexes). But we don't
2660  * actually look at the flag anymore (it's not a gating condition for our
2661  * caller). That would cause us to miss tuples that are safe to delete,
2662  * without getting any benefit in return. We know that the alternative is to
2663  * split the page; scanning the line pointer array in passing won't have
2664  * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2665  * all this because !heapkeyspace indexes must still do a "getting tired"
2666  * linear search, and so are likely to get some benefit from using it as a
2667  * gating condition.)
2668  */
2669 static void
2671  BTInsertState insertstate,
2672  bool simpleonly, bool checkingunique,
2673  bool uniquedup, bool indexUnchanged)
2674 {
2676  int ndeletable = 0;
2677  OffsetNumber offnum,
2678  minoff,
2679  maxoff;
2680  Buffer buffer = insertstate->buf;
2681  BTScanInsert itup_key = insertstate->itup_key;
2682  Page page = BufferGetPage(buffer);
2683  BTPageOpaque opaque = BTPageGetOpaque(page);
2684 
2685  Assert(P_ISLEAF(opaque));
2686  Assert(simpleonly || itup_key->heapkeyspace);
2687  Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2688 
2689  /*
2690  * Scan over all items to see which ones need to be deleted according to
2691  * LP_DEAD flags. We'll usually manage to delete a few extra items that
2692  * are not marked LP_DEAD in passing. Often the extra items that actually
2693  * end up getting deleted are items that would have had their LP_DEAD bit
2694  * set before long anyway (if we opted not to include them as extras).
2695  */
2696  minoff = P_FIRSTDATAKEY(opaque);
2697  maxoff = PageGetMaxOffsetNumber(page);
2698  for (offnum = minoff;
2699  offnum <= maxoff;
2700  offnum = OffsetNumberNext(offnum))
2701  {
2702  ItemId itemId = PageGetItemId(page, offnum);
2703 
2704  if (ItemIdIsDead(itemId))
2705  deletable[ndeletable++] = offnum;
2706  }
2707 
2708  if (ndeletable > 0)
2709  {
2710  _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2711  insertstate->itup, minoff, maxoff);
2712  insertstate->bounds_valid = false;
2713 
2714  /* Return when a page split has already been avoided */
2715  if (PageGetFreeSpace(page) >= insertstate->itemsz)
2716  return;
2717 
2718  /* Might as well assume duplicates (if checkingunique) */
2719  uniquedup = true;
2720  }
2721 
2722  /*
2723  * We're done with simple deletion. Return early with callers that only
2724  * call here so that simple deletion can be considered. This includes
2725  * callers that explicitly ask for this and checkingunique callers that
2726  * probably don't have any version churn duplicates on the page.
2727  *
2728  * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2729  * return at this point (or when we go on the try either or both of our
2730  * other strategies and they also fail). We do not bother expending a
2731  * separate write to clear it, however. Caller will definitely clear it
2732  * when it goes on to split the page (note also that the deduplication
2733  * process will clear the flag in passing, just to keep things tidy).
2734  */
2735  if (simpleonly || (checkingunique && !uniquedup))
2736  {
2737  Assert(!indexUnchanged);
2738  return;
2739  }
2740 
2741  /* Assume bounds about to be invalidated (this is almost certain now) */
2742  insertstate->bounds_valid = false;
2743 
2744  /*
2745  * Perform bottom-up index deletion pass when executor hint indicated that
2746  * incoming item is logically unchanged, or for a unique index that is
2747  * known to have physical duplicates for some other reason. (There is a
2748  * large overlap between these two cases for a unique index. It's worth
2749  * having both triggering conditions in order to apply the optimization in
2750  * the event of successive related INSERT and DELETE statements.)
2751  *
2752  * We'll go on to do a deduplication pass when a bottom-up pass fails to
2753  * delete an acceptable amount of free space (a significant fraction of
2754  * the page, or space for the new item, whichever is greater).
2755  *
2756  * Note: Bottom-up index deletion uses the same equality/equivalence
2757  * routines as deduplication internally. However, it does not merge
2758  * together index tuples, so the same correctness considerations do not
2759  * apply. We deliberately omit an index-is-allequalimage test here.
2760  */
2761  if ((indexUnchanged || uniquedup) &&
2762  _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2763  return;
2764 
2765  /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2766  if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2767  _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
2768  insertstate->itemsz, (indexUnchanged || uniquedup));
2769 }
2770 
2771 /*
2772  * _bt_simpledel_pass - Simple index tuple deletion pass.
2773  *
2774  * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2775  * of all such tuples are determined by caller (caller passes these to us as
2776  * its 'deletable' argument).
2777  *
2778  * We might also delete extra index tuples that turn out to be safe to delete
2779  * in passing (though they must be cheap to check in passing to begin with).
2780  * There is no certainty that any extra tuples will be deleted, though. The
2781  * high level goal of the approach we take is to get the most out of each call
2782  * here (without noticeably increasing the per-call overhead compared to what
2783  * we need to do just to be able to delete the page's LP_DEAD-marked index
2784  * tuples).
2785  *
2786  * The number of extra index tuples that turn out to be deletable might
2787  * greatly exceed the number of LP_DEAD-marked index tuples due to various
2788  * locality related effects. For example, it's possible that the total number
2789  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2790  * quite low, in which case we might end up checking if it's possible to
2791  * delete _most_ index tuples on the page (without the tableam needing to
2792  * access additional table blocks). The tableam will sometimes stumble upon
2793  * _many_ extra deletable index tuples in indexes where this pattern is
2794  * common.
2795  *
2796  * See nbtree/README for further details on simple index tuple deletion.
2797  */
2798 static void
2800  OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2801  OffsetNumber minoff, OffsetNumber maxoff)
2802 {
2803  Page page = BufferGetPage(buffer);
2804  BlockNumber *deadblocks;
2805  int ndeadblocks;
2806  TM_IndexDeleteOp delstate;
2807  OffsetNumber offnum;
2808 
2809  /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2810  deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2811  &ndeadblocks);
2812 
2813  /* Initialize tableam state that describes index deletion operation */
2814  delstate.irel = rel;
2815  delstate.iblknum = BufferGetBlockNumber(buffer);
2816  delstate.bottomup = false;
2817  delstate.bottomupfreespace = 0;
2818  delstate.ndeltids = 0;
2819  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2820  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2821 
2822  for (offnum = minoff;
2823  offnum <= maxoff;
2824  offnum = OffsetNumberNext(offnum))
2825  {
2826  ItemId itemid = PageGetItemId(page, offnum);
2827  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2828  TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2829  TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2830  BlockNumber tidblock;
2831  void *match;
2832 
2833  if (!BTreeTupleIsPosting(itup))
2834  {
2835  tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2836  match = bsearch(&tidblock, deadblocks, ndeadblocks,
2837  sizeof(BlockNumber), _bt_blk_cmp);
2838 
2839  if (!match)
2840  {
2841  Assert(!ItemIdIsDead(itemid));
2842  continue;
2843  }
2844 
2845  /*
2846  * TID's table block is among those pointed to by the TIDs from
2847  * LP_DEAD-bit set tuples on page -- add TID to deltids
2848  */
2849  odeltid->tid = itup->t_tid;
2850  odeltid->id = delstate.ndeltids;
2851  ostatus->idxoffnum = offnum;
2852  ostatus->knowndeletable = ItemIdIsDead(itemid);
2853  ostatus->promising = false; /* unused */
2854  ostatus->freespace = 0; /* unused */
2855 
2856  delstate.ndeltids++;
2857  }
2858  else
2859  {
2860  int nitem = BTreeTupleGetNPosting(itup);
2861 
2862  for (int p = 0; p < nitem; p++)
2863  {
2864  ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2865 
2866  tidblock = ItemPointerGetBlockNumber(tid);
2867  match = bsearch(&tidblock, deadblocks, ndeadblocks,
2868  sizeof(BlockNumber), _bt_blk_cmp);
2869 
2870  if (!match)
2871  {
2872  Assert(!ItemIdIsDead(itemid));
2873  continue;
2874  }
2875 
2876  /*
2877  * TID's table block is among those pointed to by the TIDs
2878  * from LP_DEAD-bit set tuples on page -- add TID to deltids
2879  */
2880  odeltid->tid = *tid;
2881  odeltid->id = delstate.ndeltids;
2882  ostatus->idxoffnum = offnum;
2883  ostatus->knowndeletable = ItemIdIsDead(itemid);
2884  ostatus->promising = false; /* unused */
2885  ostatus->freespace = 0; /* unused */
2886 
2887  odeltid++;
2888  ostatus++;
2889  delstate.ndeltids++;
2890  }
2891  }
2892  }
2893 
2894  pfree(deadblocks);
2895 
2896  Assert(delstate.ndeltids >= ndeletable);
2897 
2898  /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2899  _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2900 
2901  pfree(delstate.deltids);
2902  pfree(delstate.status);
2903 }
2904 
2905 /*
2906  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2907  *
2908  * Builds sorted and unique-ified array of table block numbers from index
2909  * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2910  * block from incoming newitem just in case it isn't among the LP_DEAD-related
2911  * table blocks.
2912  *
2913  * Always counting the newitem's table block as an LP_DEAD related block makes
2914  * sense because the cost is consistently low; it is practically certain that
2915  * the table block will not incur a buffer miss in tableam. On the other hand
2916  * the benefit is often quite high. There is a decent chance that there will
2917  * be some deletable items from this block, since in general most garbage
2918  * tuples became garbage in the recent past (in many cases this won't be the
2919  * first logical row that core code added to/modified in table block
2920  * recently).
2921  *
2922  * Returns final array, and sets *nblocks to its final size for caller.
2923  */
2924 static BlockNumber *
2925 _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2926  IndexTuple newitem, int *nblocks)
2927 {
2928  int spacentids,
2929  ntids;
2930  BlockNumber *tidblocks;
2931 
2932  /*
2933  * Accumulate each TID's block in array whose initial size has space for
2934  * one table block per LP_DEAD-set tuple (plus space for the newitem table
2935  * block). Array will only need to grow when there are LP_DEAD-marked
2936  * posting list tuples (which is not that common).
2937  */
2938  spacentids = ndeletable + 1;
2939  ntids = 0;
2940  tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2941 
2942  /*
2943  * First add the table block for the incoming newitem. This is the one
2944  * case where simple deletion can visit a table block that doesn't have
2945  * any known deletable items.
2946  */
2947  Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2948  tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2949 
2950  for (int i = 0; i < ndeletable; i++)
2951  {
2952  ItemId itemid = PageGetItemId(page, deletable[i]);
2953  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2954 
2955  Assert(ItemIdIsDead(itemid));
2956 
2957  if (!BTreeTupleIsPosting(itup))
2958  {
2959  if (ntids + 1 > spacentids)
2960  {
2961  spacentids *= 2;
2962  tidblocks = (BlockNumber *)
2963  repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2964  }
2965 
2966  tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2967  }
2968  else
2969  {
2970  int nposting = BTreeTupleGetNPosting(itup);
2971 
2972  if (ntids + nposting > spacentids)
2973  {
2974  spacentids = Max(spacentids * 2, ntids + nposting);
2975  tidblocks = (BlockNumber *)
2976  repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2977  }
2978 
2979  for (int j = 0; j < nposting; j++)
2980  {
2981  ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2982 
2983  tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
2984  }
2985  }
2986  }
2987 
2988  qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
2989  *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
2990 
2991  return tidblocks;
2992 }
2993 
2994 /*
2995  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
2996  */
2997 static inline int
2998 _bt_blk_cmp(const void *arg1, const void *arg2)
2999 {
3000  BlockNumber b1 = *((BlockNumber *) arg1);
3001  BlockNumber b2 = *((BlockNumber *) arg2);
3002 
3003  if (b1 < b2)
3004  return -1;
3005  else if (b1 > b2)
3006  return 1;
3007 
3008  return 0;
3009 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
static Datum values[MAXATTR]
Definition: bootstrap.c:156
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2755
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3915
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3985
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:702
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:156
#define P_NEW
Definition: bufmgr.h:91
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:424
Page PageGetTempPage(Page page)
Definition: bufpage.c:365
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
Pointer Page
Definition: bufpage.h:78
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:415
#define PageGetLSN(page)
Definition: bufpage.h:365
unsigned short uint16
Definition: c.h:451
unsigned int uint32
Definition: c.h:452
#define MAXALIGN(LEN)
Definition: c.h:768
#define PG_UINT32_MAX
Definition: c.h:536
#define Max(x, y)
Definition: c.h:991
#define unlikely(x)
Definition: c.h:284
unsigned char uint8
Definition: c.h:450
uint32 TransactionId
Definition: c.h:598
size_t Size
Definition: c.h:551
int errmsg_internal(const char *fmt,...)
Definition: elog.c:991
int errdetail(const char *fmt,...)
Definition: elog.c:1037
int errhint(const char *fmt,...)
Definition: elog.c:1151
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define DEBUG2
Definition: elog.h:23
#define PANIC
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:24
#define ERROR
Definition: elog.h:33
#define ereport(elevel,...)
Definition: elog.h:143
char * BuildIndexValueDescription(Relation indexRelation, Datum *values, bool *isnull)
Definition: genam.c:177
IndexUniqueCheck
Definition: genam.h:115
@ UNIQUE_CHECK_NO
Definition: genam.h:116
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:119
@ UNIQUE_CHECK_PARTIAL
Definition: genam.h:118
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:437
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:528
int j
Definition: isn.c:74
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
Pointer Item
Definition: item.h:17
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
struct IndexTupleData IndexTupleData
#define MaxIndexTuplesPerPage
Definition: itup.h:144
Assert(fmt[strlen(fmt) - 1] !='\n')
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:668
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:825
@ XLTW_InsertIndex
Definition: lmgr.h:31
void pfree(void *pointer)
Definition: mcxt.c:1175
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1188
void * palloc(Size size)
Definition: mcxt.c:1068
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
Definition: nbtdedup.c:1018
bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
Definition: nbtdedup.c:306
void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
Definition: nbtdedup.c:58
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2431
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:30
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:813
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
Definition: nbtinsert.c:2799
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2094
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:315
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:100
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2230
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2617
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:1025
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1101
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition: nbtinsert.c:2670
static int _bt_blk_cmp(const void *arg1, const void *arg2)
Definition: nbtinsert.c:2998
Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2307
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition: nbtinsert.c:1462
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:406
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition: nbtinsert.c:2925
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1015
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:109
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1035
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:672
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1141
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:794
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1105
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:871
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1528
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1103
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:513
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:475
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define P_HIKEY
Definition: nbtree.h:369
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define BTP_ROOT
Definition: nbtree.h:77
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:557
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:371
#define P_ISROOT(opaque)
Definition: nbtree.h:222
#define P_NONE
Definition: nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:83
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:539
#define BT_READ
Definition: nbtree.h:714
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:551
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:487
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
#define BT_WRITE
Definition: nbtree.h:715
#define P_FIRSTKEY
Definition: nbtree.h:370
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:590
#define BTP_SPLIT_END
Definition: nbtree.h:81
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:572
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:442
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:656
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:2307
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:96
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:129
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:2638
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:182
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:1942
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2205
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:32
#define SizeOfBtreeInsert
Definition: nbtxlog.h:84
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:31
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:27
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:28
#define SizeOfBtreeSplit
Definition: nbtxlog.h:158
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:335
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:29
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:30
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define INDEX_MAX_KEYS
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:185
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:28
static char * buf
Definition: pg_test_fsync.c:67
#define qsort(a, b, c, d)
Definition: port.h:495
uintptr_t Datum
Definition: postgres.h:411
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3169
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4448
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21
#define RelationGetDescr(relation)
Definition: rel.h:515
#define RelationGetRelationName(relation)
Definition: rel.h:523
#define RelationGetTargetBlock(relation)
Definition: rel.h:586
#define RelationNeedsWAL(relation)
Definition: rel.h:613
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:593
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:501
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:508
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5867
#define SnapshotSelf
Definition: snapmgr.h:66
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:74
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
IndexTuple itup
Definition: nbtree.h:811
BTScanInsert itup_key
Definition: nbtree.h:813
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
BlockNumber btm_root
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint16 btpo_flags
Definition: nbtree.h:67
uint32 btpo_level
Definition: nbtree.h:66
BTCycleId btpo_cycleid
Definition: nbtree.h:68
ItemPointer scantid
Definition: nbtree.h:791
bool allequalimage
Definition: nbtree.h:787
bool heapkeyspace
Definition: nbtree.h:786
bool anynullkeys
Definition: nbtree.h:788
BlockNumber bts_blkno
Definition: nbtree.h:729
struct BTStackData * bts_parent
Definition: nbtree.h:731
OffsetNumber bts_offset
Definition: nbtree.h:730
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
TransactionId xmin
Definition: snapshot.h:157
TransactionId xmax
Definition: snapshot.h:158
uint32 speculativeToken
Definition: snapshot.h:193
TM_IndexStatus * status
Definition: tableam.h:231
int bottomupfreespace
Definition: tableam.h:226
Relation irel
Definition: tableam.h:223
TM_IndexDelete * deltids
Definition: tableam.h:230
BlockNumber iblknum
Definition: tableam.h:224
ItemPointerData tid
Definition: tableam.h:189
bool knowndeletable
Definition: tableam.h:196
bool promising
Definition: tableam.h:199
int16 freespace
Definition: tableam.h:200
OffsetNumber idxoffnum
Definition: tableam.h:195
OffsetNumber offnum
Definition: nbtxlog.h:78
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:332
BlockNumber rootblk
Definition: nbtxlog.h:331
uint16 postingoff
Definition: nbtxlog.h:155
OffsetNumber firstrightoff
Definition: nbtxlog.h:153
uint32 level
Definition: nbtxlog.h:152
OffsetNumber newitemoff
Definition: nbtxlog.h:154
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:219
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:443
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:351
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33