nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/tableam.h"
21 #include "access/transam.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "storage/lmgr.h"
25 #include "storage/predicate.h"
26 #include "storage/smgr.h"
27 
28 /* Minimum tree height for application of fastpath optimization */
29 #define BTREE_FASTPATH_MIN_LEVEL 2
30 
31 
32 static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
33 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
 34  Relation heapRel,
35  IndexUniqueCheck checkUnique, bool *is_unique,
36  uint32 *speculativeToken);
37 static OffsetNumber _bt_findinsertloc(Relation rel,
 38  BTInsertState insertstate,
39  bool checkingunique,
40  BTStack stack,
41  Relation heapRel);
42 static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
43 static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
44  Buffer buf,
45  Buffer cbuf,
46  BTStack stack,
47  IndexTuple itup,
48  Size itemsz,
49  OffsetNumber newitemoff,
50  int postingoff,
51  bool split_only_page);
52 static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
53  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
54  IndexTuple newitem, IndexTuple orignewitem,
55  IndexTuple nposting, uint16 postingoff);
56 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
57  BTStack stack, bool is_root, bool is_only);
58 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
59 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
60  OffsetNumber itup_off);
61 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
62 
63 /*
64  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
65  *
66  * This routine is called by the public interface routine, btinsert.
67  * By here, itup is filled in, including the TID.
68  *
69  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
70  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
71  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
72  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
73  * don't actually insert.
74  *
75  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
76  * it must be true if the entry is known unique, else false.
77  * (In the current implementation we'll also return true after a
78  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
79  * that's just a coding artifact.)
80  */
81 bool
82 _bt_doinsert(Relation rel, IndexTuple itup,
 83  IndexUniqueCheck checkUnique, Relation heapRel)
84 {
85  bool is_unique = false;
86  BTInsertStateData insertstate;
87  BTScanInsert itup_key;
88  BTStack stack;
89  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
90 
91  /* we need an insertion scan key to do our search, so build one */
92  itup_key = _bt_mkscankey(rel, itup);
93 
94  if (checkingunique)
95  {
96  if (!itup_key->anynullkeys)
97  {
98  /* No (heapkeyspace) scantid until uniqueness established */
99  itup_key->scantid = NULL;
100  }
101  else
102  {
103  /*
104  * Scan key for new tuple contains NULL key values. Bypass
105  * checkingunique steps. They are unnecessary because core code
106  * considers NULL unequal to every value, including NULL.
107  *
108  * This optimization avoids O(N^2) behavior within the
109  * _bt_findinsertloc() heapkeyspace path when a unique index has a
110  * large number of "duplicates" with NULL key values.
111  */
112  checkingunique = false;
113  /* Tuple is unique in the sense that core code cares about */
114  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
115  is_unique = true;
116  }
117  }
118 
119  /*
120  * Fill in the BTInsertState working area, to track the current page and
121  * position within the page to insert on.
122  *
123  * Note that itemsz is passed down to lower level code that deals with
124  * inserting the item. It must be MAXALIGN()'d. This ensures that space
125  * accounting code consistently considers the alignment overhead that we
126  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
127  * already conservative about alignment, but we don't rely on that from
128  * this distance. Besides, preserving the "true" tuple size in index
129  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
130  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
131  */
132  insertstate.itup = itup;
133  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
134  insertstate.itup_key = itup_key;
135  insertstate.bounds_valid = false;
136  insertstate.buf = InvalidBuffer;
137  insertstate.postingoff = 0;
138 
139 search:
140 
141  /*
142  * Find and lock the leaf page that the tuple should be added to by
143  * searching from the root page. insertstate.buf will hold a buffer that
144  * is locked in exclusive mode afterwards.
145  */
146  stack = _bt_search_insert(rel, &insertstate);
147 
148  /*
149  * checkingunique inserts are not allowed to go ahead when two tuples with
150  * equal key attribute values would be visible to new MVCC snapshots once
151  * the xact commits. Check for conflicts in the locked page/buffer (if
152  * needed) here.
153  *
154  * It might be necessary to check a page to the right in _bt_check_unique,
155  * though that should be very rare. In practice the first page the value
156  * could be on (with scantid omitted) is almost always also the only page
157  * that a matching tuple might be found on. This is due to the behavior
158  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
159  * only be allowed to cross a page boundary when there is no candidate
160  * leaf page split point that avoids it. Also, _bt_check_unique can use
161  * the leaf page high key to determine that there will be no duplicates on
162  * the right sibling without actually visiting it (it uses the high key in
163  * cases where the new item happens to belong at the far right of the leaf
164  * page).
165  *
166  * NOTE: obviously, _bt_check_unique can only detect keys that are already
167  * in the index; so it cannot defend against concurrent insertions of the
168  * same key. We protect against that by means of holding a write lock on
169  * the first page the value could be on, with omitted/-inf value for the
170  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
171  * the same key must acquire a write lock on the same page, so only one
172  * would-be inserter can be making the check at one time. Furthermore,
173  * once we are past the check we hold write locks continuously until we
174  * have performed our insertion, so no later inserter can fail to see our
175  * insertion. (This requires some care in _bt_findinsertloc.)
176  *
177  * If we must wait for another xact, we release the lock while waiting,
178  * and then must perform a new search.
179  *
180  * For a partial uniqueness check, we don't wait for the other xact. Just
181  * let the tuple in and return false for possibly non-unique, or true for
182  * definitely unique.
183  */
184  if (checkingunique)
185  {
186  TransactionId xwait;
187  uint32 speculativeToken;
188 
189  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
190  &is_unique, &speculativeToken);
191 
192  if (unlikely(TransactionIdIsValid(xwait)))
193  {
194  /* Have to wait for the other guy ... */
195  _bt_relbuf(rel, insertstate.buf);
196  insertstate.buf = InvalidBuffer;
197 
198  /*
199  * If it's a speculative insertion, wait for it to finish (ie. to
200  * go ahead with the insertion, or kill the tuple). Otherwise
201  * wait for the transaction to finish as usual.
202  */
203  if (speculativeToken)
204  SpeculativeInsertionWait(xwait, speculativeToken);
205  else
206  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
207 
208  /* start over... */
209  if (stack)
210  _bt_freestack(stack);
211  goto search;
212  }
213 
214  /* Uniqueness is established -- restore heap tid as scantid */
215  if (itup_key->heapkeyspace)
216  itup_key->scantid = &itup->t_tid;
217  }
218 
219  if (checkUnique != UNIQUE_CHECK_EXISTING)
220  {
221  OffsetNumber newitemoff;
222 
223  /*
224  * The only conflict predicate locking cares about for indexes is when
225  * an index tuple insert conflicts with an existing lock. We don't
226  * know the actual page we're going to insert on for sure just yet in
227  * checkingunique and !heapkeyspace cases, but it's okay to use the
228  * first page the value could be on (with scantid omitted) instead.
229  */
 230  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
 231 
232  /*
233  * Do the insertion. Note that insertstate contains cached binary
234  * search bounds established within _bt_check_unique when insertion is
235  * checkingunique.
236  */
237  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
238  stack, heapRel);
239  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
240  itup, insertstate.itemsz, newitemoff,
241  insertstate.postingoff, false);
242  }
243  else
244  {
245  /* just release the buffer */
246  _bt_relbuf(rel, insertstate.buf);
247  }
248 
249  /* be tidy */
250  if (stack)
251  _bt_freestack(stack);
252  pfree(itup_key);
253 
254  return is_unique;
255 }
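
For context, _bt_doinsert() is reached through the btree access method's public entry point, btinsert() in nbtree.c, which forms the index tuple, sets its heap TID, and passes it here. A condensed sketch of that caller follows (paraphrased from nbtree.c; the exact signature can differ between major versions):

bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple from the caller's datums */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    /* hand the fully-formed tuple to the insertion code in this file */
    result = _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);
    return result;
}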
256 
257 /*
258  * _bt_search_insert() -- _bt_search() wrapper for inserts
259  *
260  * Search the tree for a particular scankey, or more precisely for the first
261  * leaf page it could be on. Try to make use of the fastpath optimization's
262  * rightmost leaf page cache before actually searching the tree from the root
263  * page, though.
264  *
265  * Return value is a stack of parent-page pointers (though see notes about
266  * fastpath optimization and page splits below). insertstate->buf is set to
267  * the address of the leaf-page buffer, which is write-locked and pinned in
268  * all cases (if necessary by creating a new empty root page for caller).
269  *
270  * The fastpath optimization avoids most of the work of searching the tree
271  * repeatedly when a single backend inserts successive new tuples on the
272  * rightmost leaf page of an index. A backend cache of the rightmost leaf
273  * page is maintained within _bt_insertonpg(), and used here. The cache is
274  * invalidated here when an insert of a non-pivot tuple must take place on a
275  * non-rightmost leaf page.
276  *
277  * The optimization helps with indexes on an auto-incremented field. It also
278  * helps with indexes on datetime columns, as well as indexes with lots of
279  * NULL values. (NULLs usually get inserted in the rightmost page for single
280  * column indexes, since they usually get treated as coming after everything
281  * else in the key space. Individual NULL tuples will generally be placed on
282  * the rightmost leaf page due to the influence of the heap TID column.)
283  *
284  * Note that we avoid applying the optimization when there is insufficient
285  * space on the rightmost page to fit caller's new item. This is necessary
286  * because we'll need to return a real descent stack when a page split is
287  * expected (actually, caller can cope with a leaf page split that uses a NULL
288  * stack, but that's very slow and so must be avoided). Note also that the
289  * fastpath optimization acquires the lock on the page conditionally as a way
290  * of reducing extra contention when there are concurrent insertions into the
291  * rightmost page (we give up if we'd have to wait for the lock). We assume
292  * that it isn't useful to apply the optimization when there is contention,
293  * since each per-backend cache won't stay valid for long.
294  */
295 static BTStack
296 _bt_search_insert(Relation rel, BTInsertState insertstate)
297 {
298  Assert(insertstate->buf == InvalidBuffer);
299  Assert(!insertstate->bounds_valid);
300  Assert(insertstate->postingoff == 0);
301 
 302  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
 303  {
304  /* Simulate a _bt_getbuf() call with conditional locking */
305  insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
306  if (ConditionalLockBuffer(insertstate->buf))
307  {
308  Page page;
309  BTPageOpaque lpageop;
310 
311  _bt_checkpage(rel, insertstate->buf);
312  page = BufferGetPage(insertstate->buf);
313  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
314 
315  /*
316  * Check if the page is still the rightmost leaf page and has
317  * enough free space to accommodate the new tuple. Also check
318  * that the insertion scan key is strictly greater than the first
319  * non-pivot tuple on the page. (Note that we expect itup_key's
320  * scantid to be unset when our caller is a checkingunique
321  * inserter.)
322  */
323  if (P_RIGHTMOST(lpageop) &&
324  P_ISLEAF(lpageop) &&
325  !P_IGNORE(lpageop) &&
326  PageGetFreeSpace(page) > insertstate->itemsz &&
327  PageGetMaxOffsetNumber(page) >= P_HIKEY &&
328  _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
329  {
330  /*
331  * Caller can use the fastpath optimization because cached
332  * block is still rightmost leaf page, which can fit caller's
333  * new tuple without splitting. Keep block in local cache for
334  * next insert, and have caller use NULL stack.
335  *
336  * Note that _bt_insert_parent() has an assertion that catches
337  * leaf page splits that somehow follow from a fastpath insert
338  * (it should only be passed a NULL stack when it must deal
339  * with a concurrent root page split, and never because a NULL
340  * stack was returned here).
341  */
342  return NULL;
343  }
344 
345  /* Page unsuitable for caller, drop lock and pin */
346  _bt_relbuf(rel, insertstate->buf);
347  }
348  else
349  {
350  /* Lock unavailable, drop pin */
351  ReleaseBuffer(insertstate->buf);
352  }
353 
354  /* Forget block, since cache doesn't appear to be useful */
 355  RelationSetTargetBlock(rel, InvalidBlockNumber);
 356  }
357 
358  /* Cannot use optimization -- descend tree, return proper descent stack */
359  return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
360  NULL);
361 }
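
The conditions that the fastpath above requires before reusing the cached rightmost block can be restated as a single predicate. The helper below is purely illustrative (fastpath_page_usable is a hypothetical name, not part of nbtinsert.c); each parameter corresponds to one test made while holding the conditionally-acquired lock:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical restatement of the _bt_search_insert() fastpath test */
static bool
fastpath_page_usable(bool rightmost, bool leaf, bool ignored,
                     size_t freespace, size_t newitemsz,
                     bool has_item, int cmp_vs_first_item)
{
    return rightmost &&              /* still the rightmost page */
           leaf &&                   /* still a leaf page */
           !ignored &&               /* not half-dead or deleted */
           freespace > newitemsz &&  /* new tuple fits without a split */
           has_item &&               /* page is not empty */
           cmp_vs_first_item > 0;    /* scan key > first non-pivot tuple */
}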
362 
363 /*
364  * _bt_check_unique() -- Check for violation of unique index constraint
365  *
366  * Returns InvalidTransactionId if there is no conflict, else an xact ID
367  * we must wait for to see if it commits a conflicting tuple. If an actual
368  * conflict is detected, no return --- just ereport(). If an xact ID is
369  * returned, and the conflicting tuple still has a speculative insertion in
370  * progress, *speculativeToken is set to non-zero, and the caller can wait for
371  * the verdict on the insertion using SpeculativeInsertionWait().
372  *
373  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
374  * InvalidTransactionId because we don't want to wait. In this case we
375  * set *is_unique to false if there is a potential conflict, and the
376  * core code must redo the uniqueness check later.
377  *
378  * As a side-effect, sets state in insertstate that can later be used by
379  * _bt_findinsertloc() to reuse most of the binary search work we do
380  * here.
381  *
382  * Do not call here when there are NULL values in scan key. NULL should be
383  * considered unequal to NULL when checking for duplicates, but we are not
384  * prepared to handle that correctly.
385  */
386 static TransactionId
387 _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 388  IndexUniqueCheck checkUnique, bool *is_unique,
389  uint32 *speculativeToken)
390 {
391  IndexTuple itup = insertstate->itup;
392  IndexTuple curitup;
393  ItemId curitemid;
394  BTScanInsert itup_key = insertstate->itup_key;
395  SnapshotData SnapshotDirty;
396  OffsetNumber offset;
397  OffsetNumber maxoff;
398  Page page;
399  BTPageOpaque opaque;
400  Buffer nbuf = InvalidBuffer;
401  bool found = false;
402  bool inposting = false;
403  bool prevalldead = true;
404  int curposti = 0;
405 
406  /* Assume unique until we find a duplicate */
407  *is_unique = true;
408 
409  InitDirtySnapshot(SnapshotDirty);
410 
411  page = BufferGetPage(insertstate->buf);
412  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
413  maxoff = PageGetMaxOffsetNumber(page);
414 
415  /*
416  * Find the first tuple with the same key.
417  *
418  * This also saves the binary search bounds in insertstate. We use them
419  * in the fastpath below, but also in the _bt_findinsertloc() call later.
420  */
421  Assert(!insertstate->bounds_valid);
422  offset = _bt_binsrch_insert(rel, insertstate);
423 
424  /*
425  * Scan over all equal tuples, looking for live conflicts.
426  */
427  Assert(!insertstate->bounds_valid || insertstate->low == offset);
428  Assert(!itup_key->anynullkeys);
429  Assert(itup_key->scantid == NULL);
430  for (;;)
431  {
432  /*
433  * Each iteration of the loop processes one heap TID, not one index
434  * tuple. Current offset number for page isn't usually advanced on
435  * iterations that process heap TIDs from posting list tuples.
436  *
437  * "inposting" state is set when _inside_ a posting list --- not when
438  * we're at the start (or end) of a posting list. We advance curposti
439  * at the end of the iteration when inside a posting list tuple. In
440  * general, every loop iteration either advances the page offset or
441  * advances curposti --- an iteration that handles the rightmost/max
442  * heap TID in a posting list finally advances the page offset (and
443  * unsets "inposting").
444  *
445  * Make sure the offset points to an actual index tuple before trying
446  * to examine it...
447  */
448  if (offset <= maxoff)
449  {
450  /*
451  * Fastpath: In most cases, we can use cached search bounds to
452  * limit our consideration to items that are definitely
453  * duplicates. This fastpath doesn't apply when the original page
454  * is empty, or when initial offset is past the end of the
455  * original page, which may indicate that we need to examine a
456  * second or subsequent page.
457  *
458  * Note that this optimization allows us to avoid calling
459  * _bt_compare() directly when there are no duplicates, as long as
460  * the offset where the key will go is not at the end of the page.
461  */
462  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
463  {
464  Assert(insertstate->bounds_valid);
465  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
466  Assert(insertstate->low <= insertstate->stricthigh);
467  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
468  break;
469  }
470 
471  /*
472  * We can skip items that are already marked killed.
473  *
474  * In the presence of heavy update activity an index may contain
475  * many killed items with the same key; running _bt_compare() on
476  * each killed item gets expensive. Just advance over killed
477  * items as quickly as we can. We only apply _bt_compare() when
478  * we get to a non-killed item. We could reuse the bounds to
479  * avoid _bt_compare() calls for known equal tuples, but it
480  * doesn't seem worth it. Workloads with heavy update activity
481  * tend to have many deduplication passes, so we'll often avoid
482  * most of those comparisons, too (we call _bt_compare() when the
483  * posting list tuple is initially encountered, though not when
484  * processing later TIDs from the same tuple).
485  */
486  if (!inposting)
487  curitemid = PageGetItemId(page, offset);
488  if (inposting || !ItemIdIsDead(curitemid))
489  {
490  ItemPointerData htid;
491  bool all_dead = false;
492 
493  if (!inposting)
494  {
495  /* Plain tuple, or first TID in posting list tuple */
496  if (_bt_compare(rel, itup_key, page, offset) != 0)
497  break; /* we're past all the equal tuples */
498 
499  /* Advanced curitup */
500  curitup = (IndexTuple) PageGetItem(page, curitemid);
501  Assert(!BTreeTupleIsPivot(curitup));
502  }
503 
504  /* okay, we gotta fetch the heap tuple using htid ... */
505  if (!BTreeTupleIsPosting(curitup))
506  {
507  /* ... htid is from simple non-pivot tuple */
508  Assert(!inposting);
509  htid = curitup->t_tid;
510  }
511  else if (!inposting)
512  {
513  /* ... htid is first TID in new posting list */
514  inposting = true;
515  prevalldead = true;
516  curposti = 0;
517  htid = *BTreeTupleGetPostingN(curitup, 0);
518  }
519  else
520  {
521  /* ... htid is second or subsequent TID in posting list */
522  Assert(curposti > 0);
523  htid = *BTreeTupleGetPostingN(curitup, curposti);
524  }
525 
526  /*
527  * If we are doing a recheck, we expect to find the tuple we
528  * are rechecking. It's not a duplicate, but we have to keep
529  * scanning.
530  */
531  if (checkUnique == UNIQUE_CHECK_EXISTING &&
532  ItemPointerCompare(&htid, &itup->t_tid) == 0)
533  {
534  found = true;
535  }
536 
537  /*
538  * Check if there's any table tuples for this index entry
539  * satisfying SnapshotDirty. This is necessary because for AMs
540  * with optimizations like heap's HOT, we have just a single
541  * index entry for the entire chain.
542  */
543  else if (table_index_fetch_tuple_check(heapRel, &htid,
544  &SnapshotDirty,
545  &all_dead))
546  {
547  TransactionId xwait;
548 
549  /*
550  * It is a duplicate. If we are only doing a partial
551  * check, then don't bother checking if the tuple is being
552  * updated in another transaction. Just return the fact
553  * that it is a potential conflict and leave the full
554  * check till later. Don't invalidate binary search
555  * bounds.
556  */
557  if (checkUnique == UNIQUE_CHECK_PARTIAL)
558  {
559  if (nbuf != InvalidBuffer)
560  _bt_relbuf(rel, nbuf);
561  *is_unique = false;
562  return InvalidTransactionId;
563  }
564 
565  /*
566  * If this tuple is being updated by other transaction
567  * then we have to wait for its commit/abort.
568  */
569  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
570  SnapshotDirty.xmin : SnapshotDirty.xmax;
571 
572  if (TransactionIdIsValid(xwait))
573  {
574  if (nbuf != InvalidBuffer)
575  _bt_relbuf(rel, nbuf);
576  /* Tell _bt_doinsert to wait... */
577  *speculativeToken = SnapshotDirty.speculativeToken;
578  /* Caller releases lock on buf immediately */
579  insertstate->bounds_valid = false;
580  return xwait;
581  }
582 
583  /*
584  * Otherwise we have a definite conflict. But before
585  * complaining, look to see if the tuple we want to insert
586  * is itself now committed dead --- if so, don't complain.
587  * This is a waste of time in normal scenarios but we must
588  * do it to support CREATE INDEX CONCURRENTLY.
589  *
590  * We must follow HOT-chains here because during
591  * concurrent index build, we insert the root TID though
592  * the actual tuple may be somewhere in the HOT-chain.
593  * While following the chain we might not stop at the
594  * exact tuple which triggered the insert, but that's OK
595  * because if we find a live tuple anywhere in this chain,
596  * we have a unique key conflict. The other live tuple is
597  * not part of this chain because it had a different index
598  * entry.
599  */
600  if (table_index_fetch_tuple_check(heapRel, &itup->t_tid,
601  SnapshotSelf, NULL))
602  {
603  /* Normal case --- it's still live */
604  }
605  else
606  {
607  /*
608  * It's been deleted, so no error, and no need to
609  * continue searching
610  */
611  break;
612  }
613 
614  /*
615  * Check for a conflict-in as we would if we were going to
616  * write to this page. We aren't actually going to write,
617  * but we want a chance to report SSI conflicts that would
618  * otherwise be masked by this unique constraint
619  * violation.
620  */
621  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
622 
623  /*
624  * This is a definite conflict. Break the tuple down into
625  * datums and report the error. But first, make sure we
626  * release the buffer locks we're holding ---
627  * BuildIndexValueDescription could make catalog accesses,
628  * which in the worst case might touch this same index and
629  * cause deadlocks.
630  */
631  if (nbuf != InvalidBuffer)
632  _bt_relbuf(rel, nbuf);
633  _bt_relbuf(rel, insertstate->buf);
634  insertstate->buf = InvalidBuffer;
635  insertstate->bounds_valid = false;
636 
637  {
 638  Datum values[INDEX_MAX_KEYS];
 639  bool isnull[INDEX_MAX_KEYS];
640  char *key_desc;
641 
 642  index_deform_tuple(itup, RelationGetDescr(rel),
 643  values, isnull);
644 
645  key_desc = BuildIndexValueDescription(rel, values,
646  isnull);
647 
648  ereport(ERROR,
649  (errcode(ERRCODE_UNIQUE_VIOLATION),
650  errmsg("duplicate key value violates unique constraint \"%s\"",
 651  RelationGetRelationName(rel)),
 652  key_desc ? errdetail("Key %s already exists.",
653  key_desc) : 0,
654  errtableconstraint(heapRel,
655  RelationGetRelationName(rel))));
656  }
657  }
658  else if (all_dead && (!inposting ||
659  (prevalldead &&
660  curposti == BTreeTupleGetNPosting(curitup) - 1)))
661  {
662  /*
663  * The conflicting tuple (or all HOT chains pointed to by
664  * all posting list TIDs) is dead to everyone, so mark the
665  * index entry killed.
666  */
667  ItemIdMarkDead(curitemid);
668  opaque->btpo_flags |= BTP_HAS_GARBAGE;
669 
670  /*
671  * Mark buffer with a dirty hint, since state is not
672  * crucial. Be sure to mark the proper buffer dirty.
673  */
674  if (nbuf != InvalidBuffer)
675  MarkBufferDirtyHint(nbuf, true);
676  else
677  MarkBufferDirtyHint(insertstate->buf, true);
678  }
679 
680  /*
681  * Remember if posting list tuple has even a single HOT chain
682  * whose members are not all dead
683  */
684  if (!all_dead && inposting)
685  prevalldead = false;
686  }
687  }
688 
689  if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
690  {
691  /* Advance to next TID in same posting list */
692  curposti++;
693  continue;
694  }
695  else if (offset < maxoff)
696  {
697  /* Advance to next tuple */
698  curposti = 0;
699  inposting = false;
700  offset = OffsetNumberNext(offset);
701  }
702  else
703  {
704  int highkeycmp;
705 
706  /* If scankey == hikey we gotta check the next page too */
707  if (P_RIGHTMOST(opaque))
708  break;
709  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
710  Assert(highkeycmp <= 0);
711  if (highkeycmp != 0)
712  break;
713  /* Advance to next non-dead page --- there must be one */
714  for (;;)
715  {
716  BlockNumber nblkno = opaque->btpo_next;
717 
718  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
719  page = BufferGetPage(nbuf);
720  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
721  if (!P_IGNORE(opaque))
722  break;
723  if (P_RIGHTMOST(opaque))
724  elog(ERROR, "fell off the end of index \"%s\"",
 725  RelationGetRelationName(rel));
 726  }
727  /* Will also advance to next tuple */
728  curposti = 0;
729  inposting = false;
730  maxoff = PageGetMaxOffsetNumber(page);
731  offset = P_FIRSTDATAKEY(opaque);
732  /* Don't invalidate binary search bounds */
733  }
734  }
735 
736  /*
737  * If we are doing a recheck then we should have found the tuple we are
738  * checking. Otherwise there's something very wrong --- probably, the
739  * index is on a non-immutable expression.
740  */
741  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
742  ereport(ERROR,
743  (errcode(ERRCODE_INTERNAL_ERROR),
744  errmsg("failed to re-find tuple within index \"%s\"",
 745  RelationGetRelationName(rel)),
 746  errhint("This may be because of a non-immutable index expression."),
747  errtableconstraint(heapRel,
748  RelationGetRelationName(rel))));
749 
750  if (nbuf != InvalidBuffer)
751  _bt_relbuf(rel, nbuf);
752 
753  return InvalidTransactionId;
754 }
755 
756 
757 /*
758  * _bt_findinsertloc() -- Finds an insert location for a tuple
759  *
760  * On entry, insertstate buffer contains the page the new tuple belongs
761  * on. It is exclusive-locked and pinned by the caller.
762  *
763  * If 'checkingunique' is true, the buffer on entry is the first page
764  * that contains duplicates of the new key. If there are duplicates on
765  * multiple pages, the correct insertion position might be some page to
766  * the right, rather than the first page. In that case, this function
767  * moves right to the correct target page.
768  *
769  * (In a !heapkeyspace index, there can be multiple pages with the same
770  * high key, where the new tuple could legitimately be placed on. In
771  * that case, the caller passes the first page containing duplicates,
772  * just like when checkingunique=true. If that page doesn't have enough
773  * room for the new tuple, this function moves right, trying to find a
774  * legal page that does.)
775  *
776  * On exit, insertstate buffer contains the chosen insertion page, and
777  * the offset within that page is returned. If _bt_findinsertloc needed
778  * to move right, the lock and pin on the original page are released, and
779  * the new buffer is exclusively locked and pinned instead.
780  *
781  * If insertstate contains cached binary search bounds, we will take
782  * advantage of them. This avoids repeating comparisons that we made in
783  * _bt_check_unique() already.
784  *
785  * If there is not enough room on the page for the new tuple, we try to
786  * make room by removing any LP_DEAD tuples.
787  */
788 static OffsetNumber
789 _bt_findinsertloc(Relation rel,
 790  BTInsertState insertstate,
791  bool checkingunique,
792  BTStack stack,
793  Relation heapRel)
794 {
795  BTScanInsert itup_key = insertstate->itup_key;
796  Page page = BufferGetPage(insertstate->buf);
797  BTPageOpaque lpageop;
798  OffsetNumber newitemoff;
799 
800  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
801 
802  /* Check 1/3 of a page restriction */
803  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
804  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
805  insertstate->itup);
806 
807  Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
808  Assert(!insertstate->bounds_valid || checkingunique);
809  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
810  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
811  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
812 
813  if (itup_key->heapkeyspace)
814  {
815  /* Keep track of whether checkingunique duplicate seen */
816  bool uniquedup = false;
817 
818  /*
819  * If we're inserting into a unique index, we may have to walk right
820  * through leaf pages to find the one leaf page that we must insert on
821  * to.
822  *
823  * This is needed for checkingunique callers because a scantid was not
824  * used when we called _bt_search(). scantid can only be set after
825  * _bt_check_unique() has checked for duplicates. The buffer
826  * initially stored in insertstate->buf has the page where the first
827  * duplicate key might be found, which isn't always the page that new
828  * tuple belongs on. The heap TID attribute for new tuple (scantid)
829  * could force us to insert on a sibling page, though that should be
830  * very rare in practice.
831  */
832  if (checkingunique)
833  {
834  if (insertstate->low < insertstate->stricthigh)
835  {
836  /* Encountered a duplicate in _bt_check_unique() */
837  Assert(insertstate->bounds_valid);
838  uniquedup = true;
839  }
840 
841  for (;;)
842  {
843  /*
844  * Does the new tuple belong on this page?
845  *
846  * The earlier _bt_check_unique() call may well have
847  * established a strict upper bound on the offset for the new
848  * item. If it's not the last item of the page (i.e. if there
849  * is at least one tuple on the page that goes after the tuple
850  * we're inserting) then we know that the tuple belongs on
851  * this page. We can skip the high key check.
852  */
853  if (insertstate->bounds_valid &&
854  insertstate->low <= insertstate->stricthigh &&
855  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
856  break;
857 
858  /* Test '<=', not '!=', since scantid is set now */
859  if (P_RIGHTMOST(lpageop) ||
860  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
861  break;
862 
863  _bt_stepright(rel, insertstate, stack);
864  /* Update local state after stepping right */
865  page = BufferGetPage(insertstate->buf);
866  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
867  /* Assume duplicates (if checkingunique) */
868  uniquedup = true;
869  }
870  }
871 
872  /*
873  * If the target page is full, see if we can obtain enough space by
874  * erasing LP_DEAD items. If that fails to free enough space, see if
875  * we can avoid a page split by performing a deduplication pass over
876  * the page.
877  *
878  * We only perform a deduplication pass for a checkingunique caller
879  * when the incoming item is a duplicate of an existing item on the
880  * leaf page. This heuristic avoids wasting cycles -- we only expect
881  * to benefit from deduplicating a unique index page when most or all
882  * recently added items are duplicates. See nbtree/README.
883  */
884  if (PageGetFreeSpace(page) < insertstate->itemsz)
885  {
886  if (P_HAS_GARBAGE(lpageop))
887  {
888  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
889  insertstate->bounds_valid = false;
890 
891  /* Might as well assume duplicates (if checkingunique) */
892  uniquedup = true;
893  }
894 
895  if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
896  (!checkingunique || uniquedup) &&
897  PageGetFreeSpace(page) < insertstate->itemsz)
898  {
899  _bt_dedup_one_page(rel, insertstate->buf, heapRel,
900  insertstate->itup, insertstate->itemsz,
901  checkingunique);
902  insertstate->bounds_valid = false;
903  }
904  }
905  }
906  else
907  {
908  /*----------
909  * This is a !heapkeyspace (version 2 or 3) index. The current page
910  * is the first page that we could insert the new tuple to, but there
911  * may be other pages to the right that we could opt to use instead.
912  *
913  * If the new key is equal to one or more existing keys, we can
914  * legitimately place it anywhere in the series of equal keys. In
915  * fact, if the new key is equal to the page's "high key" we can place
916  * it on the next page. If it is equal to the high key, and there's
917  * not room to insert the new tuple on the current page without
918  * splitting, then we move right hoping to find more free space and
919  * avoid a split.
920  *
921  * Keep scanning right until we
922  * (a) find a page with enough free space,
923  * (b) reach the last page where the tuple can legally go, or
924  * (c) get tired of searching.
925  * (c) is not flippant; it is important because if there are many
926  * pages' worth of equal keys, it's better to split one of the early
927  * pages than to scan all the way to the end of the run of equal keys
928  * on every insert. We implement "get tired" as a random choice,
929  * since stopping after scanning a fixed number of pages wouldn't work
930  * well (we'd never reach the right-hand side of previously split
931  * pages). The probability of moving right is set at 0.99, which may
932  * seem too high to change the behavior much, but it does an excellent
933  * job of preventing O(N^2) behavior with many equal keys.
934  *----------
935  */
936  while (PageGetFreeSpace(page) < insertstate->itemsz)
937  {
938  /*
939  * Before considering moving right, see if we can obtain enough
940  * space by erasing LP_DEAD items
941  */
942  if (P_HAS_GARBAGE(lpageop))
943  {
944  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
945  insertstate->bounds_valid = false;
946 
947  if (PageGetFreeSpace(page) >= insertstate->itemsz)
948  break; /* OK, now we have enough space */
949  }
950 
951  /*
952  * Nope, so check conditions (b) and (c) enumerated above
953  *
954  * The earlier _bt_check_unique() call may well have established a
955  * strict upper bound on the offset for the new item. If it's not
956  * the last item of the page (i.e. if there is at least one tuple
957  * on the page that's greater than the tuple we're inserting to)
958  * then we know that the tuple belongs on this page. We can skip
959  * the high key check.
960  */
961  if (insertstate->bounds_valid &&
962  insertstate->low <= insertstate->stricthigh &&
963  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
964  break;
965 
966  if (P_RIGHTMOST(lpageop) ||
967  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
968  random() <= (MAX_RANDOM_VALUE / 100))
969  break;
970 
971  _bt_stepright(rel, insertstate, stack);
972  /* Update local state after stepping right */
973  page = BufferGetPage(insertstate->buf);
974  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
975  }
976  }
977 
978  /*
979  * We should now be on the correct page. Find the offset within the page
980  * for the new tuple. (Possibly reusing earlier search bounds.)
981  */
982  Assert(P_RIGHTMOST(lpageop) ||
983  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
984 
985  newitemoff = _bt_binsrch_insert(rel, insertstate);
986 
987  if (insertstate->postingoff == -1)
988  {
989  /*
990  * There is an overlapping posting list tuple with its LP_DEAD bit
991  * set. We don't want to unnecessarily unset its LP_DEAD bit while
992  * performing a posting list split, so delete all LP_DEAD items early.
993  * This is the only case where LP_DEAD deletes happen even though
994  * there is space for newitem on the page.
995  */
996  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
997 
998  /*
999  * Do new binary search. New insert location cannot overlap with any
1000  * posting list now.
1001  */
1002  insertstate->bounds_valid = false;
1003  insertstate->postingoff = 0;
1004  newitemoff = _bt_binsrch_insert(rel, insertstate);
1005  Assert(insertstate->postingoff == 0);
1006  }
1007 
1008  return newitemoff;
1009 }
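
The "get tired" heuristic used in the !heapkeyspace path above stops the rightward scan with roughly 1% probability per step (random() <= MAX_RANDOM_VALUE / 100). As a back-of-the-envelope illustration only, not part of the PostgreSQL sources: the number of extra pages visited before giving up follows a geometric distribution, so its expectation is about 100 pages no matter how long the run of equal keys is, which is what prevents O(N^2) behavior.

#include <stdio.h>

int
main(void)
{
    /* probability of stopping ("getting tired") at each step */
    double p_stop = 0.01;   /* matches random() <= MAX_RANDOM_VALUE / 100 */

    /* mean of a geometric distribution: expected steps until the first stop */
    printf("expected pages scanned before giving up: %.0f\n", 1.0 / p_stop);
    return 0;
}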
1010 
1011 /*
1012  * Step right to next non-dead page, during insertion.
1013  *
1014  * This is a bit more complicated than moving right in a search. We must
1015  * write-lock the target page before releasing write lock on current page;
1016  * else someone else's _bt_check_unique scan could fail to see our insertion.
1017  * Write locks on intermediate dead pages won't do because we don't know when
1018  * they will get de-linked from the tree.
1019  *
1020  * This is more aggressive than it needs to be for non-unique !heapkeyspace
1021  * indexes.
1022  */
1023 static void
1024 _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
 1025 {
1026  Page page;
1027  BTPageOpaque lpageop;
1028  Buffer rbuf;
1029  BlockNumber rblkno;
1030 
1031  page = BufferGetPage(insertstate->buf);
1032  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1033 
1034  rbuf = InvalidBuffer;
1035  rblkno = lpageop->btpo_next;
1036  for (;;)
1037  {
1038  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1039  page = BufferGetPage(rbuf);
1040  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1041 
1042  /*
1043  * If this page was incompletely split, finish the split now. We do
1044  * this while holding a lock on the left sibling, which is not good
1045  * because finishing the split could be a fairly lengthy operation.
1046  * But this should happen very seldom.
1047  */
1048  if (P_INCOMPLETE_SPLIT(lpageop))
1049  {
1050  _bt_finish_split(rel, rbuf, stack);
1051  rbuf = InvalidBuffer;
1052  continue;
1053  }
1054 
1055  if (!P_IGNORE(lpageop))
1056  break;
1057  if (P_RIGHTMOST(lpageop))
1058  elog(ERROR, "fell off the end of index \"%s\"",
 1059  RelationGetRelationName(rel));
 1060 
1061  rblkno = lpageop->btpo_next;
1062  }
1063  /* rbuf locked; unlock buf, update state for caller */
1064  _bt_relbuf(rel, insertstate->buf);
1065  insertstate->buf = rbuf;
1066  insertstate->bounds_valid = false;
1067 }
1068 
1069 /*----------
1070  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1071  *
1072  * This recursive procedure does the following things:
1073  *
1074  * + if postingoff != 0, splits existing posting list tuple
1075  * (since it overlaps with new 'itup' tuple).
1076  * + if necessary, splits the target page, using 'itup_key' for
1077  * suffix truncation on leaf pages (caller passes NULL for
1078  * non-leaf pages).
1079  * + inserts the new tuple (might be split from posting list).
1080  * + if the page was split, pops the parent stack, and finds the
1081  * right place to insert the new child pointer (by walking
1082  * right using information stored in the parent stack).
1083  * + invokes itself with the appropriate tuple for the right
1084  * child page on the parent.
1085  * + updates the metapage if a true root or fast root is split.
1086  *
1087  * On entry, we must have the correct buffer in which to do the
1088  * insertion, and the buffer must be pinned and write-locked. On return,
1089  * we will have dropped both the pin and the lock on the buffer.
1090  *
1091  * This routine only performs retail tuple insertions. 'itup' should
1092  * always be either a non-highkey leaf item, or a downlink (new high
1093  * key items are created indirectly, when a page is split). When
1094  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1095  * we're inserting the downlink for. This function will clear the
1096  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1097  *----------
1098  */
1099 static void
1100 _bt_insertonpg(Relation rel,
 1101  BTScanInsert itup_key,
1102  Buffer buf,
1103  Buffer cbuf,
1104  BTStack stack,
1105  IndexTuple itup,
1106  Size itemsz,
1107  OffsetNumber newitemoff,
1108  int postingoff,
1109  bool split_only_page)
1110 {
1111  Page page;
1112  BTPageOpaque lpageop;
1113  IndexTuple oposting = NULL;
1114  IndexTuple origitup = NULL;
1115  IndexTuple nposting = NULL;
1116 
1117  page = BufferGetPage(buf);
1118  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1119 
1120  /* child buffer must be given iff inserting on an internal page */
1121  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
1122  /* tuple must have appropriate number of attributes */
1123  Assert(!P_ISLEAF(lpageop) ||
1124  BTreeTupleGetNAtts(itup, rel) ==
 1125  IndexRelationGetNumberOfAttributes(rel));
 1126  Assert(P_ISLEAF(lpageop) ||
1127  BTreeTupleGetNAtts(itup, rel) <=
 1128  IndexRelationGetNumberOfKeyAttributes(rel));
 1129  Assert(!BTreeTupleIsPosting(itup));
1130  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1131 
1132  /*
1133  * Every internal page should have exactly one negative infinity item at
1134  * all times. Only _bt_split() and _bt_newroot() should add items that
1135  * become negative infinity items through truncation, since they're the
1136  * only routines that allocate new internal pages.
1137  */
1138  Assert(P_ISLEAF(lpageop) || newitemoff > P_FIRSTDATAKEY(lpageop));
1139 
1140  /* The caller should've finished any incomplete splits already. */
1141  if (P_INCOMPLETE_SPLIT(lpageop))
1142  elog(ERROR, "cannot insert to incompletely split page %u",
1143  BufferGetBlockNumber(buf));
1144 
1145  /*
1146  * Do we need to split an existing posting list item?
1147  */
1148  if (postingoff != 0)
1149  {
1150  ItemId itemid = PageGetItemId(page, newitemoff);
1151 
1152  /*
1153  * The new tuple is a duplicate with a heap TID that falls inside the
1154  * range of an existing posting list tuple on a leaf page. Prepare to
1155  * split an existing posting list. Overwriting the posting list with
1156  * its post-split version is treated as an extra step in either the
1157  * insert or page split critical section.
1158  */
1159  Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid));
1160  Assert(itup_key->heapkeyspace && itup_key->allequalimage);
1161  oposting = (IndexTuple) PageGetItem(page, itemid);
1162 
1163  /* use a mutable copy of itup as our itup from here on */
1164  origitup = itup;
1165  itup = CopyIndexTuple(origitup);
1166  nposting = _bt_swap_posting(itup, oposting, postingoff);
1167  /* itup now contains rightmost/max TID from oposting */
1168 
1169  /* Alter offset so that newitem goes after posting list */
1170  newitemoff = OffsetNumberNext(newitemoff);
1171  }
1172 
1173  /*
1174  * Do we need to split the page to fit the item on it?
1175  *
1176  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1177  * so this comparison is correct even though we appear to be accounting
1178  * only for the item and not for its line pointer.
1179  */
1180  if (PageGetFreeSpace(page) < itemsz)
1181  {
1182  bool is_root = P_ISROOT(lpageop);
1183  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
1184  Buffer rbuf;
1185 
1186  /* split the buffer into left and right halves */
1187  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
1188  origitup, nposting, postingoff);
 1189  PredicateLockPageSplit(rel,
 1190  BufferGetBlockNumber(buf),
1191  BufferGetBlockNumber(rbuf));
1192 
1193  /*----------
1194  * By here,
1195  *
1196  * + our target page has been split;
1197  * + the original tuple has been inserted;
1198  * + we have write locks on both the old (left half)
1199  * and new (right half) buffers, after the split; and
1200  * + we know the key we want to insert into the parent
1201  * (it's the "high key" on the left child page).
1202  *
1203  * We're ready to do the parent insertion. We need to hold onto the
1204  * locks for the child pages until we locate the parent, but we can
1205  * at least release the lock on the right child before doing the
1206  * actual insertion. The lock on the left child will be released
1207  * last of all by parent insertion, where it is the 'cbuf' of parent
1208  * page.
1209  *----------
1210  */
1211  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
1212  }
1213  else
1214  {
1215  bool isleaf = P_ISLEAF(lpageop);
1216  bool isrightmost = P_RIGHTMOST(lpageop);
1217  Buffer metabuf = InvalidBuffer;
1218  Page metapg = NULL;
1219  BTMetaPageData *metad = NULL;
1220  BlockNumber blockcache;
1221 
1222  /*
1223  * If we are doing this insert because we split a page that was the
1224  * only one on its tree level, but was not the root, it may have been
1225  * the "fast root". We need to ensure that the fast root link points
1226  * at or above the current page. We can safely acquire a lock on the
1227  * metapage here --- see comments for _bt_newroot().
1228  */
1229  if (split_only_page)
1230  {
1231  Assert(!isleaf);
1232  Assert(BufferIsValid(cbuf));
1233 
1234  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1235  metapg = BufferGetPage(metabuf);
1236  metad = BTPageGetMeta(metapg);
1237 
1238  if (metad->btm_fastlevel >= lpageop->btpo.level)
1239  {
1240  /* no update wanted */
1241  _bt_relbuf(rel, metabuf);
1242  metabuf = InvalidBuffer;
1243  }
1244  }
1245 
1246  /* Do the update. No ereport(ERROR) until changes are logged */
 1247  START_CRIT_SECTION();
 1248 
1249  if (postingoff != 0)
1250  memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1251 
1252  if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1253  false) == InvalidOffsetNumber)
1254  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
 1255  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
 1256 
1257  MarkBufferDirty(buf);
1258 
1259  if (BufferIsValid(metabuf))
1260  {
1261  /* upgrade meta-page if needed */
1262  if (metad->btm_version < BTREE_NOVAC_VERSION)
1263  _bt_upgrademetapage(metapg);
1264  metad->btm_fastroot = BufferGetBlockNumber(buf);
1265  metad->btm_fastlevel = lpageop->btpo.level;
1266  MarkBufferDirty(metabuf);
1267  }
1268 
1269  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
1270  if (BufferIsValid(cbuf))
1271  {
1272  Page cpage = BufferGetPage(cbuf);
1273  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1274 
1275  Assert(P_INCOMPLETE_SPLIT(cpageop));
1276  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1277  MarkBufferDirty(cbuf);
1278  }
1279 
1280  /* XLOG stuff */
1281  if (RelationNeedsWAL(rel))
1282  {
1283  xl_btree_insert xlrec;
1284  xl_btree_metadata xlmeta;
1285  uint8 xlinfo;
1286  XLogRecPtr recptr;
1287 
1288  xlrec.offnum = newitemoff;
1289 
1290  XLogBeginInsert();
1291  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1292 
1293  if (isleaf && postingoff == 0)
1294  {
1295  /* Simple leaf insert */
1296  xlinfo = XLOG_BTREE_INSERT_LEAF;
1297  }
1298  else if (postingoff != 0)
1299  {
1300  /*
1301  * Leaf insert with posting list split. Must include
1302  * postingoff field before newitem/orignewitem.
1303  */
1304  xlinfo = XLOG_BTREE_INSERT_POST;
1305  }
1306  else
1307  {
1308  /*
1309  * Register the left child whose INCOMPLETE_SPLIT flag was
1310  * cleared.
1311  */
 1312  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
 1313 
1314  xlinfo = XLOG_BTREE_INSERT_UPPER;
1315  }
1316 
1317  if (BufferIsValid(metabuf))
1318  {
 1319  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
 1320  xlmeta.version = metad->btm_version;
1321  xlmeta.root = metad->btm_root;
1322  xlmeta.level = metad->btm_level;
1323  xlmeta.fastroot = metad->btm_fastroot;
1324  xlmeta.fastlevel = metad->btm_fastlevel;
1325  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
 1326  xlmeta.last_cleanup_num_heap_tuples =
 1327  metad->btm_last_cleanup_num_heap_tuples;
 1328  xlmeta.allequalimage = metad->btm_allequalimage;
1329 
 1330  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 1331  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1332 
1333  xlinfo = XLOG_BTREE_INSERT_META;
1334  }
1335 
 1336  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1337  if (postingoff == 0)
1338  {
1339  /* Simple, common case -- log itup from caller */
1340  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1341  }
1342  else
1343  {
1344  /*
1345  * Insert with posting list split (XLOG_BTREE_INSERT_POST
1346  * record) case.
1347  *
1348  * Log postingoff. Also log origitup, not itup. REDO routine
1349  * must reconstruct final itup (as well as nposting) using
1350  * _bt_swap_posting().
1351  */
1352  uint16 upostingoff = postingoff;
1353 
1354  XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1355  XLogRegisterBufData(0, (char *) origitup,
1356  IndexTupleSize(origitup));
1357  }
1358 
1359  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1360 
1361  if (BufferIsValid(metabuf))
1362  PageSetLSN(metapg, recptr);
1363  if (BufferIsValid(cbuf))
1364  PageSetLSN(BufferGetPage(cbuf), recptr);
1365 
1366  PageSetLSN(page, recptr);
1367  }
1368 
1369  END_CRIT_SECTION();
1370 
1371  /* Release subsidiary buffers */
1372  if (BufferIsValid(metabuf))
1373  _bt_relbuf(rel, metabuf);
1374  if (BufferIsValid(cbuf))
1375  _bt_relbuf(rel, cbuf);
1376 
1377  /*
1378  * Cache the block number if this is the rightmost leaf page. Cache
1379  * may be used by a future inserter within _bt_search_insert().
1380  */
1381  blockcache = InvalidBlockNumber;
1382  if (isrightmost && isleaf && !P_ISROOT(lpageop))
1383  blockcache = BufferGetBlockNumber(buf);
1384 
1385  /* Release buffer for insertion target block */
1386  _bt_relbuf(rel, buf);
1387 
1388  /*
1389  * If we decided to cache the insertion target block before releasing
1390  * its buffer lock, then cache it now. Check the height of the tree
1391  * first, though. We don't go for the optimization with small
1392  * indexes. Defer final check to this point to ensure that we don't
1393  * call _bt_getrootheight while holding a buffer lock.
1394  */
1395  if (BlockNumberIsValid(blockcache) &&
 1396  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
 1397  RelationSetTargetBlock(rel, blockcache);
1398  }
1399 
1400  /* be tidy */
1401  if (postingoff != 0)
1402  {
1403  /* itup is actually a modified copy of caller's original */
1404  pfree(nposting);
1405  pfree(itup);
1406  }
1407 }
1408 
1409 /*
1410  * _bt_split() -- split a page in the btree.
1411  *
1412  * On entry, buf is the page to split, and is pinned and write-locked.
1413  * newitemoff etc. tell us about the new item that must be inserted
1414  * along with the data from the original page.
1415  *
1416  * itup_key is used for suffix truncation on leaf pages (internal
1417  * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1418  * is the left-sibling of the page we're inserting the downlink for.
1419  * This function will clear the INCOMPLETE_SPLIT flag on it, and
1420  * release the buffer.
1421  *
1422  * orignewitem, nposting, and postingoff are needed when an insert of
1423  * orignewitem results in both a posting list split and a page split.
1424  * These extra posting list split details are used here in the same
1425  * way as they are used in the more common case where a posting list
1426  * split does not coincide with a page split. We need to deal with
1427  * posting list splits directly in order to ensure that everything
1428  * that follows from the insert of orignewitem is handled as a single
1429  * atomic operation (though caller's insert of a new pivot/downlink
1430  * into parent page will still be a separate operation). See
1431  * nbtree/README for details on the design of posting list splits.
1432  *
1433  * Returns the new right sibling of buf, pinned and write-locked.
1434  * The pin and lock on buf are maintained.
1435  */
1436 static Buffer
1437 _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
 1438  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1439  IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1440 {
1441  Buffer rbuf;
1442  Page origpage;
1443  Page leftpage,
1444  rightpage;
1445  BlockNumber origpagenumber,
1446  rightpagenumber;
1447  BTPageOpaque ropaque,
1448  lopaque,
1449  oopaque;
1450  Buffer sbuf = InvalidBuffer;
1451  Page spage = NULL;
1452  BTPageOpaque sopaque = NULL;
1453  Size itemsz;
1454  ItemId itemid;
1455  IndexTuple item;
1456  OffsetNumber leftoff,
1457  rightoff;
1458  OffsetNumber firstright;
1459  OffsetNumber origpagepostingoff;
1460  OffsetNumber maxoff;
1461  OffsetNumber i;
1462  bool newitemonleft,
1463  isleaf;
1464  IndexTuple lefthikey;
1465  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1466  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1467 
1468  /*
1469  * origpage is the original page to be split. leftpage is a temporary
1470  * buffer that receives the left-sibling data, which will be copied back
1471  * into origpage on success. rightpage is the new page that will receive
1472  * the right-sibling data.
1473  *
1474  * leftpage is allocated after choosing a split point. rightpage's new
1475  * buffer isn't acquired until after leftpage is initialized and has new
1476  * high key, the last point where splitting the page may fail (barring
1477  * corruption). Failing before acquiring new buffer won't have lasting
1478  * consequences, since origpage won't have been modified and leftpage is
1479  * only workspace.
1480  */
1481  origpage = BufferGetPage(buf);
1482  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1483  origpagenumber = BufferGetBlockNumber(buf);
1484 
1485  /*
1486  * Choose a point to split origpage at.
1487  *
1488  * A split point can be thought of as a point _between_ two existing
1489  * tuples on origpage (lastleft and firstright tuples), provided you
1490  * pretend that the new item that didn't fit is already on origpage.
1491  *
1492  * Since origpage does not actually contain newitem, the representation of
1493  * split points needs to work with two boundary cases: splits where
1494  * newitem is lastleft, and splits where newitem is firstright.
1495  * newitemonleft resolves the ambiguity that would otherwise exist when
1496  * newitemoff == firstright. In all other cases it's clear which side of
1497  * the split every tuple goes on from context. newitemonleft is usually
1498  * (but not always) redundant information.
1499  */
1500  firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1501  newitem, &newitemonleft);
1502 
1503  /* Allocate temp buffer for leftpage */
1504  leftpage = PageGetTempPage(origpage);
1505  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1506  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1507 
1508  /*
1509  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1510  * and HAS_GARBAGE flags.
1511  */
1512  lopaque->btpo_flags = oopaque->btpo_flags;
1513  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1514  /* set flag in leftpage indicating that rightpage has no downlink yet */
1515  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1516  lopaque->btpo_prev = oopaque->btpo_prev;
1517  /* handle btpo_next after rightpage buffer acquired */
1518  lopaque->btpo.level = oopaque->btpo.level;
1519  /* handle btpo_cycleid after rightpage buffer acquired */
1520 
1521  /*
1522  * Copy the original page's LSN into leftpage, which will become the
1523  * updated version of the page. We need this because XLogInsert will
1524  * examine the LSN and possibly dump it in a page image.
1525  */
1526  PageSetLSN(leftpage, PageGetLSN(origpage));
1527  isleaf = P_ISLEAF(oopaque);
1528 
1529  /*
1530  * Determine page offset number of existing overlapped-with-orignewitem
1531  * posting list when it is necessary to perform a posting list split in
1532  * passing. Note that newitem was already changed by caller (newitem no
1533  * longer has the orignewitem TID).
1534  *
1535  * This page offset number (origpagepostingoff) will be used to pretend
1536  * that the posting split has already taken place, even though the
1537  * required modifications to origpage won't occur until we reach the
1538  * critical section. The lastleft and firstright tuples of our page split
1539  * point should, in effect, come from an imaginary version of origpage
1540  * that has the nposting tuple instead of the original posting list tuple.
1541  *
1542  * Note: _bt_findsplitloc() should have compensated for coinciding posting
1543  * list splits in just the same way, at least in theory. It doesn't
1544  * bother with that, though. In practice it won't affect its choice of
1545  * split point.
1546  */
1547  origpagepostingoff = InvalidOffsetNumber;
1548  if (postingoff != 0)
1549  {
1550  Assert(isleaf);
1551  Assert(ItemPointerCompare(&orignewitem->t_tid,
1552  &newitem->t_tid) < 0);
1553  Assert(BTreeTupleIsPosting(nposting));
1554  origpagepostingoff = OffsetNumberPrev(newitemoff);
1555  }
1556 
1557  /*
1558  * The "high key" for the new left page will be the first key that's going
1559  * to go into the new right page, or a truncated version if this is a leaf
1560  * page split.
1561  *
1562  * The high key for the left page is formed using the first item on the
1563  * right page, which may seem to be contrary to Lehman & Yao's approach of
1564  * using the left page's last item as its new high key when splitting on
1565  * the leaf level. It isn't, though: suffix truncation will leave the
1566  * left page's high key fully equal to the last item on the left page when
1567  * two tuples with equal key values (excluding heap TID) enclose the split
1568  * point. It isn't actually necessary for a new leaf high key to be equal
1569  * to the last item on the left for the L&Y "subtree" invariant to hold.
1570  * It's sufficient to make sure that the new leaf high key is strictly
1571  * less than the first item on the right leaf page, and greater than or
1572  * equal to (not necessarily equal to) the last item on the left leaf
1573  * page.
1574  *
1575  * In other words, when suffix truncation isn't possible, L&Y's exact
1576  * approach to leaf splits is taken. (Actually, even that is slightly
1577  * inaccurate. A tuple with all the keys from firstright but the heap TID
1578  * from lastleft will be used as the new high key, since the last left
1579  * tuple could be physically larger despite being opclass-equal in respect
1580  * of all attributes prior to the heap TID attribute.)
1581  */
1582  if (!newitemonleft && newitemoff == firstright)
1583  {
1584  /* incoming tuple will become first on right page */
1585  itemsz = newitemsz;
1586  item = newitem;
1587  }
1588  else
1589  {
1590  /* existing item at firstright will become first on right page */
1591  itemid = PageGetItemId(origpage, firstright);
1592  itemsz = ItemIdGetLength(itemid);
1593  item = (IndexTuple) PageGetItem(origpage, itemid);
1594  if (firstright == origpagepostingoff)
1595  item = nposting;
1596  }
1597 
1598  /*
1599  * Truncate unneeded key and non-key attributes of the high key item
1600  * before inserting it on the left page. This can only happen at the leaf
1601  * level, since in general all pivot tuple values originate from leaf
1602  * level high keys. A pivot tuple in a grandparent page must guide a
1603  * search not only to the correct parent page, but also to the correct
1604  * leaf page.
1605  */
1606  if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
1607  {
1608  IndexTuple lastleft;
1609 
1610  /*
1611  * Determine which tuple will become the last on the left page. This
1612  * is needed to decide how many attributes from the first item on the
1613  * right page must remain in new high key for left page.
1614  */
1615  if (newitemonleft && newitemoff == firstright)
1616  {
1617  /* incoming tuple will become last on left page */
1618  lastleft = newitem;
1619  }
1620  else
1621  {
1622  OffsetNumber lastleftoff;
1623 
1624  /* item just before firstright will become last on left page */
1625  lastleftoff = OffsetNumberPrev(firstright);
1626  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1627  itemid = PageGetItemId(origpage, lastleftoff);
1628  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1629  if (lastleftoff == origpagepostingoff)
1630  lastleft = nposting;
1631  }
1632 
1633  Assert(lastleft != item);
1634  lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
1635  itemsz = IndexTupleSize(lefthikey);
1636  itemsz = MAXALIGN(itemsz);
1637  }
1638  else
1639  lefthikey = item;
1640 
1641  /*
1642  * Add new high key to leftpage
1643  */
1644  leftoff = P_HIKEY;
1645 
1646  Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
1647  Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
1648  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1649  false, false) == InvalidOffsetNumber)
1650  elog(ERROR, "failed to add hikey to the left sibling"
1651  " while splitting block %u of index \"%s\"",
1652  origpagenumber, RelationGetRelationName(rel));
1653  leftoff = OffsetNumberNext(leftoff);
1654  /* be tidy */
1655  if (lefthikey != item)
1656  pfree(lefthikey);
1657 
1658  /*
1659  * Acquire a new right page to split into, now that left page has a new
1660  * high key. From here on, it's not okay to throw an error without
1661  * zeroing rightpage first. This coding rule ensures that we won't
1662  * confuse future VACUUM operations, which might otherwise try to re-find
1663  * a downlink to a leftover junk page as the page undergoes deletion.
1664  *
1665  * It would be reasonable to start the critical section just after the new
1666  * rightpage buffer is acquired instead; that would allow us to avoid
1667  * leftover junk pages without bothering to zero rightpage. We do it this
1668  * way because it avoids an unnecessary PANIC when either origpage or its
1669  * existing sibling page is corrupt.
1670  */
1671  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1672  rightpage = BufferGetPage(rbuf);
1673  rightpagenumber = BufferGetBlockNumber(rbuf);
1674  /* rightpage was initialized by _bt_getbuf */
1675  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1676 
1677  /*
1678  * Finish off remaining leftpage special area fields. They cannot be set
1679  * before both origpage (leftpage) and rightpage buffers are acquired and
1680  * locked.
1681  */
1682  lopaque->btpo_next = rightpagenumber;
1683  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1684 
1685  /*
1686  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1687  * and HAS_GARBAGE flags.
1688  */
1689  ropaque->btpo_flags = oopaque->btpo_flags;
1690  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1691  ropaque->btpo_prev = origpagenumber;
1692  ropaque->btpo_next = oopaque->btpo_next;
1693  ropaque->btpo.level = oopaque->btpo.level;
1694  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1695 
1696  /*
1697  * Add new high key to rightpage where necessary.
1698  *
1699  * If the page we're splitting is not the rightmost page at its level in
1700  * the tree, then the first entry on the page is the high key from
1701  * origpage.
1702  */
1703  rightoff = P_HIKEY;
1704 
1705  if (!P_RIGHTMOST(oopaque))
1706  {
1707  itemid = PageGetItemId(origpage, P_HIKEY);
1708  itemsz = ItemIdGetLength(itemid);
1709  item = (IndexTuple) PageGetItem(origpage, itemid);
1710  Assert(BTreeTupleGetNAtts(item, rel) > 0);
1711  Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
1712  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1713  false, false) == InvalidOffsetNumber)
1714  {
1715  memset(rightpage, 0, BufferGetPageSize(rbuf));
1716  elog(ERROR, "failed to add hikey to the right sibling"
1717  " while splitting block %u of index \"%s\"",
1718  origpagenumber, RelationGetRelationName(rel));
1719  }
1720  rightoff = OffsetNumberNext(rightoff);
1721  }
1722 
1723  /*
1724  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1725  * additional pivot tuples in !isleaf case) to the appropriate page.
1726  *
1727  * Note: we *must* insert at least the right page's items in item-number
1728  * order, for the benefit of _bt_restore_page().
1729  */
1730  maxoff = PageGetMaxOffsetNumber(origpage);
1731 
1732  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1733  {
1734  itemid = PageGetItemId(origpage, i);
1735  itemsz = ItemIdGetLength(itemid);
1736  item = (IndexTuple) PageGetItem(origpage, itemid);
1737 
1738  /* replace original item with nposting due to posting split? */
1739  if (i == origpagepostingoff)
1740  {
1741  Assert(BTreeTupleIsPosting(item));
1742  Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1743  item = nposting;
1744  }
1745 
1746  /* does new item belong before this one? */
1747  else if (i == newitemoff)
1748  {
1749  if (newitemonleft)
1750  {
1751  Assert(newitemoff <= firstright);
1752  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1753  {
1754  memset(rightpage, 0, BufferGetPageSize(rbuf));
1755  elog(ERROR, "failed to add new item to the left sibling"
1756  " while splitting block %u of index \"%s\"",
1757  origpagenumber, RelationGetRelationName(rel));
1758  }
1759  leftoff = OffsetNumberNext(leftoff);
1760  }
1761  else
1762  {
1763  Assert(newitemoff >= firstright);
1764  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1765  {
1766  memset(rightpage, 0, BufferGetPageSize(rbuf));
1767  elog(ERROR, "failed to add new item to the right sibling"
1768  " while splitting block %u of index \"%s\"",
1769  origpagenumber, RelationGetRelationName(rel));
1770  }
1771  rightoff = OffsetNumberNext(rightoff);
1772  }
1773  }
1774 
1775  /* decide which page to put it on */
1776  if (i < firstright)
1777  {
1778  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1779  {
1780  memset(rightpage, 0, BufferGetPageSize(rbuf));
1781  elog(ERROR, "failed to add old item to the left sibling"
1782  " while splitting block %u of index \"%s\"",
1783  origpagenumber, RelationGetRelationName(rel));
1784  }
1785  leftoff = OffsetNumberNext(leftoff);
1786  }
1787  else
1788  {
1789  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1790  {
1791  memset(rightpage, 0, BufferGetPageSize(rbuf));
1792  elog(ERROR, "failed to add old item to the right sibling"
1793  " while splitting block %u of index \"%s\"",
1794  origpagenumber, RelationGetRelationName(rel));
1795  }
1796  rightoff = OffsetNumberNext(rightoff);
1797  }
1798  }
1799 
1800  /* cope with possibility that newitem goes at the end */
1801  if (i <= newitemoff)
1802  {
1803  /*
1804  * Can't have newitemonleft here; that would imply we were told to put
1805  * *everything* on the left page, which cannot fit (if it could, we'd
1806  * not be splitting the page).
1807  */
1808  Assert(!newitemonleft);
1809  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1810  {
1811  memset(rightpage, 0, BufferGetPageSize(rbuf));
1812  elog(ERROR, "failed to add new item to the right sibling"
1813  " while splitting block %u of index \"%s\"",
1814  origpagenumber, RelationGetRelationName(rel));
1815  }
1816  rightoff = OffsetNumberNext(rightoff);
1817  }
1818 
1819  /*
1820  * We have to grab the right sibling (if any) and fix the prev pointer
1821  * there. We are guaranteed that this is deadlock-free since no other
1822  * writer will be holding a lock on that page and trying to move left, and
1823  * all readers release locks on a page before trying to fetch its
1824  * neighbors.
1825  */
1826  if (!P_RIGHTMOST(oopaque))
1827  {
1828  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1829  spage = BufferGetPage(sbuf);
1830  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1831  if (sopaque->btpo_prev != origpagenumber)
1832  {
1833  memset(rightpage, 0, BufferGetPageSize(rbuf));
1834  ereport(ERROR,
1835  (errcode(ERRCODE_INDEX_CORRUPTED),
1836  errmsg_internal("right sibling's left-link doesn't match: "
1837  "block %u links to %u instead of expected %u in index \"%s\"",
1838  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1839  RelationGetRelationName(rel))));
1840  }
1841 
1842  /*
1843  * Check to see if we can set the SPLIT_END flag in the right-hand
1844  * split page; this can save some I/O for vacuum since it need not
1845  * proceed to the right sibling. We can set the flag if the right
1846  * sibling has a different cycleid: that means it could not be part of
1847  * a group of pages that were all split off from the same ancestor
1848  * page. If you're confused, imagine that page A splits to A B and
1849  * then again, yielding A C B, while vacuum is in progress. Tuples
1850  * originally in A could now be in either B or C, hence vacuum must
1851  * examine both pages. But if D, our right sibling, has a different
1852  * cycleid then it could not contain any tuples that were in A when
1853  * the vacuum started.
1854  */
1855  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1856  ropaque->btpo_flags |= BTP_SPLIT_END;
1857  }
1858 
1859  /*
1860  * Right sibling is locked, new siblings are prepared, but original page
1861  * is not updated yet.
1862  *
1863  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1864  * not starting the critical section till here because we haven't been
1865  * scribbling on the original page yet; see comments above.
1866  */
1867  START_CRIT_SECTION();
1868 
1869  /*
1870  * By here, the original data page has been split into two new halves, and
1871  * these are correct. The algorithm requires that the left page never
1872  * move during a split, so we copy the new left page back on top of the
1873  * original. Note that this is not a waste of time, since we also require
1874  * (in the page management code) that the center of a page always be
1875  * clean, and the most efficient way to guarantee this is just to compact
1876  * the data by reinserting it into a new left page. (XXX the latter
1877  * comment is probably obsolete; but in any case it's good to not scribble
1878  * on the original page until we enter the critical section.)
1879  *
1880  * We need to do this before writing the WAL record, so that XLogInsert
1881  * can WAL log an image of the page if necessary.
1882  */
1883  PageRestoreTempPage(leftpage, origpage);
1884  /* leftpage, lopaque must not be used below here */
1885 
1886  MarkBufferDirty(buf);
1887  MarkBufferDirty(rbuf);
1888 
1889  if (!P_RIGHTMOST(ropaque))
1890  {
1891  sopaque->btpo_prev = rightpagenumber;
1892  MarkBufferDirty(sbuf);
1893  }
1894 
1895  /*
1896  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1897  * a split.
1898  */
1899  if (!isleaf)
1900  {
1901  Page cpage = BufferGetPage(cbuf);
1902  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1903 
1904  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1905  MarkBufferDirty(cbuf);
1906  }
1907 
1908  /* XLOG stuff */
1909  if (RelationNeedsWAL(rel))
1910  {
1911  xl_btree_split xlrec;
1912  uint8 xlinfo;
1913  XLogRecPtr recptr;
1914 
1915  xlrec.level = ropaque->btpo.level;
1916  /* See comments below on newitem, orignewitem, and posting lists */
1917  xlrec.firstright = firstright;
1918  xlrec.newitemoff = newitemoff;
1919  xlrec.postingoff = 0;
1920  if (postingoff != 0 && origpagepostingoff < firstright)
1921  xlrec.postingoff = postingoff;
1922 
1923  XLogBeginInsert();
1924  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1925 
1926  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1927  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1928  /* Log the right sibling, because we've changed its prev-pointer. */
1929  if (!P_RIGHTMOST(ropaque))
1930  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1931  if (BufferIsValid(cbuf))
1932  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1933 
1934  /*
1935  * Log the new item, if it was inserted on the left page. (If it was
1936  * put on the right page, we don't need to explicitly WAL log it
1937  * because it's included with all the other items on the right page.)
1938  * Show the new item as belonging to the left page buffer, so that it
1939  * is not stored if XLogInsert decides it needs a full-page image of
1940  * the left page. We always store newitemoff in the record, though.
1941  *
1942  * The details are sometimes slightly different for page splits that
1943  * coincide with a posting list split. If both the replacement
1944  * posting list and newitem go on the right page, then we don't need
1945  * to log anything extra, just like the simple !newitemonleft
1946  * no-posting-split case (postingoff is set to zero in the WAL record,
1947  * so recovery doesn't need to process a posting list split at all).
1948  * Otherwise, we set postingoff and log orignewitem instead of
1949  * newitem, despite having actually inserted newitem. REDO routine
1950  * must reconstruct nposting and newitem using _bt_swap_posting().
1951  *
1952  * Note: It's possible that our page split point is the point that
1953  * makes the posting list lastleft and newitem firstright. This is
1954  * the only case where we log orignewitem/newitem despite newitem
1955  * going on the right page. If XLogInsert decides that it can omit
1956  * orignewitem due to logging a full-page image of the left page,
1957  * everything still works out, since recovery only needs to log
1958  * orignewitem for items on the left page (just like the regular
1959  * newitem-logged case).
1960  */
1961  if (newitemonleft && xlrec.postingoff == 0)
1962  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1963  else if (xlrec.postingoff != 0)
1964  {
1965  Assert(newitemonleft || firstright == newitemoff);
1966  Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem));
1967  XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz));
1968  }
1969 
1970  /* Log the left page's new high key */
1971  itemid = PageGetItemId(origpage, P_HIKEY);
1972  item = (IndexTuple) PageGetItem(origpage, itemid);
1973  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1974 
1975  /*
1976  * Log the contents of the right page in the format understood by
1977  * _bt_restore_page(). The whole right page will be recreated.
1978  *
1979  * Direct access to the page is not ideal, but it is faster; we should
1980  * implement a new function in the page API. Note we only store the tuples
1981  * themselves, knowing that they were inserted in item-number order
1982  * and so the line pointers can be reconstructed. See comments for
1983  * _bt_restore_page().
1984  */
1985  XLogRegisterBufData(1,
1986  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1987  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1988 
1989  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
1990  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1991 
1992  PageSetLSN(origpage, recptr);
1993  PageSetLSN(rightpage, recptr);
1994  if (!P_RIGHTMOST(ropaque))
1995  {
1996  PageSetLSN(spage, recptr);
1997  }
1998  if (!isleaf)
1999  {
2000  PageSetLSN(BufferGetPage(cbuf), recptr);
2001  }
2002  }
2003 
2004  END_CRIT_SECTION();
2005 
2006  /* release the old right sibling */
2007  if (!P_RIGHTMOST(ropaque))
2008  _bt_relbuf(rel, sbuf);
2009 
2010  /* release the child */
2011  if (!isleaf)
2012  _bt_relbuf(rel, cbuf);
2013 
2014  /* split's done */
2015  return rbuf;
2016 }
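
The distribution loop above is easier to see in isolation. Below is a minimal standalone sketch (not part of nbtinsert.c) of how that loop routes existing items and the incoming new item to the two halves; real pages, posting-list splits and size accounting are left out, and every name here is invented.

#include <stdio.h>
#include <stdbool.h>

/*
 * Toy model of the distribution loop in _bt_split(): items at offsets
 * firstdata..maxoff exist on the page being split, firstright is the split
 * point chosen by _bt_findsplitloc(), and the new item that failed to fit
 * is logically located at newitemoff.
 */
static void
sketch_distribute(int firstdata, int maxoff, int firstright,
                  int newitemoff, bool newitemonleft)
{
    int i;

    for (i = firstdata; i <= maxoff; i++)
    {
        /* the new item is placed just before the existing item at its offset */
        if (i == newitemoff)
            printf("new item    -> %s page\n", newitemonleft ? "left" : "right");

        /* existing items strictly before firstright go to the left half */
        printf("old item %2d -> %s page\n", i, i < firstright ? "left" : "right");
    }

    /* the new item may belong after every existing item */
    if (i <= newitemoff)
        printf("new item    -> right page\n");
}

int
main(void)
{
    /* ten existing items, split in front of offset 6, new item before offset 8 */
    sketch_distribute(1, 10, 6, 8, false);
    return 0;
}
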
2017 
2018 /*
2019  * _bt_insert_parent() -- Insert downlink into parent, completing split.
2020  *
2021  * On entry, buf and rbuf are the left and right split pages, which we
2022  * still hold write locks on. Both locks will be released here. We
2023  * release the rbuf lock once we have a write lock on the page that we
2024  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2025  * The lock on buf is released at the same point as the lock on the parent
2026  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2027  * atomic operation that completes the split by inserting a new downlink.
2028  *
2029  * stack - stack showing how we got here. Will be NULL when splitting true
2030  * root, or during concurrent root split, where we can be inefficient
2031  * is_root - we split the true root
2032  * is_only - we split a page alone on its level (might have been fast root)
2033  */
2034 static void
2035 _bt_insert_parent(Relation rel,
2036  Buffer buf,
2037  Buffer rbuf,
2038  BTStack stack,
2039  bool is_root,
2040  bool is_only)
2041 {
2042  /*
2043  * Here we have to do something Lehman and Yao don't talk about: deal with
2044  * a root split and construction of a new root. If our stack is empty
2045  * then we have just split a node on what had been the root level when we
2046  * descended the tree. If it was still the root then we perform a
2047  * new-root construction. If it *wasn't* the root anymore, search to find
2048  * the next higher level that someone constructed meanwhile, and find the
2049  * right place to insert as for the normal case.
2050  *
2051  * If we have to search for the parent level, we do so by re-descending
2052  * from the root. This is not super-efficient, but it's rare enough not
2053  * to matter.
2054  */
2055  if (is_root)
2056  {
2057  Buffer rootbuf;
2058 
2059  Assert(stack == NULL);
2060  Assert(is_only);
2061  /* create a new root node and update the metapage */
2062  rootbuf = _bt_newroot(rel, buf, rbuf);
2063  /* release the split buffers */
2064  _bt_relbuf(rel, rootbuf);
2065  _bt_relbuf(rel, rbuf);
2066  _bt_relbuf(rel, buf);
2067  }
2068  else
2069  {
2070  BlockNumber bknum = BufferGetBlockNumber(buf);
2071  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2072  Page page = BufferGetPage(buf);
2073  IndexTuple new_item;
2074  BTStackData fakestack;
2075  IndexTuple ritem;
2076  Buffer pbuf;
2077 
2078  if (stack == NULL)
2079  {
2080  BTPageOpaque lpageop;
2081 
2082  elog(DEBUG2, "concurrent ROOT page split");
2083  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
2084 
2085  /*
2086  * We should never reach here when a leaf page split takes place even
2087  * though the insert of newitem could have used the fastpath
2088  * optimization. Make sure of that with an assertion.
2089  *
2090  * This is more of a performance issue than a correctness issue.
2091  * The fastpath won't have a descent stack. Using a phony stack
2092  * here works, but never rely on that. The fastpath should be
2093  * rejected within _bt_search_insert() when the rightmost leaf
2094  * page will split, since it's faster to go through _bt_search()
2095  * and get a stack in the usual way.
2096  */
2097  Assert(!(P_ISLEAF(lpageop) &&
2098  BlockNumberIsValid(RelationGetTargetBlock(rel))));
2099 
2100  /* Find the leftmost page at the next level up */
2101  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
2102  NULL);
2103  /* Set up a phony stack entry pointing there */
2104  stack = &fakestack;
2105  stack->bts_blkno = BufferGetBlockNumber(pbuf);
2106  stack->bts_offset = InvalidOffsetNumber;
2107  stack->bts_parent = NULL;
2108  _bt_relbuf(rel, pbuf);
2109  }
2110 
2111  /* get high key from left, a strict lower bound for new right page */
2112  ritem = (IndexTuple) PageGetItem(page,
2113  PageGetItemId(page, P_HIKEY));
2114 
2115  /* form an index tuple that points at the new right page */
2116  new_item = CopyIndexTuple(ritem);
2117  BTreeTupleSetDownLink(new_item, rbknum);
2118 
2119  /*
2120  * Re-find and write lock the parent of buf.
2121  *
2122  * It's possible that the location of buf's downlink has changed since
2123  * our initial _bt_search() descent. _bt_getstackbuf() will detect
2124  * and recover from this, updating the stack, which ensures that the
2125  * new downlink will be inserted at the correct offset. Even buf's
2126  * parent may have changed.
2127  */
2128  pbuf = _bt_getstackbuf(rel, stack, bknum);
2129 
2130  /*
2131  * Unlock the right child. The left child will be unlocked in
2132  * _bt_insertonpg().
2133  *
2134  * Unlocking the right child must be delayed until here to ensure that
2135  * no concurrent VACUUM operation can become confused. Page deletion
2136  * cannot be allowed to fail to re-find a downlink for the rbuf page.
2137  * (Actually, this is just a vestige of how things used to work. The
2138  * page deletion code is expected to check for the INCOMPLETE_SPLIT
2139  * flag on the left child. It won't attempt deletion of the right
2140  * child until the split is complete. Despite all this, we opt to
2141  * conservatively delay unlocking the right child until here.)
2142  */
2143  _bt_relbuf(rel, rbuf);
2144 
2145  if (pbuf == InvalidBuffer)
2146  ereport(ERROR,
2147  (errcode(ERRCODE_INDEX_CORRUPTED),
2148  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2149  RelationGetRelationName(rel), bknum, rbknum)));
2150 
2151  /* Recursively insert into the parent */
2152  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
2153  new_item, MAXALIGN(IndexTupleSize(new_item)),
2154  stack->bts_offset + 1, 0, is_only);
2155 
2156  /* be tidy */
2157  pfree(new_item);
2158  }
2159 }
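
As a complement to the comments above, here is a minimal standalone sketch (not part of nbtinsert.c) of how the entry inserted into the parent is put together; toy_pivot and the key values are invented stand-ins for the real pivot tuple.

#include <stdio.h>

/* invented stand-in for a pivot IndexTuple: separator key plus child block */
typedef struct toy_pivot
{
    int          key;
    unsigned int downlink;
} toy_pivot;

/*
 * The new right sibling's parent entry reuses the left page's high key,
 * which the code above obtains as a lower bound for the new right page,
 * and points at the right page's block number.
 */
static toy_pivot
sketch_make_parent_downlink(int left_high_key, unsigned int right_blkno)
{
    toy_pivot new_item;

    new_item.key = left_high_key;
    new_item.downlink = right_blkno;
    return new_item;
}

int
main(void)
{
    toy_pivot d = sketch_make_parent_downlink(500, 8);

    printf("insert into parent: key=%d downlink=%u\n", d.key, d.downlink);
    return 0;
}
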
2160 
2161 /*
2162  * _bt_finish_split() -- Finish an incomplete split
2163  *
2164  * A crash or other failure can leave a split incomplete. The insertion
2165  * routines won't allow an insertion on a page that is incompletely split.
2166  * Before inserting on such a page, call _bt_finish_split().
2167  *
2168  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2169  * and unpinned.
2170  */
2171 void
2172 _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
2173 {
2174  Page lpage = BufferGetPage(lbuf);
2175  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2176  Buffer rbuf;
2177  Page rpage;
2178  BTPageOpaque rpageop;
2179  bool was_root;
2180  bool was_only;
2181 
2182  Assert(P_INCOMPLETE_SPLIT(lpageop));
2183 
2184  /* Lock right sibling, the one missing the downlink */
2185  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2186  rpage = BufferGetPage(rbuf);
2187  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2188 
2189  /* Could this be a root split? */
2190  if (!stack)
2191  {
2192  Buffer metabuf;
2193  Page metapg;
2194  BTMetaPageData *metad;
2195 
2196  /* acquire lock on the metapage */
2197  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2198  metapg = BufferGetPage(metabuf);
2199  metad = BTPageGetMeta(metapg);
2200 
2201  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
2202 
2203  _bt_relbuf(rel, metabuf);
2204  }
2205  else
2206  was_root = false;
2207 
2208  /* Was this the only page on the level before split? */
2209  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2210 
2211  elog(DEBUG1, "finishing incomplete split of %u/%u",
2212  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2213 
2214  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
2215 }
2216 
2217 /*
2218  * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2219  * tuple whose downlink points to child page.
2220  *
2221  * Caller passes child's block number, which is used to identify
2222  * associated pivot tuple in parent page using a linear search that
2223  * matches on pivot's downlink/block number. The expected location of
2224  * the pivot tuple is taken from the stack one level above the child
2225  * page. This is used as a starting point. Insertions into the
2226  * parent level could cause the pivot tuple to move right; deletions
2227  * could cause it to move left, but not left of the page we previously
2228  * found it on.
2229  *
2230  * Caller can use its stack to relocate the pivot tuple/downlink for
2231  * any same-level page to the right of the page found by its initial
2232  * descent. This is necessary because of the possibility that caller
2233  * moved right to recover from a concurrent page split. It's also
2234  * convenient for certain callers to be able to step right when there
2235  * wasn't a concurrent page split, while still using their original
2236  * stack. For example, the checkingunique _bt_doinsert() case may
2237  * have to step right when there are many physical duplicates, and its
2238  * scantid forces an insertion to the right of the "first page the
2239  * value could be on".
2240  *
2241  * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2242  * tuple not found (should not happen). Adjusts bts_blkno &
2243  * bts_offset if changed. Page split caller should insert its new
2244  * pivot tuple for its new right sibling page on parent page, at the
2245  * offset number bts_offset + 1.
2246  */
2247 Buffer
2248 _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
2249 {
2250  BlockNumber blkno;
2251  OffsetNumber start;
2252 
2253  blkno = stack->bts_blkno;
2254  start = stack->bts_offset;
2255 
2256  for (;;)
2257  {
2258  Buffer buf;
2259  Page page;
2260  BTPageOpaque opaque;
2261 
2262  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2263  page = BufferGetPage(buf);
2264  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2265 
2266  if (P_INCOMPLETE_SPLIT(opaque))
2267  {
2268  _bt_finish_split(rel, buf, stack->bts_parent);
2269  continue;
2270  }
2271 
2272  if (!P_IGNORE(opaque))
2273  {
2274  OffsetNumber offnum,
2275  minoff,
2276  maxoff;
2277  ItemId itemid;
2278  IndexTuple item;
2279 
2280  minoff = P_FIRSTDATAKEY(opaque);
2281  maxoff = PageGetMaxOffsetNumber(page);
2282 
2283  /*
2284  * start = InvalidOffsetNumber means "search the whole page". We
2285  * need this test anyway due to possibility that page has a high
2286  * key now when it didn't before.
2287  */
2288  if (start < minoff)
2289  start = minoff;
2290 
2291  /*
2292  * Need this check too, to guard against possibility that page
2293  * split since we visited it originally.
2294  */
2295  if (start > maxoff)
2296  start = OffsetNumberNext(maxoff);
2297 
2298  /*
2299  * These loops will check every item on the page --- but in an
2300  * order that's attuned to the probability of where it actually
2301  * is. Scan to the right first, then to the left.
2302  */
2303  for (offnum = start;
2304  offnum <= maxoff;
2305  offnum = OffsetNumberNext(offnum))
2306  {
2307  itemid = PageGetItemId(page, offnum);
2308  item = (IndexTuple) PageGetItem(page, itemid);
2309 
2310  if (BTreeTupleGetDownLink(item) == child)
2311  {
2312  /* Return accurate pointer to where link is now */
2313  stack->bts_blkno = blkno;
2314  stack->bts_offset = offnum;
2315  return buf;
2316  }
2317  }
2318 
2319  for (offnum = OffsetNumberPrev(start);
2320  offnum >= minoff;
2321  offnum = OffsetNumberPrev(offnum))
2322  {
2323  itemid = PageGetItemId(page, offnum);
2324  item = (IndexTuple) PageGetItem(page, itemid);
2325 
2326  if (BTreeTupleGetDownLink(item) == child)
2327  {
2328  /* Return accurate pointer to where link is now */
2329  stack->bts_blkno = blkno;
2330  stack->bts_offset = offnum;
2331  return buf;
2332  }
2333  }
2334  }
2335 
2336  /*
2337  * The item we're looking for moved right at least one page.
2338  *
2339  * Lehman and Yao couple/chain locks when moving right here, which we
2340  * can avoid. See nbtree/README.
2341  */
2342  if (P_RIGHTMOST(opaque))
2343  {
2344  _bt_relbuf(rel, buf);
2345  return InvalidBuffer;
2346  }
2347  blkno = opaque->btpo_next;
2348  start = InvalidOffsetNumber;
2349  _bt_relbuf(rel, buf);
2350  }
2351 }
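
The search order used on each parent page is easy to isolate. Below is a minimal standalone sketch (not part of nbtinsert.c) of the same scan-right-then-left pattern over a plain array of downlink block numbers; moving right to the next parent page is not modelled, and all names are invented.

#include <stdio.h>

/*
 * Return the index of "child" in the array of downlinks, starting at the
 * remembered position "start": scan to the right first (insertions usually
 * push the downlink right), then to the left, and report -1 if it is not
 * on this "page" at all.
 */
static int
sketch_find_downlink(const unsigned int *downlinks, int nitems,
                     int start, unsigned int child)
{
    int i;

    if (start < 0)
        start = 0;
    if (start > nitems)
        start = nitems;

    for (i = start; i < nitems; i++)
        if (downlinks[i] == child)
            return i;

    for (i = start - 1; i >= 0; i--)
        if (downlinks[i] == child)
            return i;

    return -1;
}

int
main(void)
{
    unsigned int parent[] = {10, 42, 7, 99, 23};

    /* expected the downlink at offset 3, but it has moved left to offset 1 */
    printf("found at offset %d\n", sketch_find_downlink(parent, 5, 3, 42));
    return 0;
}
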
2352 
2353 /*
2354  * _bt_newroot() -- Create a new root page for the index.
2355  *
2356  * We've just split the old root page and need to create a new one.
2357  * In order to do this, we add a new root page to the file, then lock
2358  * the metadata page and update it. This is guaranteed to be deadlock-
2359  * free, because all readers release their locks on the metadata page
2360  * before trying to lock the root, and all writers lock the root before
2361  * trying to lock the metadata page. We have a write lock on the old
2362  * root page, so we have not introduced any cycles into the waits-for
2363  * graph.
2364  *
2365  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2366  * locked. On exit, a new root page exists with entries for the
2367  * two new children, metapage is updated and unlocked/unpinned.
2368  * The new root buffer is returned to caller which has to unlock/unpin
2369  * lbuf, rbuf & rootbuf.
2370  */
2371 static Buffer
2372 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
2373 {
2374  Buffer rootbuf;
2375  Page lpage,
2376  rootpage;
2377  BlockNumber lbkno,
2378  rbkno;
2379  BlockNumber rootblknum;
2380  BTPageOpaque rootopaque;
2381  BTPageOpaque lopaque;
2382  ItemId itemid;
2383  IndexTuple item;
2384  IndexTuple left_item;
2385  Size left_item_sz;
2386  IndexTuple right_item;
2387  Size right_item_sz;
2388  Buffer metabuf;
2389  Page metapg;
2390  BTMetaPageData *metad;
2391 
2392  lbkno = BufferGetBlockNumber(lbuf);
2393  rbkno = BufferGetBlockNumber(rbuf);
2394  lpage = BufferGetPage(lbuf);
2395  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2396 
2397  /* get a new root page */
2398  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2399  rootpage = BufferGetPage(rootbuf);
2400  rootblknum = BufferGetBlockNumber(rootbuf);
2401 
2402  /* acquire lock on the metapage */
2403  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2404  metapg = BufferGetPage(metabuf);
2405  metad = BTPageGetMeta(metapg);
2406 
2407  /*
2408  * Create downlink item for left page (old root). Since this will be the
2409  * first item in a non-leaf page, it implicitly has minus-infinity key
2410  * value, so we need not store any actual key in it.
2411  */
2412  left_item_sz = sizeof(IndexTupleData);
2413  left_item = (IndexTuple) palloc(left_item_sz);
2414  left_item->t_info = left_item_sz;
2415  BTreeTupleSetDownLink(left_item, lbkno);
2416  BTreeTupleSetNAtts(left_item, 0);
2417 
2418  /*
2419  * Create downlink item for right page. The key for it is obtained from
2420  * the "high key" position in the left page.
2421  */
2422  itemid = PageGetItemId(lpage, P_HIKEY);
2423  right_item_sz = ItemIdGetLength(itemid);
2424  item = (IndexTuple) PageGetItem(lpage, itemid);
2425  right_item = CopyIndexTuple(item);
2426  BTreeTupleSetDownLink(right_item, rbkno);
2427 
2428  /* NO EREPORT(ERROR) from here till newroot op is logged */
2429  START_CRIT_SECTION();
2430 
2431  /* upgrade metapage if needed */
2432  if (metad->btm_version < BTREE_NOVAC_VERSION)
2433  _bt_upgrademetapage(metapg);
2434 
2435  /* set btree special data */
2436  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2437  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2438  rootopaque->btpo_flags = BTP_ROOT;
2439  rootopaque->btpo.level =
2440  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2441  rootopaque->btpo_cycleid = 0;
2442 
2443  /* update metapage data */
2444  metad->btm_root = rootblknum;
2445  metad->btm_level = rootopaque->btpo.level;
2446  metad->btm_fastroot = rootblknum;
2447  metad->btm_fastlevel = rootopaque->btpo.level;
2448 
2449  /*
2450  * Insert the left page pointer into the new root page. The root page is
2451  * the rightmost page on its level so there is no "high key" in it; the
2452  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2453  *
2454  * Note: we *must* insert the two items in item-number order, for the
2455  * benefit of _bt_restore_page().
2456  */
2457  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2458  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2459  false, false) == InvalidOffsetNumber)
2460  elog(PANIC, "failed to add leftkey to new root page"
2461  " while splitting block %u of index \"%s\"",
2462  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2463 
2464  /*
2465  * insert the right page pointer into the new root page.
2466  */
2467  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2468  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2469  IndexRelationGetNumberOfKeyAttributes(rel));
2470  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2471  false, false) == InvalidOffsetNumber)
2472  elog(PANIC, "failed to add rightkey to new root page"
2473  " while splitting block %u of index \"%s\"",
2474  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2475 
2476  /* Clear the incomplete-split flag in the left child */
2477  Assert(P_INCOMPLETE_SPLIT(lopaque));
2478  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2479  MarkBufferDirty(lbuf);
2480 
2481  MarkBufferDirty(rootbuf);
2482  MarkBufferDirty(metabuf);
2483 
2484  /* XLOG stuff */
2485  if (RelationNeedsWAL(rel))
2486  {
2487  xl_btree_newroot xlrec;
2488  XLogRecPtr recptr;
2489  xl_btree_metadata md;
2490 
2491  xlrec.rootblk = rootblknum;
2492  xlrec.level = metad->btm_level;
2493 
2494  XLogBeginInsert();
2495  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2496 
2497  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2498  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2499  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2500 
2501  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2502  md.version = metad->btm_version;
2503  md.root = rootblknum;
2504  md.level = metad->btm_level;
2505  md.fastroot = rootblknum;
2506  md.fastlevel = metad->btm_level;
2507  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2508  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2509  md.allequalimage = metad->btm_allequalimage;
2510 
2511  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2512 
2513  /*
2514  * Direct access to the page is not ideal, but it is faster; we should
2515  * implement a new function in the page API.
2516  */
2517  XLogRegisterBufData(0,
2518  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2519  ((PageHeader) rootpage)->pd_special -
2520  ((PageHeader) rootpage)->pd_upper);
2521 
2522  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2523 
2524  PageSetLSN(lpage, recptr);
2525  PageSetLSN(rootpage, recptr);
2526  PageSetLSN(metapg, recptr);
2527  }
2528 
2529  END_CRIT_SECTION();
2530 
2531  /* done with metapage */
2532  _bt_relbuf(rel, metabuf);
2533 
2534  pfree(left_item);
2535  pfree(right_item);
2536 
2537  return rootbuf;
2538 }
2539 
2540 /*
2541  * _bt_pgaddtup() -- add a data item to a particular page during split.
2542  *
2543  * The difference between this routine and a bare PageAddItem call is
2544  * that this code knows that the leftmost data item on an internal
2545  * btree page has a key that must be treated as minus infinity.
2546  * Therefore, it truncates away all attributes. This extra step is
2547  * only needed during internal page splits.
2548  *
2549  * Truncation of an internal page data item can be thought of as one
2550  * of the steps used to "move" a boundary separator key during an
2551  * internal page split. Conceptually, _bt_split() caller splits
2552  * internal pages "inside" the firstright data item: firstright's
2553  * separator key is used as the high key for the left page, while its
2554  * downlink is used within the first data item (also the negative
2555  * infinity item) for the right page. Each distinct separator key
2556  * should appear no more than once per level of the tree.
2557  *
2558  * CAUTION: this works ONLY if we insert the tuples in order, so that
2559  * the given itup_off does represent the final position of the tuple!
2560  */
2561 static bool
2562 _bt_pgaddtup(Page page,
2563  Size itemsize,
2564  IndexTuple itup,
2565  OffsetNumber itup_off)
2566 {
2567  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2568  IndexTupleData trunctuple;
2569 
2570  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2571  {
2572  trunctuple = *itup;
2573  trunctuple.t_info = sizeof(IndexTupleData);
2574  BTreeTupleSetNAtts(&trunctuple, 0);
2575  itup = &trunctuple;
2576  itemsize = sizeof(IndexTupleData);
2577  }
2578 
2579  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2580  false, false) == InvalidOffsetNumber)
2581  return false;
2582 
2583  return true;
2584 }
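
A minimal standalone sketch (not part of nbtinsert.c) of the truncation rule described above, using an invented toy_tuple struct in place of IndexTuple: the leftmost data item on an internal page keeps only its downlink and is marked as having zero attributes.

#include <stdio.h>
#include <stdbool.h>

typedef struct toy_tuple
{
    int          natts;     /* number of key attributes kept */
    int          key;       /* single integer key, ignored when natts == 0 */
    unsigned int downlink;  /* child block number */
} toy_tuple;

static toy_tuple
sketch_prepare_internal_item(toy_tuple itup, bool is_first_data_item)
{
    if (is_first_data_item)
    {
        /* leftmost internal item: treated as minus infinity, key dropped */
        itup.natts = 0;
        itup.key = 0;
    }
    return itup;
}

int
main(void)
{
    toy_tuple item = {1, 500, 42};
    toy_tuple first = sketch_prepare_internal_item(item, true);
    toy_tuple other = sketch_prepare_internal_item(item, false);

    printf("first item: natts=%d downlink=%u\n", first.natts, first.downlink);
    printf("other item: natts=%d key=%d downlink=%u\n",
           other.natts, other.key, other.downlink);
    return 0;
}
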
2585 
2586 /*
2587  * _bt_vacuum_one_page - vacuum just one index page.
2588  *
2589  * Try to remove LP_DEAD items from the given page. The passed buffer
2590  * must be exclusive-locked, but unlike a real VACUUM, we don't need a
2591  * super-exclusive "cleanup" lock (see nbtree/README).
2592  */
2593 static void
2594 _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
2595 {
2596  OffsetNumber deletable[MaxIndexTuplesPerPage];
2597  int ndeletable = 0;
2598  OffsetNumber offnum,
2599  minoff,
2600  maxoff;
2601  Page page = BufferGetPage(buffer);
2602  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2603 
2604  Assert(P_ISLEAF(opaque));
2605 
2606  /*
2607  * Scan over all items to see which ones need to be deleted according to
2608  * LP_DEAD flags.
2609  */
2610  minoff = P_FIRSTDATAKEY(opaque);
2611  maxoff = PageGetMaxOffsetNumber(page);
2612  for (offnum = minoff;
2613  offnum <= maxoff;
2614  offnum = OffsetNumberNext(offnum))
2615  {
2616  ItemId itemId = PageGetItemId(page, offnum);
2617 
2618  if (ItemIdIsDead(itemId))
2619  deletable[ndeletable++] = offnum;
2620  }
2621 
2622  if (ndeletable > 0)
2623  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2624 
2625  /*
2626  * Note: if we didn't find any LP_DEAD items, then the page's
2627  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2628  * separate write to clear it, however. We will clear it when we split
2629  * the page, or when deduplication runs.
2630  */
2631 }
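
The collect-then-delete structure of the function above can be shown with a standalone sketch (not part of nbtinsert.c) that uses a plain bool array in place of LP_DEAD line pointer flags and a printf in place of _bt_delitems_delete(); all names are invented.

#include <stdio.h>
#include <stdbool.h>

#define SKETCH_MAX_ITEMS 32

/*
 * Gather every "dead" offset between minoff and maxoff first, then issue a
 * single batched deletion, mirroring the single _bt_delitems_delete() call
 * in the real function.
 */
static void
sketch_vacuum_one_page(const bool *dead, int minoff, int maxoff)
{
    int deletable[SKETCH_MAX_ITEMS];
    int ndeletable = 0;
    int offnum;

    for (offnum = minoff; offnum <= maxoff; offnum++)
    {
        if (dead[offnum])
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
    {
        int i;

        printf("deleting %d items:", ndeletable);
        for (i = 0; i < ndeletable; i++)
            printf(" %d", deletable[i]);
        printf("\n");
    }
    else
        printf("no dead items found; nothing to do\n");
}

int
main(void)
{
    bool dead[SKETCH_MAX_ITEMS] = {false};

    dead[3] = dead[7] = dead[8] = true;
    sketch_vacuum_one_page(dead, 1, 10);
    return 0;
}
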