PostgreSQL Source Code  git master
nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/transam.h"
21 #include "access/xloginsert.h"
22 #include "common/int.h"
23 #include "common/pg_prng.h"
24 #include "lib/qunique.h"
25 #include "miscadmin.h"
26 #include "storage/lmgr.h"
27 #include "storage/predicate.h"
28 #include "storage/smgr.h"
29 
30 /* Minimum tree height for application of fastpath optimization */
31 #define BTREE_FASTPATH_MIN_LEVEL 2
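/*
 * (Illustrative note, not from the upstream file: the fastpath is only
 * considered once _bt_getrootheight() reports at least this many levels;
 * see the check near the end of _bt_insertonpg(), just before
 * RelationSetTargetBlock() is called.  For a very small index the full
 * root-to-leaf descent is already cheap, so caching the rightmost leaf
 * block would not pay for itself.)
 */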
32 
33 
34 static BTStack _bt_search_insert(Relation rel, Relation heaprel,
35  BTInsertState insertstate);
36 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
37  Relation heapRel,
38  IndexUniqueCheck checkUnique, bool *is_unique,
39  uint32 *speculativeToken);
40 static OffsetNumber _bt_findinsertloc(Relation rel,
41  BTInsertState insertstate,
42  bool checkingunique,
43  bool indexUnchanged,
44  BTStack stack,
45  Relation heapRel);
46 static void _bt_stepright(Relation rel, Relation heaprel,
47  BTInsertState insertstate, BTStack stack);
48 static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
49  Buffer buf,
50  Buffer cbuf,
51  BTStack stack,
52  IndexTuple itup,
53  Size itemsz,
54  OffsetNumber newitemoff,
55  int postingoff,
56  bool split_only_page);
57 static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
58  Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
59  Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
60  IndexTuple nposting, uint16 postingoff);
61 static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
62  Buffer rbuf, BTStack stack, bool isroot, bool isonly);
63 static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
64 static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
65  OffsetNumber itup_off, bool newfirstdataitem);
66 static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
67  BTInsertState insertstate,
68  bool simpleonly, bool checkingunique,
69  bool uniquedup, bool indexUnchanged);
70 static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
71  OffsetNumber *deletable, int ndeletable,
72  IndexTuple newitem, OffsetNumber minoff,
73  OffsetNumber maxoff);
74 static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
75  int ndeletable, IndexTuple newitem,
76  int *nblocks);
77 static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
78 
79 /*
80  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
81  *
82  * This routine is called by the public interface routine, btinsert.
83  * By here, itup is filled in, including the TID.
84  *
85  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
86  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
87  * UNIQUE_CHECK_EXISTING) it will throw an error for a duplicate.
88  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
89  * don't actually insert.
90  *
91  * indexUnchanged executor hint indicates if itup is from an
92  * UPDATE that didn't logically change the indexed value, but
93  * must nevertheless have a new entry to point to a successor
94  * version.
95  *
96  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
97  * it must be true if the entry is known unique, else false.
98  * (In the current implementation we'll also return true after a
99  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
100  * that's just a coding artifact.)
101  */
102 bool
103 _bt_doinsert(Relation rel, IndexTuple itup,
104  IndexUniqueCheck checkUnique, bool indexUnchanged,
105  Relation heapRel)
106 {
107  bool is_unique = false;
108  BTInsertStateData insertstate;
109  BTScanInsert itup_key;
110  BTStack stack;
111  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
112 
113  /* we need an insertion scan key to do our search, so build one */
114  itup_key = _bt_mkscankey(rel, itup);
115 
116  if (checkingunique)
117  {
118  if (!itup_key->anynullkeys)
119  {
120  /* No (heapkeyspace) scantid until uniqueness established */
121  itup_key->scantid = NULL;
122  }
123  else
124  {
125  /*
126  * Scan key for new tuple contains NULL key values. Bypass
127  * checkingunique steps. They are unnecessary because core code
128  * considers NULL unequal to every value, including NULL.
129  *
130  * This optimization avoids O(N^2) behavior within the
131  * _bt_findinsertloc() heapkeyspace path when a unique index has a
132  * large number of "duplicates" with NULL key values.
133  */
134  checkingunique = false;
135  /* Tuple is unique in the sense that core code cares about */
136  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
137  is_unique = true;
138  }
139  }
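 /*
  * (Illustrative example, not from the upstream file: under the default
  * NULLS DISTINCT semantics, a unique index created with something like
  * CREATE UNIQUE INDEX ON t (a) accepts any number of rows with a IS NULL,
  * because no two NULL key values are ever treated as conflicting.  That is
  * why an insertion whose scan key contains a NULL can skip the
  * checkingunique steps entirely, as done in the block above.)
  */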
140 
141  /*
142  * Fill in the BTInsertState working area, to track the current page and
143  * position within the page to insert on.
144  *
145  * Note that itemsz is passed down to lower level code that deals with
146  * inserting the item. It must be MAXALIGN()'d. This ensures that space
147  * accounting code consistently considers the alignment overhead that we
148  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
149  * already conservative about alignment, but we don't rely on that from
150  * this distance. Besides, preserving the "true" tuple size in index
151  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
152  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
153  */
154  insertstate.itup = itup;
155  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
156  insertstate.itup_key = itup_key;
157  insertstate.bounds_valid = false;
158  insertstate.buf = InvalidBuffer;
159  insertstate.postingoff = 0;
160 
161 search:
162 
163  /*
164  * Find and lock the leaf page that the tuple should be added to by
165  * searching from the root page. insertstate.buf will hold a buffer that
166  * is locked in exclusive mode afterwards.
167  */
168  stack = _bt_search_insert(rel, heapRel, &insertstate);
169 
170  /*
171  * checkingunique inserts are not allowed to go ahead when two tuples with
172  * equal key attribute values would be visible to new MVCC snapshots once
173  * the xact commits. Check for conflicts in the locked page/buffer (if
174  * needed) here.
175  *
176  * It might be necessary to check a page to the right in _bt_check_unique,
177  * though that should be very rare. In practice the first page the value
178  * could be on (with scantid omitted) is almost always also the only page
179  * that a matching tuple might be found on. This is due to the behavior
180  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
181  * only be allowed to cross a page boundary when there is no candidate
182  * leaf page split point that avoids it. Also, _bt_check_unique can use
183  * the leaf page high key to determine that there will be no duplicates on
184  * the right sibling without actually visiting it (it uses the high key in
185  * cases where the new item happens to belong at the far right of the leaf
186  * page).
187  *
188  * NOTE: obviously, _bt_check_unique can only detect keys that are already
189  * in the index; so it cannot defend against concurrent insertions of the
190  * same key. We protect against that by means of holding a write lock on
191  * the first page the value could be on, with omitted/-inf value for the
192  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
193  * the same key must acquire a write lock on the same page, so only one
194  * would-be inserter can be making the check at one time. Furthermore,
195  * once we are past the check we hold write locks continuously until we
196  * have performed our insertion, so no later inserter can fail to see our
197  * insertion. (This requires some care in _bt_findinsertloc.)
198  *
199  * If we must wait for another xact, we release the lock while waiting,
200  * and then must perform a new search.
201  *
202  * For a partial uniqueness check, we don't wait for the other xact. Just
203  * let the tuple in and return false for possibly non-unique, or true for
204  * definitely unique.
205  */
206  if (checkingunique)
207  {
208  TransactionId xwait;
209  uint32 speculativeToken;
210 
211  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
212  &is_unique, &speculativeToken);
213 
214  if (unlikely(TransactionIdIsValid(xwait)))
215  {
216  /* Have to wait for the other guy ... */
217  _bt_relbuf(rel, insertstate.buf);
218  insertstate.buf = InvalidBuffer;
219 
220  /*
221  * If it's a speculative insertion, wait for it to finish (i.e. to
222  * go ahead with the insertion, or kill the tuple). Otherwise
223  * wait for the transaction to finish as usual.
224  */
225  if (speculativeToken)
226  SpeculativeInsertionWait(xwait, speculativeToken);
227  else
228  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
229 
230  /* start over... */
231  if (stack)
232  _bt_freestack(stack);
233  goto search;
234  }
235 
236  /* Uniqueness is established -- restore heap tid as scantid */
237  if (itup_key->heapkeyspace)
238  itup_key->scantid = &itup->t_tid;
239  }
240 
241  if (checkUnique != UNIQUE_CHECK_EXISTING)
242  {
243  OffsetNumber newitemoff;
244 
245  /*
246  * The only conflict predicate locking cares about for indexes is when
247  * an index tuple insert conflicts with an existing lock. We don't
248  * know the actual page we're going to insert on for sure just yet in
249  * checkingunique and !heapkeyspace cases, but it's okay to use the
250  * first page the value could be on (with scantid omitted) instead.
251  */
252  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
253 
254  /*
255  * Do the insertion. Note that insertstate contains cached binary
256  * search bounds established within _bt_check_unique when insertion is
257  * checkingunique.
258  */
259  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
260  indexUnchanged, stack, heapRel);
261  _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
262  stack, itup, insertstate.itemsz, newitemoff,
263  insertstate.postingoff, false);
264  }
265  else
266  {
267  /* just release the buffer */
268  _bt_relbuf(rel, insertstate.buf);
269  }
270 
271  /* be tidy */
272  if (stack)
273  _bt_freestack(stack);
274  pfree(itup_key);
275 
276  return is_unique;
277 }
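/*
 * (Illustrative sketch, not from the upstream file: _bt_doinsert() is
 * reached from the btree aminsert routine, btinsert() in nbtree.c, which
 * roughly does the following before calling here:
 *
 *     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 *     itup->t_tid = *ht_ctid;
 *     result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
 *     pfree(itup);
 *
 * This is why the header comment above can assume that itup is fully
 * formed, including its heap TID, by the time control arrives here.)
 */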
278 
279 /*
280  * _bt_search_insert() -- _bt_search() wrapper for inserts
281  *
282  * Search the tree for a particular scankey, or more precisely for the first
283  * leaf page it could be on. Try to make use of the fastpath optimization's
284  * rightmost leaf page cache before actually searching the tree from the root
285  * page, though.
286  *
287  * Return value is a stack of parent-page pointers (though see notes about
288  * fastpath optimization and page splits below). insertstate->buf is set to
289  * the address of the leaf-page buffer, which is write-locked and pinned in
290  * all cases (if necessary by creating a new empty root page for caller).
291  *
292  * The fastpath optimization avoids most of the work of searching the tree
293  * repeatedly when a single backend inserts successive new tuples on the
294  * rightmost leaf page of an index. A backend cache of the rightmost leaf
295  * page is maintained within _bt_insertonpg(), and used here. The cache is
296  * invalidated here when an insert of a non-pivot tuple must take place on a
297  * non-rightmost leaf page.
298  *
299  * The optimization helps with indexes on an auto-incremented field. It also
300  * helps with indexes on datetime columns, as well as indexes with lots of
301  * NULL values. (NULLs usually get inserted in the rightmost page for single
302  * column indexes, since they usually get treated as coming after everything
303  * else in the key space. Individual NULL tuples will generally be placed on
304  * the rightmost leaf page due to the influence of the heap TID column.)
305  *
306  * Note that we avoid applying the optimization when there is insufficient
307  * space on the rightmost page to fit caller's new item. This is necessary
308  * because we'll need to return a real descent stack when a page split is
309  * expected (actually, caller can cope with a leaf page split that uses a NULL
310  * stack, but that's very slow and so must be avoided). Note also that the
311  * fastpath optimization acquires the lock on the page conditionally as a way
312  * of reducing extra contention when there are concurrent insertions into the
313  * rightmost page (we give up if we'd have to wait for the lock). We assume
314  * that it isn't useful to apply the optimization when there is contention,
315  * since each per-backend cache won't stay valid for long.
316  */
317 static BTStack
318 _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
319 {
320  Assert(insertstate->buf == InvalidBuffer);
321  Assert(!insertstate->bounds_valid);
322  Assert(insertstate->postingoff == 0);
323 
324  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
325  {
326  /* Simulate a _bt_getbuf() call with conditional locking */
327  insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
328  if (_bt_conditionallockbuf(rel, insertstate->buf))
329  {
330  Page page;
331  BTPageOpaque opaque;
332 
333  _bt_checkpage(rel, insertstate->buf);
334  page = BufferGetPage(insertstate->buf);
335  opaque = BTPageGetOpaque(page);
336 
337  /*
338  * Check if the page is still the rightmost leaf page and has
339  * enough free space to accommodate the new tuple. Also check
340  * that the insertion scan key is strictly greater than the first
341  * non-pivot tuple on the page. (Note that we expect itup_key's
342  * scantid to be unset when our caller is a checkingunique
343  * inserter.)
344  */
345  if (P_RIGHTMOST(opaque) &&
346  P_ISLEAF(opaque) &&
347  !P_IGNORE(opaque) &&
348  PageGetFreeSpace(page) > insertstate->itemsz &&
349  PageGetMaxOffsetNumber(page) >= P_HIKEY &&
350  _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
351  {
352  /*
353  * Caller can use the fastpath optimization because cached
354  * block is still rightmost leaf page, which can fit caller's
355  * new tuple without splitting. Keep block in local cache for
356  * next insert, and have caller use NULL stack.
357  *
358  * Note that _bt_insert_parent() has an assertion that catches
359  * leaf page splits that somehow follow from a fastpath insert
360  * (it should only be passed a NULL stack when it must deal
361  * with a concurrent root page split, and never because a NULL
362  * stack was returned here).
363  */
364  return NULL;
365  }
366 
367  /* Page unsuitable for caller, drop lock and pin */
368  _bt_relbuf(rel, insertstate->buf);
369  }
370  else
371  {
372  /* Lock unavailable, drop pin */
373  ReleaseBuffer(insertstate->buf);
374  }
375 
376  /* Forget block, since cache doesn't appear to be useful */
377  RelationSetTargetBlock(rel, InvalidBlockNumber);
378  }
379 
380  /* Cannot use optimization -- descend tree, return proper descent stack */
381  return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
382  BT_WRITE);
383 }
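/*
 * (Illustrative example, not from the upstream file: the rightmost-page
 * cache pays off for strictly increasing keys, e.g. an index on a bigserial
 * or timestamp column filled by one backend issuing many successive
 * single-row INSERTs.  Each new key is greater than every existing key, so
 * the cached block set by _bt_insertonpg() keeps passing the checks above
 * and the full root-to-leaf descent is skipped, until the page fills up or
 * another backend holds the lock.)
 */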
384 
385 /*
386  * _bt_check_unique() -- Check for violation of unique index constraint
387  *
388  * Returns InvalidTransactionId if there is no conflict, else an xact ID
389  * we must wait for to see if it commits a conflicting tuple. If an actual
390  * conflict is detected, no return --- just ereport(). If an xact ID is
391  * returned, and the conflicting tuple still has a speculative insertion in
392  * progress, *speculativeToken is set to non-zero, and the caller can wait for
393  * the verdict on the insertion using SpeculativeInsertionWait().
394  *
395  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
396  * InvalidTransactionId because we don't want to wait. In this case we
397  * set *is_unique to false if there is a potential conflict, and the
398  * core code must redo the uniqueness check later.
399  *
400  * As a side-effect, sets state in insertstate that can later be used by
401  * _bt_findinsertloc() to reuse most of the binary search work we do
402  * here.
403  *
404  * This code treats NULLs as equal, unlike the default semantics for unique
405  * indexes. So do not call here when there are NULL values in scan key and
406  * the index uses the default NULLS DISTINCT mode.
407  */
408 static TransactionId
409 _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
410  IndexUniqueCheck checkUnique, bool *is_unique,
411  uint32 *speculativeToken)
412 {
413  IndexTuple itup = insertstate->itup;
414  IndexTuple curitup = NULL;
415  ItemId curitemid = NULL;
416  BTScanInsert itup_key = insertstate->itup_key;
417  SnapshotData SnapshotDirty;
418  OffsetNumber offset;
419  OffsetNumber maxoff;
420  Page page;
421  BTPageOpaque opaque;
422  Buffer nbuf = InvalidBuffer;
423  bool found = false;
424  bool inposting = false;
425  bool prevalldead = true;
426  int curposti = 0;
427 
428  /* Assume unique until we find a duplicate */
429  *is_unique = true;
430 
431  InitDirtySnapshot(SnapshotDirty);
432 
433  page = BufferGetPage(insertstate->buf);
434  opaque = BTPageGetOpaque(page);
435  maxoff = PageGetMaxOffsetNumber(page);
436 
437  /*
438  * Find the first tuple with the same key.
439  *
440  * This also saves the binary search bounds in insertstate. We use them
441  * in the fastpath below, but also in the _bt_findinsertloc() call later.
442  */
443  Assert(!insertstate->bounds_valid);
444  offset = _bt_binsrch_insert(rel, insertstate);
445 
446  /*
447  * Scan over all equal tuples, looking for live conflicts.
448  */
449  Assert(!insertstate->bounds_valid || insertstate->low == offset);
450  Assert(!itup_key->anynullkeys);
451  Assert(itup_key->scantid == NULL);
452  for (;;)
453  {
454  /*
455  * Each iteration of the loop processes one heap TID, not one index
456  * tuple. Current offset number for page isn't usually advanced on
457  * iterations that process heap TIDs from posting list tuples.
458  *
459  * "inposting" state is set when _inside_ a posting list --- not when
460  * we're at the start (or end) of a posting list. We advance curposti
461  * at the end of the iteration when inside a posting list tuple. In
462  * general, every loop iteration either advances the page offset or
463  * advances curposti --- an iteration that handles the rightmost/max
464  * heap TID in a posting list finally advances the page offset (and
465  * unsets "inposting").
466  *
467  * Make sure the offset points to an actual index tuple before trying
468  * to examine it...
469  */
470  if (offset <= maxoff)
471  {
472  /*
473  * Fastpath: In most cases, we can use cached search bounds to
474  * limit our consideration to items that are definitely
475  * duplicates. This fastpath doesn't apply when the original page
476  * is empty, or when initial offset is past the end of the
477  * original page, which may indicate that we need to examine a
478  * second or subsequent page.
479  *
480  * Note that this optimization allows us to avoid calling
481  * _bt_compare() directly when there are no duplicates, as long as
482  * the offset where the key will go is not at the end of the page.
483  */
484  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
485  {
486  Assert(insertstate->bounds_valid);
487  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
488  Assert(insertstate->low <= insertstate->stricthigh);
489  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
490  break;
491  }
492 
493  /*
494  * We can skip items that are already marked killed.
495  *
496  * In the presence of heavy update activity an index may contain
497  * many killed items with the same key; running _bt_compare() on
498  * each killed item gets expensive. Just advance over killed
499  * items as quickly as we can. We only apply _bt_compare() when
500  * we get to a non-killed item. We could reuse the bounds to
501  * avoid _bt_compare() calls for known equal tuples, but it
502  * doesn't seem worth it.
503  */
504  if (!inposting)
505  curitemid = PageGetItemId(page, offset);
506  if (inposting || !ItemIdIsDead(curitemid))
507  {
508  ItemPointerData htid;
509  bool all_dead = false;
510 
511  if (!inposting)
512  {
513  /* Plain tuple, or first TID in posting list tuple */
514  if (_bt_compare(rel, itup_key, page, offset) != 0)
515  break; /* we're past all the equal tuples */
516 
517  /* Advanced curitup */
518  curitup = (IndexTuple) PageGetItem(page, curitemid);
519  Assert(!BTreeTupleIsPivot(curitup));
520  }
521 
522  /* okay, we gotta fetch the heap tuple using htid ... */
523  if (!BTreeTupleIsPosting(curitup))
524  {
525  /* ... htid is from simple non-pivot tuple */
526  Assert(!inposting);
527  htid = curitup->t_tid;
528  }
529  else if (!inposting)
530  {
531  /* ... htid is first TID in new posting list */
532  inposting = true;
533  prevalldead = true;
534  curposti = 0;
535  htid = *BTreeTupleGetPostingN(curitup, 0);
536  }
537  else
538  {
539  /* ... htid is second or subsequent TID in posting list */
540  Assert(curposti > 0);
541  htid = *BTreeTupleGetPostingN(curitup, curposti);
542  }
543 
544  /*
545  * If we are doing a recheck, we expect to find the tuple we
546  * are rechecking. It's not a duplicate, but we have to keep
547  * scanning.
548  */
549  if (checkUnique == UNIQUE_CHECK_EXISTING &&
550  ItemPointerCompare(&htid, &itup->t_tid) == 0)
551  {
552  found = true;
553  }
554 
555  /*
556  * Check if there are any table tuples for this index entry
557  * satisfying SnapshotDirty. This is necessary because for AMs
558  * with optimizations like heap's HOT, we have just a single
559  * index entry for the entire chain.
560  */
561  else if (table_index_fetch_tuple_check(heapRel, &htid,
562  &SnapshotDirty,
563  &all_dead))
564  {
565  TransactionId xwait;
566 
567  /*
568  * It is a duplicate. If we are only doing a partial
569  * check, then don't bother checking if the tuple is being
570  * updated in another transaction. Just return the fact
571  * that it is a potential conflict and leave the full
572  * check till later. Don't invalidate binary search
573  * bounds.
574  */
575  if (checkUnique == UNIQUE_CHECK_PARTIAL)
576  {
577  if (nbuf != InvalidBuffer)
578  _bt_relbuf(rel, nbuf);
579  *is_unique = false;
580  return InvalidTransactionId;
581  }
582 
583  /*
584  * If this tuple is being updated by other transaction
585  * then we have to wait for its commit/abort.
586  */
587  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
588  SnapshotDirty.xmin : SnapshotDirty.xmax;
589 
590  if (TransactionIdIsValid(xwait))
591  {
592  if (nbuf != InvalidBuffer)
593  _bt_relbuf(rel, nbuf);
594  /* Tell _bt_doinsert to wait... */
595  *speculativeToken = SnapshotDirty.speculativeToken;
596  /* Caller releases lock on buf immediately */
597  insertstate->bounds_valid = false;
598  return xwait;
599  }
600 
601  /*
602  * Otherwise we have a definite conflict. But before
603  * complaining, look to see if the tuple we want to insert
604  * is itself now committed dead --- if so, don't complain.
605  * This is a waste of time in normal scenarios but we must
606  * do it to support CREATE INDEX CONCURRENTLY.
607  *
608  * We must follow HOT-chains here because during
609  * concurrent index build, we insert the root TID though
610  * the actual tuple may be somewhere in the HOT-chain.
611  * While following the chain we might not stop at the
612  * exact tuple which triggered the insert, but that's OK
613  * because if we find a live tuple anywhere in this chain,
614  * we have a unique key conflict. The other live tuple is
615  * not part of this chain because it had a different index
616  * entry.
617  */
618  htid = itup->t_tid;
619  if (table_index_fetch_tuple_check(heapRel, &htid,
620  SnapshotSelf, NULL))
621  {
622  /* Normal case --- it's still live */
623  }
624  else
625  {
626  /*
627  * It's been deleted, so no error, and no need to
628  * continue searching
629  */
630  break;
631  }
632 
633  /*
634  * Check for a conflict-in as we would if we were going to
635  * write to this page. We aren't actually going to write,
636  * but we want a chance to report SSI conflicts that would
637  * otherwise be masked by this unique constraint
638  * violation.
639  */
640  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
641 
642  /*
643  * This is a definite conflict. Break the tuple down into
644  * datums and report the error. But first, make sure we
645  * release the buffer locks we're holding ---
646  * BuildIndexValueDescription could make catalog accesses,
647  * which in the worst case might touch this same index and
648  * cause deadlocks.
649  */
650  if (nbuf != InvalidBuffer)
651  _bt_relbuf(rel, nbuf);
652  _bt_relbuf(rel, insertstate->buf);
653  insertstate->buf = InvalidBuffer;
654  insertstate->bounds_valid = false;
655 
656  {
657  Datum values[INDEX_MAX_KEYS];
658  bool isnull[INDEX_MAX_KEYS];
659  char *key_desc;
660 
661  index_deform_tuple(itup, RelationGetDescr(rel),
662  values, isnull);
663 
664  key_desc = BuildIndexValueDescription(rel, values,
665  isnull);
666 
667  ereport(ERROR,
668  (errcode(ERRCODE_UNIQUE_VIOLATION),
669  errmsg("duplicate key value violates unique constraint \"%s\"",
670  RelationGetRelationName(rel)),
671  key_desc ? errdetail("Key %s already exists.",
672  key_desc) : 0,
673  errtableconstraint(heapRel,
674  RelationGetRelationName(rel))));
675  }
676  }
677  else if (all_dead && (!inposting ||
678  (prevalldead &&
679  curposti == BTreeTupleGetNPosting(curitup) - 1)))
680  {
681  /*
682  * The conflicting tuple (or all HOT chains pointed to by
683  * all posting list TIDs) is dead to everyone, so mark the
684  * index entry killed.
685  */
686  ItemIdMarkDead(curitemid);
687  opaque->btpo_flags |= BTP_HAS_GARBAGE;
688 
689  /*
690  * Mark buffer with a dirty hint, since state is not
691  * crucial. Be sure to mark the proper buffer dirty.
692  */
693  if (nbuf != InvalidBuffer)
694  MarkBufferDirtyHint(nbuf, true);
695  else
696  MarkBufferDirtyHint(insertstate->buf, true);
697  }
698 
699  /*
700  * Remember if posting list tuple has even a single HOT chain
701  * whose members are not all dead
702  */
703  if (!all_dead && inposting)
704  prevalldead = false;
705  }
706  }
707 
708  if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
709  {
710  /* Advance to next TID in same posting list */
711  curposti++;
712  continue;
713  }
714  else if (offset < maxoff)
715  {
716  /* Advance to next tuple */
717  curposti = 0;
718  inposting = false;
719  offset = OffsetNumberNext(offset);
720  }
721  else
722  {
723  int highkeycmp;
724 
725  /* If scankey == hikey we gotta check the next page too */
726  if (P_RIGHTMOST(opaque))
727  break;
728  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
729  Assert(highkeycmp <= 0);
730  if (highkeycmp != 0)
731  break;
732  /* Advance to next non-dead page --- there must be one */
733  for (;;)
734  {
735  BlockNumber nblkno = opaque->btpo_next;
736 
737  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
738  page = BufferGetPage(nbuf);
739  opaque = BTPageGetOpaque(page);
740  if (!P_IGNORE(opaque))
741  break;
742  if (P_RIGHTMOST(opaque))
743  elog(ERROR, "fell off the end of index \"%s\"",
744  RelationGetRelationName(rel));
745  }
746  /* Will also advance to next tuple */
747  curposti = 0;
748  inposting = false;
749  maxoff = PageGetMaxOffsetNumber(page);
750  offset = P_FIRSTDATAKEY(opaque);
751  /* Don't invalidate binary search bounds */
752  }
753  }
754 
755  /*
756  * If we are doing a recheck then we should have found the tuple we are
757  * checking. Otherwise there's something very wrong --- probably, the
758  * index is on a non-immutable expression.
759  */
760  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
761  ereport(ERROR,
762  (errcode(ERRCODE_INTERNAL_ERROR),
763  errmsg("failed to re-find tuple within index \"%s\"",
764  RelationGetRelationName(rel)),
765  errhint("This may be because of a non-immutable index expression."),
766  errtableconstraint(heapRel,
767  RelationGetRelationName(rel))));
768 
769  if (nbuf != InvalidBuffer)
770  _bt_relbuf(rel, nbuf);
771 
772  return InvalidTransactionId;
773 }
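/*
 * (Illustrative example, not from the upstream file: a non-zero
 * *speculativeToken is returned when the conflicting heap tuple was
 * inserted speculatively by a concurrent INSERT ... ON CONFLICT that has
 * not yet confirmed or killed its tuple.  _bt_doinsert() then waits with
 * SpeculativeInsertionWait() rather than XactLockTableWait(), so it can
 * retry as soon as the speculative insertion is resolved instead of
 * waiting for the whole transaction to end.)
 */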
774 
775 
776 /*
777  * _bt_findinsertloc() -- Finds an insert location for a tuple
778  *
779  * On entry, insertstate buffer contains the page the new tuple belongs
780  * on. It is exclusive-locked and pinned by the caller.
781  *
782  * If 'checkingunique' is true, the buffer on entry is the first page
783  * that contains duplicates of the new key. If there are duplicates on
784  * multiple pages, the correct insertion position might be some page to
785  * the right, rather than the first page. In that case, this function
786  * moves right to the correct target page.
787  *
788  * (In a !heapkeyspace index, there can be multiple pages with the same
789  * high key, on which the new tuple could legitimately be placed. In
790  * that case, the caller passes the first page containing duplicates,
791  * just like when checkingunique=true. If that page doesn't have enough
792  * room for the new tuple, this function moves right, trying to find a
793  * legal page that does.)
794  *
795  * If 'indexUnchanged' is true, this is for an UPDATE that didn't
796  * logically change the indexed value, but must nevertheless have a new
797  * entry to point to a successor version. This hint from the executor
798  * will influence our behavior when the page might have to be split and
799  * we must consider our options. Bottom-up index deletion can avoid
800  * pathological version-driven page splits, but we only want to go to the
801  * trouble of trying it when we already have moderate confidence that
802  * it's appropriate. The hint should not significantly affect our
803  * behavior over time unless practically all inserts on to the leaf page
804  * get the hint.
805  *
806  * On exit, insertstate buffer contains the chosen insertion page, and
807  * the offset within that page is returned. If _bt_findinsertloc needed
808  * to move right, the lock and pin on the original page are released, and
809  * the new buffer is exclusively locked and pinned instead.
810  *
811  * If insertstate contains cached binary search bounds, we will take
812  * advantage of them. This avoids repeating comparisons that we made in
813  * _bt_check_unique() already.
814  */
815 static OffsetNumber
816 _bt_findinsertloc(Relation rel,
817  BTInsertState insertstate,
818  bool checkingunique,
819  bool indexUnchanged,
820  BTStack stack,
821  Relation heapRel)
822 {
823  BTScanInsert itup_key = insertstate->itup_key;
824  Page page = BufferGetPage(insertstate->buf);
825  BTPageOpaque opaque;
826  OffsetNumber newitemoff;
827 
828  opaque = BTPageGetOpaque(page);
829 
830  /* Check 1/3 of a page restriction */
831  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
832  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
833  insertstate->itup);
834 
835  Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
836  Assert(!insertstate->bounds_valid || checkingunique);
837  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
838  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
839  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
840 
841  if (itup_key->heapkeyspace)
842  {
843  /* Keep track of whether checkingunique duplicate seen */
844  bool uniquedup = indexUnchanged;
845 
846  /*
847  * If we're inserting into a unique index, we may have to walk right
848  * through leaf pages to find the one leaf page that we must insert on
849  * to.
850  *
851  * This is needed for checkingunique callers because a scantid was not
852  * used when we called _bt_search(). scantid can only be set after
853  * _bt_check_unique() has checked for duplicates. The buffer
854  * initially stored in insertstate->buf has the page where the first
855  * duplicate key might be found, which isn't always the page that new
856  * tuple belongs on. The heap TID attribute for new tuple (scantid)
857  * could force us to insert on a sibling page, though that should be
858  * very rare in practice.
859  */
860  if (checkingunique)
861  {
862  if (insertstate->low < insertstate->stricthigh)
863  {
864  /* Encountered a duplicate in _bt_check_unique() */
865  Assert(insertstate->bounds_valid);
866  uniquedup = true;
867  }
868 
869  for (;;)
870  {
871  /*
872  * Does the new tuple belong on this page?
873  *
874  * The earlier _bt_check_unique() call may well have
875  * established a strict upper bound on the offset for the new
876  * item. If it's not the last item of the page (i.e. if there
877  * is at least one tuple on the page that goes after the tuple
878  * we're inserting) then we know that the tuple belongs on
879  * this page. We can skip the high key check.
880  */
881  if (insertstate->bounds_valid &&
882  insertstate->low <= insertstate->stricthigh &&
883  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
884  break;
885 
886  /* Test '<=', not '!=', since scantid is set now */
887  if (P_RIGHTMOST(opaque) ||
888  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
889  break;
890 
891  _bt_stepright(rel, heapRel, insertstate, stack);
892  /* Update local state after stepping right */
893  page = BufferGetPage(insertstate->buf);
894  opaque = BTPageGetOpaque(page);
895  /* Assume duplicates (if checkingunique) */
896  uniquedup = true;
897  }
898  }
899 
900  /*
901  * If the target page cannot fit newitem, try to avoid splitting the
902  * page on insert by performing deletion or deduplication now
903  */
904  if (PageGetFreeSpace(page) < insertstate->itemsz)
905  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
906  checkingunique, uniquedup,
907  indexUnchanged);
908  }
909  else
910  {
911  /*----------
912  * This is a !heapkeyspace (version 2 or 3) index. The current page
913  * is the first page that we could insert the new tuple to, but there
914  * may be other pages to the right that we could opt to use instead.
915  *
916  * If the new key is equal to one or more existing keys, we can
917  * legitimately place it anywhere in the series of equal keys. In
918  * fact, if the new key is equal to the page's "high key" we can place
919  * it on the next page. If it is equal to the high key, and there's
920  * not room to insert the new tuple on the current page without
921  * splitting, then we move right hoping to find more free space and
922  * avoid a split.
923  *
924  * Keep scanning right until we
925  * (a) find a page with enough free space,
926  * (b) reach the last page where the tuple can legally go, or
927  * (c) get tired of searching.
928  * (c) is not flippant; it is important because if there are many
929  * pages' worth of equal keys, it's better to split one of the early
930  * pages than to scan all the way to the end of the run of equal keys
931  * on every insert. We implement "get tired" as a random choice,
932  * since stopping after scanning a fixed number of pages wouldn't work
933  * well (we'd never reach the right-hand side of previously split
934  * pages). The probability of moving right is set at 0.99, which may
935  * seem too high to change the behavior much, but it does an excellent
936  * job of preventing O(N^2) behavior with many equal keys.
937  *----------
938  */
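 /*
  * (Illustrative arithmetic, not from the upstream file: with a stop
  * probability of 1/100 per overflowing page, the number of pages visited
  * before "getting tired" follows a geometric distribution with a mean of
  * 100, so even a very long run of equal keys costs a bounded amount of
  * extra scanning per insert while still spreading insertions across the
  * run.)
  */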
939  while (PageGetFreeSpace(page) < insertstate->itemsz)
940  {
941  /*
942  * Before considering moving right, see if we can obtain enough
943  * space by erasing LP_DEAD items
944  */
945  if (P_HAS_GARBAGE(opaque))
946  {
947  /* Perform simple deletion */
948  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
949  false, false, false);
950 
951  if (PageGetFreeSpace(page) >= insertstate->itemsz)
952  break; /* OK, now we have enough space */
953  }
954 
955  /*
956  * Nope, so check conditions (b) and (c) enumerated above
957  *
958  * The earlier _bt_check_unique() call may well have established a
959  * strict upper bound on the offset for the new item. If it's not
960  * the last item of the page (i.e. if there is at least one tuple
961  * on the page that's greater than the tuple we're inserting to)
962  * then we know that the tuple belongs on this page. We can skip
963  * the high key check.
964  */
965  if (insertstate->bounds_valid &&
966  insertstate->low <= insertstate->stricthigh &&
967  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
968  break;
969 
970  if (P_RIGHTMOST(opaque) ||
971  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
972  pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
973  break;
974 
975  _bt_stepright(rel, heapRel, insertstate, stack);
976  /* Update local state after stepping right */
977  page = BufferGetPage(insertstate->buf);
978  opaque = BTPageGetOpaque(page);
979  }
980  }
981 
982  /*
983  * We should now be on the correct page. Find the offset within the page
984  * for the new tuple. (Possibly reusing earlier search bounds.)
985  */
986  Assert(P_RIGHTMOST(opaque) ||
987  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
988 
989  newitemoff = _bt_binsrch_insert(rel, insertstate);
990 
991  if (insertstate->postingoff == -1)
992  {
993  /*
994  * There is an overlapping posting list tuple with its LP_DEAD bit
995  * set. We don't want to unnecessarily unset its LP_DEAD bit while
996  * performing a posting list split, so perform simple index tuple
997  * deletion early.
998  */
999  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1000  false, false, false);
1001 
1002  /*
1003  * Do new binary search. New insert location cannot overlap with any
1004  * posting list now.
1005  */
1006  Assert(!insertstate->bounds_valid);
1007  insertstate->postingoff = 0;
1008  newitemoff = _bt_binsrch_insert(rel, insertstate);
1009  Assert(insertstate->postingoff == 0);
1010  }
1011 
1012  return newitemoff;
1013 }
1014 
1015 /*
1016  * Step right to next non-dead page, during insertion.
1017  *
1018  * This is a bit more complicated than moving right in a search. We must
1019  * write-lock the target page before releasing write lock on current page;
1020  * else someone else's _bt_check_unique scan could fail to see our insertion.
1021  * Write locks on intermediate dead pages won't do because we don't know when
1022  * they will get de-linked from the tree.
1023  *
1024  * This is more aggressive than it needs to be for non-unique !heapkeyspace
1025  * indexes.
1026  */
1027 static void
1028 _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1029  BTStack stack)
1030 {
1031  Page page;
1032  BTPageOpaque opaque;
1033  Buffer rbuf;
1034  BlockNumber rblkno;
1035 
1036  Assert(heaprel != NULL);
1037  page = BufferGetPage(insertstate->buf);
1038  opaque = BTPageGetOpaque(page);
1039 
1040  rbuf = InvalidBuffer;
1041  rblkno = opaque->btpo_next;
1042  for (;;)
1043  {
1044  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1045  page = BufferGetPage(rbuf);
1046  opaque = BTPageGetOpaque(page);
1047 
1048  /*
1049  * If this page was incompletely split, finish the split now. We do
1050  * this while holding a lock on the left sibling, which is not good
1051  * because finishing the split could be a fairly lengthy operation.
1052  * But this should happen very seldom.
1053  */
1054  if (P_INCOMPLETE_SPLIT(opaque))
1055  {
1056  _bt_finish_split(rel, heaprel, rbuf, stack);
1057  rbuf = InvalidBuffer;
1058  continue;
1059  }
1060 
1061  if (!P_IGNORE(opaque))
1062  break;
1063  if (P_RIGHTMOST(opaque))
1064  elog(ERROR, "fell off the end of index \"%s\"",
1065  RelationGetRelationName(rel));
1066 
1067  rblkno = opaque->btpo_next;
1068  }
1069  /* rbuf locked; unlock buf, update state for caller */
1070  _bt_relbuf(rel, insertstate->buf);
1071  insertstate->buf = rbuf;
1072  insertstate->bounds_valid = false;
1073 }
1074 
1075 /*----------
1076  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1077  *
1078  * This recursive procedure does the following things:
1079  *
1080  * + if postingoff != 0, splits existing posting list tuple
1081  * (since it overlaps with new 'itup' tuple).
1082  * + if necessary, splits the target page, using 'itup_key' for
1083  * suffix truncation on leaf pages (caller passes NULL for
1084  * non-leaf pages).
1085  * + inserts the new tuple (might be split from posting list).
1086  * + if the page was split, pops the parent stack, and finds the
1087  * right place to insert the new child pointer (by walking
1088  * right using information stored in the parent stack).
1089  * + invokes itself with the appropriate tuple for the right
1090  * child page on the parent.
1091  * + updates the metapage if a true root or fast root is split.
1092  *
1093  * On entry, we must have the correct buffer in which to do the
1094  * insertion, and the buffer must be pinned and write-locked. On return,
1095  * we will have dropped both the pin and the lock on the buffer.
1096  *
1097  * This routine only performs retail tuple insertions. 'itup' should
1098  * always be either a non-highkey leaf item, or a downlink (new high
1099  * key items are created indirectly, when a page is split). When
1100  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1101  * we're inserting the downlink for. This function will clear the
1102  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1103  *----------
1104  */
1105 static void
1106 _bt_insertonpg(Relation rel,
1107  Relation heaprel,
1108  BTScanInsert itup_key,
1109  Buffer buf,
1110  Buffer cbuf,
1111  BTStack stack,
1112  IndexTuple itup,
1113  Size itemsz,
1114  OffsetNumber newitemoff,
1115  int postingoff,
1116  bool split_only_page)
1117 {
1118  Page page;
1119  BTPageOpaque opaque;
1120  bool isleaf,
1121  isroot,
1122  isrightmost,
1123  isonly;
1124  IndexTuple oposting = NULL;
1125  IndexTuple origitup = NULL;
1126  IndexTuple nposting = NULL;
1127 
1128  page = BufferGetPage(buf);
1129  opaque = BTPageGetOpaque(page);
1130  isleaf = P_ISLEAF(opaque);
1131  isroot = P_ISROOT(opaque);
1132  isrightmost = P_RIGHTMOST(opaque);
1133  isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1134 
1135  /* child buffer must be given iff inserting on an internal page */
1136  Assert(isleaf == !BufferIsValid(cbuf));
1137  /* tuple must have appropriate number of attributes */
1138  Assert(!isleaf ||
1139  BTreeTupleGetNAtts(itup, rel) ==
1140  IndexRelationGetNumberOfAttributes(rel));
1141  Assert(isleaf ||
1142  BTreeTupleGetNAtts(itup, rel) <=
1143  IndexRelationGetNumberOfKeyAttributes(rel));
1144  Assert(!BTreeTupleIsPosting(itup));
1145  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1146  /* Caller must always finish incomplete split for us */
1147  Assert(!P_INCOMPLETE_SPLIT(opaque));
1148 
1149  /*
1150  * Every internal page should have exactly one negative infinity item at
1151  * all times. Only _bt_split() and _bt_newlevel() should add items that
1152  * become negative infinity items through truncation, since they're the
1153  * only routines that allocate new internal pages.
1154  */
1155  Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1156 
1157  /*
1158  * Do we need to split an existing posting list item?
1159  */
1160  if (postingoff != 0)
1161  {
1162  ItemId itemid = PageGetItemId(page, newitemoff);
1163 
1164  /*
1165  * The new tuple is a duplicate with a heap TID that falls inside the
1166  * range of an existing posting list tuple on a leaf page. Prepare to
1167  * split an existing posting list. Overwriting the posting list with
1168  * its post-split version is treated as an extra step in either the
1169  * insert or page split critical section.
1170  */
1171  Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1172  oposting = (IndexTuple) PageGetItem(page, itemid);
1173 
1174  /*
1175  * postingoff value comes from earlier call to _bt_binsrch_posting().
1176  * Its binary search might think that a plain tuple must be a posting
1177  * list tuple that needs to be split. This can happen with corruption
1178  * involving an existing plain tuple that is a duplicate of the new
1179  * item, up to and including its table TID. Check for that here in
1180  * passing.
1181  *
1182  * Also verify that our caller has made sure that the existing posting
1183  * list tuple does not have its LP_DEAD bit set.
1184  */
1185  if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1186  ereport(ERROR,
1187  (errcode(ERRCODE_INDEX_CORRUPTED),
1188  errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1189  ItemPointerGetBlockNumber(&itup->t_tid),
1190  ItemPointerGetOffsetNumber(&itup->t_tid),
1191  newitemoff, BufferGetBlockNumber(buf),
1192  RelationGetRelationName(rel))));
1193 
1194  /* use a mutable copy of itup as our itup from here on */
1195  origitup = itup;
1196  itup = CopyIndexTuple(origitup);
1197  nposting = _bt_swap_posting(itup, oposting, postingoff);
1198  /* itup now contains rightmost/max TID from oposting */
1199 
1200  /* Alter offset so that newitem goes after posting list */
1201  newitemoff = OffsetNumberNext(newitemoff);
1202  }
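 /*
  * (Worked example, not from the upstream file, following the behavior of
  * _bt_swap_posting(): suppose oposting holds heap TIDs
  * {(10,1), (10,4), (10,9)} and the incoming duplicate's TID is (10,6).
  * After the swap, nposting holds {(10,1), (10,4), (10,6)} while itup now
  * carries the displaced maximum TID (10,9), and newitemoff has been bumped
  * so that itup lands immediately after the posting list, keeping heap TIDs
  * in order.)
  */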
1203 
1204  /*
1205  * Do we need to split the page to fit the item on it?
1206  *
1207  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1208  * so this comparison is correct even though we appear to be accounting
1209  * only for the item and not for its line pointer.
1210  */
1211  if (PageGetFreeSpace(page) < itemsz)
1212  {
1213  Buffer rbuf;
1214 
1215  Assert(!split_only_page);
1216 
1217  /* split the buffer into left and right halves */
1218  rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1219  itup, origitup, nposting, postingoff);
1220  PredicateLockPageSplit(rel,
1221  BufferGetBlockNumber(buf),
1222  BufferGetBlockNumber(rbuf));
1223 
1224  /*----------
1225  * By here,
1226  *
1227  * + our target page has been split;
1228  * + the original tuple has been inserted;
1229  * + we have write locks on both the old (left half)
1230  * and new (right half) buffers, after the split; and
1231  * + we know the key we want to insert into the parent
1232  * (it's the "high key" on the left child page).
1233  *
1234  * We're ready to do the parent insertion. We need to hold onto the
1235  * locks for the child pages until we locate the parent, but we can
1236  * at least release the lock on the right child before doing the
1237  * actual insertion. The lock on the left child will be released
1238  * last of all by parent insertion, where it is the 'cbuf' of parent
1239  * page.
1240  *----------
1241  */
1242  _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1243  }
1244  else
1245  {
1246  Buffer metabuf = InvalidBuffer;
1247  Page metapg = NULL;
1248  BTMetaPageData *metad = NULL;
1249  BlockNumber blockcache;
1250 
1251  /*
1252  * If we are doing this insert because we split a page that was the
1253  * only one on its tree level, but was not the root, it may have been
1254  * the "fast root". We need to ensure that the fast root link points
1255  * at or above the current page. We can safely acquire a lock on the
1256  * metapage here --- see comments for _bt_newlevel().
1257  */
1258  if (unlikely(split_only_page))
1259  {
1260  Assert(!isleaf);
1261  Assert(BufferIsValid(cbuf));
1262 
1263  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1264  metapg = BufferGetPage(metabuf);
1265  metad = BTPageGetMeta(metapg);
1266 
1267  if (metad->btm_fastlevel >= opaque->btpo_level)
1268  {
1269  /* no update wanted */
1270  _bt_relbuf(rel, metabuf);
1271  metabuf = InvalidBuffer;
1272  }
1273  }
1274 
1275  /* Do the update. No ereport(ERROR) until changes are logged */
1276  START_CRIT_SECTION();
1277 
1278  if (postingoff != 0)
1279  memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1280 
1281  if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1282  false) == InvalidOffsetNumber)
1283  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1284  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1285 
1286  MarkBufferDirty(buf);
1287 
1288  if (BufferIsValid(metabuf))
1289  {
1290  /* upgrade meta-page if needed */
1291  if (metad->btm_version < BTREE_NOVAC_VERSION)
1292  _bt_upgrademetapage(metapg);
1293  metad->btm_fastroot = BufferGetBlockNumber(buf);
1294  metad->btm_fastlevel = opaque->btpo_level;
1295  MarkBufferDirty(metabuf);
1296  }
1297 
1298  /*
1299  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1300  * finishes a split
1301  */
1302  if (!isleaf)
1303  {
1304  Page cpage = BufferGetPage(cbuf);
1305  BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1306 
1307  Assert(P_INCOMPLETE_SPLIT(cpageop));
1308  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1309  MarkBufferDirty(cbuf);
1310  }
1311 
1312  /* XLOG stuff */
1313  if (RelationNeedsWAL(rel))
1314  {
1315  xl_btree_insert xlrec;
1316  xl_btree_metadata xlmeta;
1317  uint8 xlinfo;
1318  XLogRecPtr recptr;
1319  uint16 upostingoff;
1320 
1321  xlrec.offnum = newitemoff;
1322 
1323  XLogBeginInsert();
1324  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1325 
1326  if (isleaf && postingoff == 0)
1327  {
1328  /* Simple leaf insert */
1329  xlinfo = XLOG_BTREE_INSERT_LEAF;
1330  }
1331  else if (postingoff != 0)
1332  {
1333  /*
1334  * Leaf insert with posting list split. Must include
1335  * postingoff field before newitem/orignewitem.
1336  */
1337  Assert(isleaf);
1338  xlinfo = XLOG_BTREE_INSERT_POST;
1339  }
1340  else
1341  {
1342  /* Internal page insert, which finishes a split on cbuf */
1343  xlinfo = XLOG_BTREE_INSERT_UPPER;
1344  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1345 
1346  if (BufferIsValid(metabuf))
1347  {
1348  /* Actually, it's an internal page insert + meta update */
1349  xlinfo = XLOG_BTREE_INSERT_META;
1350 
1351  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1352  xlmeta.version = metad->btm_version;
1353  xlmeta.root = metad->btm_root;
1354  xlmeta.level = metad->btm_level;
1355  xlmeta.fastroot = metad->btm_fastroot;
1356  xlmeta.fastlevel = metad->btm_fastlevel;
1357  xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1358  xlmeta.allequalimage = metad->btm_allequalimage;
1359 
1360  XLogRegisterBuffer(2, metabuf,
1361  REGBUF_WILL_INIT | REGBUF_STANDARD);
1362  XLogRegisterBufData(2, (char *) &xlmeta,
1363  sizeof(xl_btree_metadata));
1364  }
1365  }
1366 
1367  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1368  if (postingoff == 0)
1369  {
1370  /* Just log itup from caller */
1371  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1372  }
1373  else
1374  {
1375  /*
1376  * Insert with posting list split (XLOG_BTREE_INSERT_POST
1377  * record) case.
1378  *
1379  * Log postingoff. Also log origitup, not itup. REDO routine
1380  * must reconstruct final itup (as well as nposting) using
1381  * _bt_swap_posting().
1382  */
1383  upostingoff = postingoff;
1384 
1385  XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1386  XLogRegisterBufData(0, (char *) origitup,
1387  IndexTupleSize(origitup));
1388  }
1389 
1390  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1391 
1392  if (BufferIsValid(metabuf))
1393  PageSetLSN(metapg, recptr);
1394  if (!isleaf)
1395  PageSetLSN(BufferGetPage(cbuf), recptr);
1396 
1397  PageSetLSN(page, recptr);
1398  }
1399 
1400  END_CRIT_SECTION();
1401 
1402  /* Release subsidiary buffers */
1403  if (BufferIsValid(metabuf))
1404  _bt_relbuf(rel, metabuf);
1405  if (!isleaf)
1406  _bt_relbuf(rel, cbuf);
1407 
1408  /*
1409  * Cache the block number if this is the rightmost leaf page. Cache
1410  * may be used by a future inserter within _bt_search_insert().
1411  */
1412  blockcache = InvalidBlockNumber;
1413  if (isrightmost && isleaf && !isroot)
1414  blockcache = BufferGetBlockNumber(buf);
1415 
1416  /* Release buffer for insertion target block */
1417  _bt_relbuf(rel, buf);
1418 
1419  /*
1420  * If we decided to cache the insertion target block before releasing
1421  * its buffer lock, then cache it now. Check the height of the tree
1422  * first, though. We don't go for the optimization with small
1423  * indexes. Defer final check to this point to ensure that we don't
1424  * call _bt_getrootheight while holding a buffer lock.
1425  */
1426  if (BlockNumberIsValid(blockcache) &&
1427  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1428  RelationSetTargetBlock(rel, blockcache);
1429  }
1430 
1431  /* be tidy */
1432  if (postingoff != 0)
1433  {
1434  /* itup is actually a modified copy of caller's original */
1435  pfree(nposting);
1436  pfree(itup);
1437  }
1438 }
1439 
1440 /*
1441  * _bt_split() -- split a page in the btree.
1442  *
1443  * On entry, buf is the page to split, and is pinned and write-locked.
1444  * newitemoff etc. tell us about the new item that must be inserted
1445  * along with the data from the original page.
1446  *
1447  * itup_key is used for suffix truncation on leaf pages (internal
1448  * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1449  * is the left-sibling of the page we're inserting the downlink for.
1450  * This function will clear the INCOMPLETE_SPLIT flag on it, and
1451  * release the buffer.
1452  *
1453  * orignewitem, nposting, and postingoff are needed when an insert of
1454  * orignewitem results in both a posting list split and a page split.
1455  * These extra posting list split details are used here in the same
1456  * way as they are used in the more common case where a posting list
1457  * split does not coincide with a page split. We need to deal with
1458  * posting list splits directly in order to ensure that everything
1459  * that follows from the insert of orignewitem is handled as a single
1460  * atomic operation (though caller's insert of a new pivot/downlink
1461  * into parent page will still be a separate operation). See
1462  * nbtree/README for details on the design of posting list splits.
1463  *
1464  * Returns the new right sibling of buf, pinned and write-locked.
1465  * The pin and lock on buf are maintained.
1466  */
1467 static Buffer
1468 _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1469  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1470  IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1471 {
1472  Buffer rbuf;
1473  Page origpage;
1474  Page leftpage,
1475  rightpage;
1476  BlockNumber origpagenumber,
1477  rightpagenumber;
1478  BTPageOpaque ropaque,
1479  lopaque,
1480  oopaque;
1481  Buffer sbuf = InvalidBuffer;
1482  Page spage = NULL;
1483  BTPageOpaque sopaque = NULL;
1484  Size itemsz;
1485  ItemId itemid;
1486  IndexTuple firstright,
1487  lefthighkey;
1488  OffsetNumber firstrightoff;
1489  OffsetNumber afterleftoff,
1490  afterrightoff,
1491  minusinfoff;
1492  OffsetNumber origpagepostingoff;
1493  OffsetNumber maxoff;
1494  OffsetNumber i;
1495  bool newitemonleft,
1496  isleaf,
1497  isrightmost;
1498 
1499  /*
1500  * origpage is the original page to be split. leftpage is a temporary
1501  * buffer that receives the left-sibling data, which will be copied back
1502  * into origpage on success. rightpage is the new page that will receive
1503  * the right-sibling data.
1504  *
1505  * leftpage is allocated after choosing a split point. rightpage's new
1506  * buffer isn't acquired until after leftpage is initialized and has new
1507  * high key, the last point where splitting the page may fail (barring
1508  * corruption). Failing before acquiring new buffer won't have lasting
1509  * consequences, since origpage won't have been modified and leftpage is
1510  * only workspace.
1511  */
1512  origpage = BufferGetPage(buf);
1513  oopaque = BTPageGetOpaque(origpage);
1514  isleaf = P_ISLEAF(oopaque);
1515  isrightmost = P_RIGHTMOST(oopaque);
1516  maxoff = PageGetMaxOffsetNumber(origpage);
1517  origpagenumber = BufferGetBlockNumber(buf);
1518 
1519  /*
1520  * Choose a point to split origpage at.
1521  *
1522  * A split point can be thought of as a point _between_ two existing data
1523  * items on origpage (the lastleft and firstright tuples), provided you
1524  * pretend that the new item that didn't fit is already on origpage.
1525  *
1526  * Since origpage does not actually contain newitem, the representation of
1527  * split points needs to work with two boundary cases: splits where
1528  * newitem is lastleft, and splits where newitem is firstright.
1529  * newitemonleft resolves the ambiguity that would otherwise exist when
1530  * newitemoff == firstrightoff. In all other cases it's clear which side
1531  * of the split every tuple goes on from context. newitemonleft is
1532  * usually (but not always) redundant information.
1533  *
1534  * firstrightoff is supposed to be an origpage offset number, but it's
1535  * possible that its value will be maxoff+1, which is "past the end" of
1536  * origpage. This happens in the rare case where newitem goes after all
1537  * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1538  * origpage at the point that leaves newitem alone on new right page. Any
1539  * "!newitemonleft && newitemoff == firstrightoff" split point makes
1540  * newitem the firstright tuple, though, so this case isn't a special
1541  * case.
1542  */
1543  firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1544  newitem, &newitemonleft);
1545 
1546  /* Allocate temp buffer for leftpage */
1547  leftpage = PageGetTempPage(origpage);
1548  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1549  lopaque = BTPageGetOpaque(leftpage);
1550 
1551  /*
1552  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1553  * and HAS_GARBAGE flags.
1554  */
1555  lopaque->btpo_flags = oopaque->btpo_flags;
1556  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1557  /* set flag in leftpage indicating that rightpage has no downlink yet */
1558  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1559  lopaque->btpo_prev = oopaque->btpo_prev;
1560  /* handle btpo_next after rightpage buffer acquired */
1561  lopaque->btpo_level = oopaque->btpo_level;
1562  /* handle btpo_cycleid after rightpage buffer acquired */
1563 
1564  /*
1565  * Copy the original page's LSN into leftpage, which will become the
1566  * updated version of the page. We need this because XLogInsert will
1567  * examine the LSN and possibly dump it in a page image.
1568  */
1569  PageSetLSN(leftpage, PageGetLSN(origpage));
1570 
1571  /*
1572  * Determine page offset number of existing overlapped-with-orignewitem
1573  * posting list when it is necessary to perform a posting list split in
1574  * passing. Note that newitem was already changed by caller (newitem no
1575  * longer has the orignewitem TID).
1576  *
1577  * This page offset number (origpagepostingoff) will be used to pretend
1578  * that the posting split has already taken place, even though the
1579  * required modifications to origpage won't occur until we reach the
1580  * critical section. The lastleft and firstright tuples of our page split
1581  * point should, in effect, come from an imaginary version of origpage
1582  * that has the nposting tuple instead of the original posting list tuple.
1583  *
1584  * Note: _bt_findsplitloc() should have compensated for coinciding posting
1585  * list splits in just the same way, at least in theory. It doesn't
1586  * bother with that, though. In practice it won't affect its choice of
1587  * split point.
1588  */
1589  origpagepostingoff = InvalidOffsetNumber;
1590  if (postingoff != 0)
1591  {
1592  Assert(isleaf);
1593  Assert(ItemPointerCompare(&orignewitem->t_tid,
1594  &newitem->t_tid) < 0);
1595  Assert(BTreeTupleIsPosting(nposting));
1596  origpagepostingoff = OffsetNumberPrev(newitemoff);
1597  }
1598 
1599  /*
1600  * The high key for the new left page is a possibly-truncated copy of
1601  * firstright on the leaf level (it's "firstright itself" on internal
1602  * pages; see !isleaf comments below). This may seem to be contrary to
1603  * Lehman & Yao's approach of using a copy of lastleft as the new high key
1604  * when splitting on the leaf level. It isn't, though.
1605  *
1606  * Suffix truncation will leave the left page's high key fully equal to
1607  * lastleft when lastleft and firstright are equal prior to heap TID (that
1608  * is, the tiebreaker TID value comes from lastleft). It isn't actually
1609  * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1610  * "subtree" invariant to hold. It's sufficient to make sure that the new
1611  * leaf high key is strictly less than firstright, and greater than or
1612  * equal to (not necessarily equal to) lastleft. In other words, when
1613  * suffix truncation isn't possible during a leaf page split, we take
1614  * L&Y's exact approach to generating a new high key for the left page.
1615  * (Actually, that is slightly inaccurate. We don't just use a copy of
1616  * lastleft. A tuple with all the keys from firstright but the max heap
1617  * TID from lastleft is used, to avoid introducing a special case.)
1618  */
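 /*
  * Illustrative example (hypothetical key values): in an index on
  * (lastname, id), with lastleft = ('Jones', 552) and firstright =
  * ('Smith', 104), the first attribute alone separates the two tuples, so
  * _bt_truncate() can return a pivot containing just 'Smith', with the id
  * attribute and heap TID truncated away.  Truncated attributes compare as
  * minus infinity, so the new high key sorts after ('Jones', 552) and
  * before ('Smith', 104), which is all the invariant requires.
  */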
1619  if (!newitemonleft && newitemoff == firstrightoff)
1620  {
1621  /* incoming tuple becomes firstright */
1622  itemsz = newitemsz;
1623  firstright = newitem;
1624  }
1625  else
1626  {
1627  /* existing item at firstrightoff becomes firstright */
1628  itemid = PageGetItemId(origpage, firstrightoff);
1629  itemsz = ItemIdGetLength(itemid);
1630  firstright = (IndexTuple) PageGetItem(origpage, itemid);
1631  if (firstrightoff == origpagepostingoff)
1632  firstright = nposting;
1633  }
1634 
1635  if (isleaf)
1636  {
1637  IndexTuple lastleft;
1638 
1639  /* Attempt suffix truncation for leaf page splits */
1640  if (newitemonleft && newitemoff == firstrightoff)
1641  {
1642  /* incoming tuple becomes lastleft */
1643  lastleft = newitem;
1644  }
1645  else
1646  {
1647  OffsetNumber lastleftoff;
1648 
1649  /* existing item before firstrightoff becomes lastleft */
1650  lastleftoff = OffsetNumberPrev(firstrightoff);
1651  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1652  itemid = PageGetItemId(origpage, lastleftoff);
1653  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1654  if (lastleftoff == origpagepostingoff)
1655  lastleft = nposting;
1656  }
1657 
1658  lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1659  itemsz = IndexTupleSize(lefthighkey);
1660  }
1661  else
1662  {
1663  /*
1664  * Don't perform suffix truncation on a copy of firstright to make
1665  * left page high key for internal page splits. Must use firstright
1666  * as new high key directly.
1667  *
1668  * Each distinct separator key value originates as a leaf level high
1669  * key; all other separator keys/pivot tuples are copied from one
1670  * level down. A separator key in a grandparent page must be
1671  * identical to high key in rightmost parent page of the subtree to
1672  * its left, which must itself be identical to high key in rightmost
1673  * child page of that same subtree (this even applies to separator
1674  * from grandparent's high key). There must always be an unbroken
1675  * "seam" of identical separator keys that guide index scans at every
1676  * level, starting from the grandparent. That's why suffix truncation
1677  * is unsafe here.
1678  *
1679  * Internal page splits will truncate firstright into a "negative
1680  * infinity" data item when it gets inserted on the new right page
1681  * below, though. This happens during the call to _bt_pgaddtup() for
1682  * the new first data item for right page. Do not confuse this
1683  * mechanism with suffix truncation. It is just a convenient way of
1684  * implementing page splits that split the internal page "inside"
1685  * firstright. The lefthighkey separator key cannot appear a second
1686  * time in the right page (only firstright's downlink goes in right
1687  * page).
1688  */
1689  lefthighkey = firstright;
1690  }
1691 
1692  /*
1693  * Add new high key to leftpage
1694  */
1695  afterleftoff = P_HIKEY;
1696 
1697  Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1698  Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1699  IndexRelationGetNumberOfKeyAttributes(rel));
1700  Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1701  if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1702  false) == InvalidOffsetNumber)
1703  elog(ERROR, "failed to add high key to the left sibling"
1704  " while splitting block %u of index \"%s\"",
1705  origpagenumber, RelationGetRelationName(rel));
1706  afterleftoff = OffsetNumberNext(afterleftoff);
1707 
1708  /*
1709  * Acquire a new right page to split into, now that left page has a new
1710  * high key. From here on, it's not okay to throw an error without
1711  * zeroing rightpage first. This coding rule ensures that we won't
1712  * confuse future VACUUM operations, which might otherwise try to re-find
1713  * a downlink to a leftover junk page as the page undergoes deletion.
1714  *
1715  * It would be reasonable to start the critical section just after the new
1716  * rightpage buffer is acquired instead; that would allow us to avoid
1717  * leftover junk pages without bothering to zero rightpage. We do it this
1718  * way because it avoids an unnecessary PANIC when either origpage or its
1719  * existing sibling page is corrupt.
1720  */
1721  rbuf = _bt_allocbuf(rel, heaprel);
1722  rightpage = BufferGetPage(rbuf);
1723  rightpagenumber = BufferGetBlockNumber(rbuf);
1724  /* rightpage was initialized by _bt_getbuf */
1725  ropaque = BTPageGetOpaque(rightpage);
1726 
1727  /*
1728  * Finish off remaining leftpage special area fields. They cannot be set
1729  * before both origpage (leftpage) and rightpage buffers are acquired and
1730  * locked.
1731  *
1732  * btpo_cycleid is only used with leaf pages, though we set it here in all
1733  * cases just to be consistent.
1734  */
1735  lopaque->btpo_next = rightpagenumber;
1736  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1737 
1738  /*
1739  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1740  * and HAS_GARBAGE flags.
1741  */
1742  ropaque->btpo_flags = oopaque->btpo_flags;
1743  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1744  ropaque->btpo_prev = origpagenumber;
1745  ropaque->btpo_next = oopaque->btpo_next;
1746  ropaque->btpo_level = oopaque->btpo_level;
1747  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1748 
1749  /*
1750  * Add new high key to rightpage where necessary.
1751  *
1752  * If the page we're splitting is not the rightmost page at its level in
1753  * the tree, then the first entry on the page is the high key from
1754  * origpage.
1755  */
1756  afterrightoff = P_HIKEY;
1757 
1758  if (!isrightmost)
1759  {
1760  IndexTuple righthighkey;
1761 
1762  itemid = PageGetItemId(origpage, P_HIKEY);
1763  itemsz = ItemIdGetLength(itemid);
1764  righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1765  Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1766  Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1767  IndexRelationGetNumberOfKeyAttributes(rel));
1768  if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1769  false, false) == InvalidOffsetNumber)
1770  {
1771  memset(rightpage, 0, BufferGetPageSize(rbuf));
1772  elog(ERROR, "failed to add high key to the right sibling"
1773  " while splitting block %u of index \"%s\"",
1774  origpagenumber, RelationGetRelationName(rel));
1775  }
1776  afterrightoff = OffsetNumberNext(afterrightoff);
1777  }
1778 
1779  /*
1780  * Internal page splits truncate first data item on right page -- it
1781  * becomes "minus infinity" item for the page. Set this up here.
1782  */
1783  minusinfoff = InvalidOffsetNumber;
1784  if (!isleaf)
1785  minusinfoff = afterrightoff;
1786 
1787  /*
1788  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1789  * additional pivot tuples in !isleaf case) to the appropriate page.
1790  *
1791  * Note: we *must* insert at least the right page's items in item-number
1792  * order, for the benefit of _bt_restore_page().
1793  */
1794  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1795  {
1796  IndexTuple dataitem;
1797 
1798  itemid = PageGetItemId(origpage, i);
1799  itemsz = ItemIdGetLength(itemid);
1800  dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1801 
1802  /* replace original item with nposting due to posting split? */
1803  if (i == origpagepostingoff)
1804  {
1805  Assert(BTreeTupleIsPosting(dataitem));
1806  Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1807  dataitem = nposting;
1808  }
1809 
1810  /* does new item belong before this one? */
1811  else if (i == newitemoff)
1812  {
1813  if (newitemonleft)
1814  {
1815  Assert(newitemoff <= firstrightoff);
1816  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1817  false))
1818  {
1819  memset(rightpage, 0, BufferGetPageSize(rbuf));
1820  elog(ERROR, "failed to add new item to the left sibling"
1821  " while splitting block %u of index \"%s\"",
1822  origpagenumber, RelationGetRelationName(rel));
1823  }
1824  afterleftoff = OffsetNumberNext(afterleftoff);
1825  }
1826  else
1827  {
1828  Assert(newitemoff >= firstrightoff);
1829  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1830  afterrightoff == minusinfoff))
1831  {
1832  memset(rightpage, 0, BufferGetPageSize(rbuf));
1833  elog(ERROR, "failed to add new item to the right sibling"
1834  " while splitting block %u of index \"%s\"",
1835  origpagenumber, RelationGetRelationName(rel));
1836  }
1837  afterrightoff = OffsetNumberNext(afterrightoff);
1838  }
1839  }
1840 
1841  /* decide which page to put it on */
1842  if (i < firstrightoff)
1843  {
1844  if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1845  {
1846  memset(rightpage, 0, BufferGetPageSize(rbuf));
1847  elog(ERROR, "failed to add old item to the left sibling"
1848  " while splitting block %u of index \"%s\"",
1849  origpagenumber, RelationGetRelationName(rel));
1850  }
1851  afterleftoff = OffsetNumberNext(afterleftoff);
1852  }
1853  else
1854  {
1855  if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1856  afterrightoff == minusinfoff))
1857  {
1858  memset(rightpage, 0, BufferGetPageSize(rbuf));
1859  elog(ERROR, "failed to add old item to the right sibling"
1860  " while splitting block %u of index \"%s\"",
1861  origpagenumber, RelationGetRelationName(rel));
1862  }
1863  afterrightoff = OffsetNumberNext(afterrightoff);
1864  }
1865  }
1866 
1867  /* Handle case where newitem goes at the end of rightpage */
1868  if (i <= newitemoff)
1869  {
1870  /*
1871  * Can't have newitemonleft here; that would imply we were told to put
1872  * *everything* on the left page, which cannot fit (if it could, we'd
1873  * not be splitting the page).
1874  */
1875  Assert(!newitemonleft && newitemoff == maxoff + 1);
1876  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1877  afterrightoff == minusinfoff))
1878  {
1879  memset(rightpage, 0, BufferGetPageSize(rbuf));
1880  elog(ERROR, "failed to add new item to the right sibling"
1881  " while splitting block %u of index \"%s\"",
1882  origpagenumber, RelationGetRelationName(rel));
1883  }
1884  afterrightoff = OffsetNumberNext(afterrightoff);
1885  }
1886 
1887  /*
1888  * We have to grab the original right sibling (if any) and update its prev
1889  * link. We are guaranteed that this is deadlock-free, since we couple
1890  * the locks in the standard order: left to right.
1891  */
1892  if (!isrightmost)
1893  {
1894  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1895  spage = BufferGetPage(sbuf);
1896  sopaque = BTPageGetOpaque(spage);
1897  if (sopaque->btpo_prev != origpagenumber)
1898  {
1899  memset(rightpage, 0, BufferGetPageSize(rbuf));
1900  ereport(ERROR,
1901  (errcode(ERRCODE_INDEX_CORRUPTED),
1902  errmsg_internal("right sibling's left-link doesn't match: "
1903  "block %u links to %u instead of expected %u in index \"%s\"",
1904  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1905  RelationGetRelationName(rel))));
1906  }
1907 
1908  /*
1909  * Check to see if we can set the SPLIT_END flag in the right-hand
1910  * split page; this can save some I/O for vacuum since it need not
1911  * proceed to the right sibling. We can set the flag if the right
1912  * sibling has a different cycleid: that means it could not be part of
1913  * a group of pages that were all split off from the same ancestor
1914  * page. If you're confused, imagine that page A splits to A B and
1915  * then again, yielding A C B, while vacuum is in progress. Tuples
1916  * originally in A could now be in either B or C, hence vacuum must
1917  * examine both pages. But if D, our right sibling, has a different
1918  * cycleid then it could not contain any tuples that were in A when
1919  * the vacuum started.
1920  */
1921  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1922  ropaque->btpo_flags |= BTP_SPLIT_END;
1923  }
1924 
1925  /*
1926  * Right sibling is locked, new siblings are prepared, but original page
1927  * is not updated yet.
1928  *
1929  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1930  * not starting the critical section till here because we haven't been
1931  * scribbling on the original page yet; see comments above.
1932  */
1933  START_CRIT_SECTION();
1934 
1935  /*
1936  * By here, the original data page has been split into two new halves, and
1937  * these are correct. The algorithm requires that the left page never
1938  * move during a split, so we copy the new left page back on top of the
1939  * original. We need to do this before writing the WAL record, so that
1940  * XLogInsert can WAL log an image of the page if necessary.
1941  */
1942  PageRestoreTempPage(leftpage, origpage);
1943  /* leftpage, lopaque must not be used below here */
1944 
1945  MarkBufferDirty(buf);
1946  MarkBufferDirty(rbuf);
1947 
1948  if (!isrightmost)
1949  {
1950  sopaque->btpo_prev = rightpagenumber;
1951  MarkBufferDirty(sbuf);
1952  }
1953 
1954  /*
1955  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1956  * a split
1957  */
1958  if (!isleaf)
1959  {
1960  Page cpage = BufferGetPage(cbuf);
1961  BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1962 
1963  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1964  MarkBufferDirty(cbuf);
1965  }
1966 
1967  /* XLOG stuff */
1968  if (RelationNeedsWAL(rel))
1969  {
1970  xl_btree_split xlrec;
1971  uint8 xlinfo;
1972  XLogRecPtr recptr;
1973 
1974  xlrec.level = ropaque->btpo_level;
1975  /* See comments below on newitem, orignewitem, and posting lists */
1976  xlrec.firstrightoff = firstrightoff;
1977  xlrec.newitemoff = newitemoff;
1978  xlrec.postingoff = 0;
1979  if (postingoff != 0 && origpagepostingoff < firstrightoff)
1980  xlrec.postingoff = postingoff;
1981 
1982  XLogBeginInsert();
1983  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1984 
1985  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1986  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1987  /* Log original right sibling, since we've changed its prev-pointer */
1988  if (!isrightmost)
1989  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1990  if (!isleaf)
1991  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1992 
1993  /*
1994  * Log the new item, if it was inserted on the left page. (If it was
1995  * put on the right page, we don't need to explicitly WAL log it
1996  * because it's included with all the other items on the right page.)
1997  * Show the new item as belonging to the left page buffer, so that it
1998  * is not stored if XLogInsert decides it needs a full-page image of
1999  * the left page. We always store newitemoff in the record, though.
2000  *
2001  * The details are sometimes slightly different for page splits that
2002  * coincide with a posting list split. If both the replacement
2003  * posting list and newitem go on the right page, then we don't need
2004  * to log anything extra, just like the simple !newitemonleft
2005  * no-posting-split case (postingoff is set to zero in the WAL record,
2006  * so recovery doesn't need to process a posting list split at all).
2007  * Otherwise, we set postingoff and log orignewitem instead of
2008  * newitem, despite having actually inserted newitem. REDO routine
2009  * must reconstruct nposting and newitem using _bt_swap_posting().
2010  *
2011  * Note: It's possible that our page split point is the point that
2012  * makes the posting list lastleft and newitem firstright. This is
2013  * the only case where we log orignewitem/newitem despite newitem
2014  * going on the right page. If XLogInsert decides that it can omit
2015  * orignewitem due to logging a full-page image of the left page,
2016  * everything still works out, since recovery only needs to log
2017  * orignewitem for items on the left page (just like the regular
2018  * newitem-logged case).
2019  */
2020  if (newitemonleft && xlrec.postingoff == 0)
2021  XLogRegisterBufData(0, (char *) newitem, newitemsz);
2022  else if (xlrec.postingoff != 0)
2023  {
2024  Assert(isleaf);
2025  Assert(newitemonleft || firstrightoff == newitemoff);
2026  Assert(newitemsz == IndexTupleSize(orignewitem));
2027  XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2028  }
2029 
2030  /* Log the left page's new high key */
2031  if (!isleaf)
2032  {
2033  /* lefthighkey isn't local copy, get current pointer */
2034  itemid = PageGetItemId(origpage, P_HIKEY);
2035  lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2036  }
2037  XLogRegisterBufData(0, (char *) lefthighkey,
2038  MAXALIGN(IndexTupleSize(lefthighkey)));
2039 
2040  /*
2041  * Log the contents of the right page in the format understood by
2042  * _bt_restore_page(). The whole right page will be recreated.
2043  *
2044  * Direct access to page is not good but faster - we should implement
2045  * some new func in page API. Note we only store the tuples
2046  * themselves, knowing that they were inserted in item-number order
2047  * and so the line pointers can be reconstructed. See comments for
2048  * _bt_restore_page().
2049  */
2050  XLogRegisterBufData(1,
2051  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2052  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2053 
2054  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2055  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2056 
2057  PageSetLSN(origpage, recptr);
2058  PageSetLSN(rightpage, recptr);
2059  if (!isrightmost)
2060  PageSetLSN(spage, recptr);
2061  if (!isleaf)
2062  PageSetLSN(BufferGetPage(cbuf), recptr);
2063  }
2064 
2065  END_CRIT_SECTION();
2066 
2067  /* release the old right sibling */
2068  if (!isrightmost)
2069  _bt_relbuf(rel, sbuf);
2070 
2071  /* release the child */
2072  if (!isleaf)
2073  _bt_relbuf(rel, cbuf);
2074 
2075  /* be tidy */
2076  if (isleaf)
2077  pfree(lefthighkey);
2078 
2079  /* split's done */
2080  return rbuf;
2081 }
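/*
 * Recap of the WAL registrations made above (a summary of the code only;
 * the authoritative record layout is xl_btree_split in access/nbtxlog.h):
 * block reference 0 is the original (left) page, with the new left high
 * key, and newitem or orignewitem when the item must be replayed there,
 * attached as buffer data; block reference 1 is the new right page, whose
 * contents are logged wholesale from pd_upper to pd_special so that
 * _bt_restore_page() can rebuild it; block reference 2 is the old right
 * sibling, registered because its btpo_prev link changed; block reference
 * 3 is the child whose INCOMPLETE_SPLIT flag was cleared, and is present
 * only for internal-page splits.
 */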
2082 
2083 /*
2084  * _bt_insert_parent() -- Insert downlink into parent, completing split.
2085  *
2086  * On entry, buf and rbuf are the left and right split pages, which we
2087  * still hold write locks on. Both locks will be released here. We
2088  * release the rbuf lock once we have a write lock on the page that we
2089  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2090  * The lock on buf is released at the same point as the lock on the parent
2091  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2092  * atomic operation that completes the split by inserting a new downlink.
2093  *
2094  * stack - stack showing how we got here. Will be NULL when splitting true
2095  * root, or during concurrent root split, where we can be inefficient
2096  * isroot - we split the true root
2097  * isonly - we split a page alone on its level (might have been fast root)
2098  */
2099 static void
2100 _bt_insert_parent(Relation rel,
2101  Relation heaprel,
2102  Buffer buf,
2103  Buffer rbuf,
2104  BTStack stack,
2105  bool isroot,
2106  bool isonly)
2107 {
2108  Assert(heaprel != NULL);
2109 
2110  /*
2111  * Here we have to do something Lehman and Yao don't talk about: deal with
2112  * a root split and construction of a new root. If our stack is empty
2113  * then we have just split a node on what had been the root level when we
2114  * descended the tree. If it was still the root then we perform a
2115  * new-root construction. If it *wasn't* the root anymore, search to find
2116  * the next higher level that someone constructed meanwhile, and find the
2117  * right place to insert as for the normal case.
2118  *
2119  * If we have to search for the parent level, we do so by re-descending
2120  * from the root. This is not super-efficient, but it's rare enough not
2121  * to matter.
2122  */
2123  if (isroot)
2124  {
2125  Buffer rootbuf;
2126 
2127  Assert(stack == NULL);
2128  Assert(isonly);
2129  /* create a new root node one level up and update the metapage */
2130  rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2131  /* release the split buffers */
2132  _bt_relbuf(rel, rootbuf);
2133  _bt_relbuf(rel, rbuf);
2134  _bt_relbuf(rel, buf);
2135  }
2136  else
2137  {
2138  BlockNumber bknum = BufferGetBlockNumber(buf);
2139  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2140  Page page = BufferGetPage(buf);
2141  IndexTuple new_item;
2142  BTStackData fakestack;
2143  IndexTuple ritem;
2144  Buffer pbuf;
2145 
2146  if (stack == NULL)
2147  {
2148  BTPageOpaque opaque;
2149 
2150  elog(DEBUG2, "concurrent ROOT page split");
2151  opaque = BTPageGetOpaque(page);
2152 
2153  /*
2154  * We should never reach here when a leaf page split takes place
2155  * despite the insert of newitem being able to apply the fastpath
2156  * optimization. Make sure of that with an assertion.
2157  *
2158  * This is more of a performance issue than a correctness issue.
2159  * The fastpath won't have a descent stack. Using a phony stack
2160  * here works, but never rely on that. The fastpath should be
2161  * rejected within _bt_search_insert() when the rightmost leaf
2162  * page will split, since it's faster to go through _bt_search()
2163  * and get a stack in the usual way.
2164  */
2165  Assert(!(P_ISLEAF(opaque) &&
2166  BlockNumberIsValid(RelationGetTargetBlock(rel))));
2167 
2168  /* Find the leftmost page at the next level up */
2169  pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2170  /* Set up a phony stack entry pointing there */
2171  stack = &fakestack;
2172  stack->bts_blkno = BufferGetBlockNumber(pbuf);
2173  stack->bts_offset = InvalidOffsetNumber;
2174  stack->bts_parent = NULL;
2175  _bt_relbuf(rel, pbuf);
2176  }
2177 
2178  /* get high key from left, a strict lower bound for new right page */
2179  ritem = (IndexTuple) PageGetItem(page,
2180  PageGetItemId(page, P_HIKEY));
2181 
2182  /* form an index tuple that points at the new right page */
2183  new_item = CopyIndexTuple(ritem);
2184  BTreeTupleSetDownLink(new_item, rbknum);
2185 
2186  /*
2187  * Re-find and write lock the parent of buf.
2188  *
2189  * It's possible that the location of buf's downlink has changed since
2190  * our initial _bt_search() descent. _bt_getstackbuf() will detect
2191  * and recover from this, updating the stack, which ensures that the
2192  * new downlink will be inserted at the correct offset. Even buf's
2193  * parent may have changed.
2194  */
2195  pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2196 
2197  /*
2198  * Unlock the right child. The left child will be unlocked in
2199  * _bt_insertonpg().
2200  *
2201  * Unlocking the right child must be delayed until here to ensure that
2202  * no concurrent VACUUM operation can become confused. Page deletion
2203  * cannot be allowed to fail to re-find a downlink for the rbuf page.
2204  * (Actually, this is just a vestige of how things used to work. The
2205  * page deletion code is expected to check for the INCOMPLETE_SPLIT
2206  * flag on the left child. It won't attempt deletion of the right
2207  * child until the split is complete. Despite all this, we opt to
2208  * conservatively delay unlocking the right child until here.)
2209  */
2210  _bt_relbuf(rel, rbuf);
2211 
2212  if (pbuf == InvalidBuffer)
2213  ereport(ERROR,
2214  (errcode(ERRCODE_INDEX_CORRUPTED),
2215  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2216  RelationGetRelationName(rel), bknum, rbknum)));
2217 
2218  /* Recursively insert into the parent */
2219  _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2220  new_item, MAXALIGN(IndexTupleSize(new_item)),
2221  stack->bts_offset + 1, 0, isonly);
2222 
2223  /* be tidy */
2224  pfree(new_item);
2225  }
2226 }
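/*
 * Illustrative call chain (a sketch of how the routines above compose, not
 * additional behavior): a leaf split in _bt_split() is followed by
 * _bt_insert_parent(), which calls _bt_insertonpg() to add the new
 * downlink to the parent.  If that insertion forces the parent to split in
 * turn, _bt_insertonpg() calls _bt_split() and _bt_insert_parent() again,
 * one level higher, and so on.  When the page that split was the root,
 * _bt_newlevel() ends the recursion by building a new root above it.
 */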
2227 
2228 /*
2229  * _bt_finish_split() -- Finish an incomplete split
2230  *
2231  * A crash or other failure can leave a split incomplete. The insertion
2232  * routines won't allow insertion on a page that is incompletely split.
2233  * Before inserting on such a page, call _bt_finish_split().
2234  *
2235  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2236  * and unpinned.
2237  *
2238  * Caller must provide a valid heaprel, since finishing a page split requires
2239  * allocating a new page if and when the parent page splits in turn.
2240  */
2241 void
2242 _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2243 {
2244  Page lpage = BufferGetPage(lbuf);
2245  BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2246  Buffer rbuf;
2247  Page rpage;
2248  BTPageOpaque rpageop;
2249  bool wasroot;
2250  bool wasonly;
2251 
2252  Assert(P_INCOMPLETE_SPLIT(lpageop));
2253  Assert(heaprel != NULL);
2254 
2255  /* Lock right sibling, the one missing the downlink */
2256  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2257  rpage = BufferGetPage(rbuf);
2258  rpageop = BTPageGetOpaque(rpage);
2259 
2260  /* Could this be a root split? */
2261  if (!stack)
2262  {
2263  Buffer metabuf;
2264  Page metapg;
2265  BTMetaPageData *metad;
2266 
2267  /* acquire lock on the metapage */
2268  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2269  metapg = BufferGetPage(metabuf);
2270  metad = BTPageGetMeta(metapg);
2271 
2272  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2273 
2274  _bt_relbuf(rel, metabuf);
2275  }
2276  else
2277  wasroot = false;
2278 
2279  /* Was this the only page on the level before split? */
2280  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2281 
2282  elog(DEBUG1, "finishing incomplete split of %u/%u",
2283  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2284 
2285  _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2286 }
2287 
2288 /*
2289  * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2290  * tuple whose downlink points to child page.
2291  *
2292  * Caller passes child's block number, which is used to identify
2293  * associated pivot tuple in parent page using a linear search that
2294  * matches on pivot's downlink/block number. The expected location of
2295  * the pivot tuple is taken from the stack one level above the child
2296  * page. This is used as a starting point. Insertions into the
2297  * parent level could cause the pivot tuple to move right; deletions
2298  * could cause it to move left, but not left of the page we previously
2299  * found it on.
2300  *
2301  * Caller can use its stack to relocate the pivot tuple/downlink for
2302  * any same-level page to the right of the page found by its initial
2303  * descent. This is necessary because of the possibility that caller
2304  * moved right to recover from a concurrent page split. It's also
2305  * convenient for certain callers to be able to step right when there
2306  * wasn't a concurrent page split, while still using their original
2307  * stack. For example, the checkingunique _bt_doinsert() case may
2308  * have to step right when there are many physical duplicates, and its
2309  * scantid forces an insertion to the right of the "first page the
2310  * value could be on". (This is also relied on by all of our callers
2311  * when dealing with !heapkeyspace indexes.)
2312  *
2313  * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2314  * tuple not found (should not happen). Adjusts bts_blkno &
2315  * bts_offset if changed. Page split caller should insert its new
2316  * pivot tuple for its new right sibling page on parent page, at the
2317  * offset number bts_offset + 1.
2318  */
2319 Buffer
2320 _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2321 {
2322  BlockNumber blkno;
2323  OffsetNumber start;
2324 
2325  blkno = stack->bts_blkno;
2326  start = stack->bts_offset;
2327 
2328  for (;;)
2329  {
2330  Buffer buf;
2331  Page page;
2332  BTPageOpaque opaque;
2333 
2334  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2335  page = BufferGetPage(buf);
2336  opaque = BTPageGetOpaque(page);
2337 
2338  Assert(heaprel != NULL);
2339  if (P_INCOMPLETE_SPLIT(opaque))
2340  {
2341  _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2342  continue;
2343  }
2344 
2345  if (!P_IGNORE(opaque))
2346  {
2347  OffsetNumber offnum,
2348  minoff,
2349  maxoff;
2350  ItemId itemid;
2351  IndexTuple item;
2352 
2353  minoff = P_FIRSTDATAKEY(opaque);
2354  maxoff = PageGetMaxOffsetNumber(page);
2355 
2356  /*
2357  * start = InvalidOffsetNumber means "search the whole page". We
2358  * need this test anyway due to the possibility that the page has a high
2359  * key now when it didn't before.
2360  */
2361  if (start < minoff)
2362  start = minoff;
2363 
2364  /*
2365  * Need this check too, to guard against possibility that page
2366  * split since we visited it originally.
2367  */
2368  if (start > maxoff)
2369  start = OffsetNumberNext(maxoff);
2370 
2371  /*
2372  * These loops will check every item on the page --- but in an
2373  * order that's attuned to the probability of where it actually
2374  * is. Scan to the right first, then to the left.
2375  */
2376  for (offnum = start;
2377  offnum <= maxoff;
2378  offnum = OffsetNumberNext(offnum))
2379  {
2380  itemid = PageGetItemId(page, offnum);
2381  item = (IndexTuple) PageGetItem(page, itemid);
2382 
2383  if (BTreeTupleGetDownLink(item) == child)
2384  {
2385  /* Return accurate pointer to where link is now */
2386  stack->bts_blkno = blkno;
2387  stack->bts_offset = offnum;
2388  return buf;
2389  }
2390  }
2391 
2392  for (offnum = OffsetNumberPrev(start);
2393  offnum >= minoff;
2394  offnum = OffsetNumberPrev(offnum))
2395  {
2396  itemid = PageGetItemId(page, offnum);
2397  item = (IndexTuple) PageGetItem(page, itemid);
2398 
2399  if (BTreeTupleGetDownLink(item) == child)
2400  {
2401  /* Return accurate pointer to where link is now */
2402  stack->bts_blkno = blkno;
2403  stack->bts_offset = offnum;
2404  return buf;
2405  }
2406  }
2407  }
2408 
2409  /*
2410  * The item we're looking for moved right at least one page.
2411  *
2412  * Lehman and Yao couple/chain locks when moving right here, which we
2413  * can avoid. See nbtree/README.
2414  */
2415  if (P_RIGHTMOST(opaque))
2416  {
2417  _bt_relbuf(rel, buf);
2418  return InvalidBuffer;
2419  }
2420  blkno = opaque->btpo_next;
2421  start = InvalidOffsetNumber;
2422  _bt_relbuf(rel, buf);
2423  }
2424 }
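/*
 * Illustrative example of the search order used above (hypothetical
 * offsets): with start = 7, minoff = 2 and maxoff = 10, the downlink is
 * looked for at offsets 7, 8, 9, 10 and then at 6, 5, 4, 3, 2 before
 * moving right to the next page.  The expected position is tried first
 * because the downlink usually hasn't moved far since the initial descent.
 */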
2425 
2426 /*
2427  * _bt_newlevel() -- Create a new level above root page.
2428  *
2429  * We've just split the old root page and need to create a new one.
2430  * In order to do this, we add a new root page to the file, then lock
2431  * the metadata page and update it. This is guaranteed to be deadlock-
2432  * free, because all readers release their locks on the metadata page
2433  * before trying to lock the root, and all writers lock the root before
2434  * trying to lock the metadata page. We have a write lock on the old
2435  * root page, so we have not introduced any cycles into the waits-for
2436  * graph.
2437  *
2438  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2439  * locked. On exit, a new root page exists with entries for the
2440  * two new children, metapage is updated and unlocked/unpinned.
2441  * The new root buffer is returned to caller which has to unlock/unpin
2442  * lbuf, rbuf & rootbuf.
2443  */
2444 static Buffer
2445 _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2446 {
2447  Buffer rootbuf;
2448  Page lpage,
2449  rootpage;
2450  BlockNumber lbkno,
2451  rbkno;
2452  BlockNumber rootblknum;
2453  BTPageOpaque rootopaque;
2454  BTPageOpaque lopaque;
2455  ItemId itemid;
2456  IndexTuple item;
2457  IndexTuple left_item;
2458  Size left_item_sz;
2459  IndexTuple right_item;
2460  Size right_item_sz;
2461  Buffer metabuf;
2462  Page metapg;
2463  BTMetaPageData *metad;
2464 
2465  lbkno = BufferGetBlockNumber(lbuf);
2466  rbkno = BufferGetBlockNumber(rbuf);
2467  lpage = BufferGetPage(lbuf);
2468  lopaque = BTPageGetOpaque(lpage);
2469 
2470  /* get a new root page */
2471  rootbuf = _bt_allocbuf(rel, heaprel);
2472  rootpage = BufferGetPage(rootbuf);
2473  rootblknum = BufferGetBlockNumber(rootbuf);
2474 
2475  /* acquire lock on the metapage */
2476  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2477  metapg = BufferGetPage(metabuf);
2478  metad = BTPageGetMeta(metapg);
2479 
2480  /*
2481  * Create downlink item for left page (old root). The key value used is
2482  * "minus infinity", a sentinel value that's reliably less than any real
2483  * key value that could appear in the left page.
2484  */
2485  left_item_sz = sizeof(IndexTupleData);
2486  left_item = (IndexTuple) palloc(left_item_sz);
2487  left_item->t_info = left_item_sz;
2488  BTreeTupleSetDownLink(left_item, lbkno);
2489  BTreeTupleSetNAtts(left_item, 0, false);
2490 
2491  /*
2492  * Create downlink item for right page. The key for it is obtained from
2493  * the "high key" position in the left page.
2494  */
2495  itemid = PageGetItemId(lpage, P_HIKEY);
2496  right_item_sz = ItemIdGetLength(itemid);
2497  item = (IndexTuple) PageGetItem(lpage, itemid);
2498  right_item = CopyIndexTuple(item);
2499  BTreeTupleSetDownLink(right_item, rbkno);
2500 
2501  /* NO EREPORT(ERROR) from here till newroot op is logged */
2502  START_CRIT_SECTION();
2503 
2504  /* upgrade metapage if needed */
2505  if (metad->btm_version < BTREE_NOVAC_VERSION)
2506  _bt_upgrademetapage(metapg);
2507 
2508  /* set btree special data */
2509  rootopaque = BTPageGetOpaque(rootpage);
2510  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2511  rootopaque->btpo_flags = BTP_ROOT;
2512  rootopaque->btpo_level =
2513  (BTPageGetOpaque(lpage))->btpo_level + 1;
2514  rootopaque->btpo_cycleid = 0;
2515 
2516  /* update metapage data */
2517  metad->btm_root = rootblknum;
2518  metad->btm_level = rootopaque->btpo_level;
2519  metad->btm_fastroot = rootblknum;
2520  metad->btm_fastlevel = rootopaque->btpo_level;
2521 
2522  /*
2523  * Insert the left page pointer into the new root page. The root page is
2524  * the rightmost page on its level so there is no "high key" in it; the
2525  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2526  *
2527  * Note: we *must* insert the two items in item-number order, for the
2528  * benefit of _bt_restore_page().
2529  */
2530  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2531  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2532  false, false) == InvalidOffsetNumber)
2533  elog(PANIC, "failed to add leftkey to new root page"
2534  " while splitting block %u of index \"%s\"",
2535  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2536 
2537  /*
2538  * insert the right page pointer into the new root page.
2539  */
2540  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2541  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2542  IndexRelationGetNumberOfKeyAttributes(rel));
2543  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2544  false, false) == InvalidOffsetNumber)
2545  elog(PANIC, "failed to add rightkey to new root page"
2546  " while splitting block %u of index \"%s\"",
2547  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2548 
2549  /* Clear the incomplete-split flag in the left child */
2550  Assert(P_INCOMPLETE_SPLIT(lopaque));
2551  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2552  MarkBufferDirty(lbuf);
2553 
2554  MarkBufferDirty(rootbuf);
2555  MarkBufferDirty(metabuf);
2556 
2557  /* XLOG stuff */
2558  if (RelationNeedsWAL(rel))
2559  {
2560  xl_btree_newroot xlrec;
2561  XLogRecPtr recptr;
2562  xl_btree_metadata md;
2563 
2564  xlrec.rootblk = rootblknum;
2565  xlrec.level = metad->btm_level;
2566 
2567  XLogBeginInsert();
2568  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2569 
2570  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2571  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2572  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2573 
2574  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2575  md.version = metad->btm_version;
2576  md.root = rootblknum;
2577  md.level = metad->btm_level;
2578  md.fastroot = rootblknum;
2579  md.fastlevel = metad->btm_level;
2580  md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2581  md.allequalimage = metad->btm_allequalimage;
2582 
2583  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2584 
2585  /*
2586  * Direct access to page is not good but faster - we should implement
2587  * some new func in page API.
2588  */
2589  XLogRegisterBufData(0,
2590  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2591  ((PageHeader) rootpage)->pd_special -
2592  ((PageHeader) rootpage)->pd_upper);
2593 
2594  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2595 
2596  PageSetLSN(lpage, recptr);
2597  PageSetLSN(rootpage, recptr);
2598  PageSetLSN(metapg, recptr);
2599  }
2600 
2601  END_CRIT_SECTION();
2602 
2603  /* done with metapage */
2604  _bt_relbuf(rel, metabuf);
2605 
2606  pfree(left_item);
2607  pfree(right_item);
2608 
2609  return rootbuf;
2610 }
2611 
2612 /*
2613  * _bt_pgaddtup() -- add a data item to a particular page during split.
2614  *
2615  * The difference between this routine and a bare PageAddItem call is
2616  * that this code can deal with the first data item on an internal btree
2617  * page in passing. This data item (which is called "firstright" within
2618  * _bt_split()) has a key that must be treated as minus infinity after
2619  * the split. Therefore, we truncate away all attributes when caller
2620  * specifies it's the first data item on page (downlink is not changed,
2621  * though). This extra step is only needed for the right page of an
2622  * internal page split. There is no need to do this for the first data
2623  * item on the existing/left page, since that will already have been
2624  * truncated during an earlier page split.
2625  *
2626  * See _bt_split() for a high level explanation of why we truncate here.
2627  * Note that this routine has nothing to do with suffix truncation,
2628  * despite using some of the same infrastructure.
2629  */
2630 static inline bool
2631 _bt_pgaddtup(Page page,
2632  Size itemsize,
2633  IndexTuple itup,
2634  OffsetNumber itup_off,
2635  bool newfirstdataitem)
2636 {
2637  IndexTupleData trunctuple;
2638 
2639  if (newfirstdataitem)
2640  {
2641  trunctuple = *itup;
2642  trunctuple.t_info = sizeof(IndexTupleData);
2643  BTreeTupleSetNAtts(&trunctuple, 0, false);
2644  itup = &trunctuple;
2645  itemsize = sizeof(IndexTupleData);
2646  }
2647 
2648  if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2649  false) == InvalidOffsetNumber))
2650  return false;
2651 
2652  return true;
2653 }
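/*
 * Note on the truncated copy built above: "trunctuple = *itup" copies only
 * the fixed-size IndexTupleData header, which is where a pivot tuple keeps
 * its downlink, so the negative-infinity item still points at the correct
 * child page even though every key attribute has been dropped.
 */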
2654 
2655 /*
2656  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2657  *
2658  * There are three operations performed here: simple index deletion, bottom-up
2659  * index deletion, and deduplication. If all three operations fail to free
2660  * enough space for the incoming item then caller will go on to split the
2661  * page. We always consider simple deletion first. If that doesn't work out
2662  * we consider alternatives. Callers that only want us to consider simple
2663  * deletion (without any fallback) ask for that using the 'simpleonly'
2664  * argument.
2665  *
2666  * We usually pick only one alternative "complex" operation when simple
2667  * deletion alone won't prevent a page split. The 'checkingunique',
2668  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2669  *
2670  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2671  * level flag was found set. The flag was useful back when there wasn't
2672  * necessarily one single page for a duplicate tuple to go on (before heap TID
2673  * became a part of the key space in version 4 indexes). But we don't
2674  * actually look at the flag anymore (it's not a gating condition for our
2675  * caller). That would cause us to miss tuples that are safe to delete,
2676  * without getting any benefit in return. We know that the alternative is to
2677  * split the page; scanning the line pointer array in passing won't have
2678  * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2679  * all this because !heapkeyspace indexes must still do a "getting tired"
2680  * linear search, and so are likely to get some benefit from using it as a
2681  * gating condition.)
2682  */
2683 static void
2684 _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2685  BTInsertState insertstate,
2686  bool simpleonly, bool checkingunique,
2687  bool uniquedup, bool indexUnchanged)
2688 {
2689  OffsetNumber deletable[MaxIndexTuplesPerPage];
2690  int ndeletable = 0;
2691  OffsetNumber offnum,
2692  minoff,
2693  maxoff;
2694  Buffer buffer = insertstate->buf;
2695  BTScanInsert itup_key = insertstate->itup_key;
2696  Page page = BufferGetPage(buffer);
2697  BTPageOpaque opaque = BTPageGetOpaque(page);
2698 
2699  Assert(P_ISLEAF(opaque));
2700  Assert(simpleonly || itup_key->heapkeyspace);
2701  Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2702 
2703  /*
2704  * Scan over all items to see which ones need to be deleted according to
2705  * LP_DEAD flags. We'll usually manage to delete a few extra items that
2706  * are not marked LP_DEAD in passing. Often the extra items that actually
2707  * end up getting deleted are items that would have had their LP_DEAD bit
2708  * set before long anyway (if we opted not to include them as extras).
2709  */
2710  minoff = P_FIRSTDATAKEY(opaque);
2711  maxoff = PageGetMaxOffsetNumber(page);
2712  for (offnum = minoff;
2713  offnum <= maxoff;
2714  offnum = OffsetNumberNext(offnum))
2715  {
2716  ItemId itemId = PageGetItemId(page, offnum);
2717 
2718  if (ItemIdIsDead(itemId))
2719  deletable[ndeletable++] = offnum;
2720  }
2721 
2722  if (ndeletable > 0)
2723  {
2724  _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2725  insertstate->itup, minoff, maxoff);
2726  insertstate->bounds_valid = false;
2727 
2728  /* Return when a page split has already been avoided */
2729  if (PageGetFreeSpace(page) >= insertstate->itemsz)
2730  return;
2731 
2732  /* Might as well assume duplicates (if checkingunique) */
2733  uniquedup = true;
2734  }
2735 
2736  /*
2737  * We're done with simple deletion. Return early with callers that only
2738  * call here so that simple deletion can be considered. This includes
2739  * callers that explicitly ask for this and checkingunique callers that
2740  * probably don't have any version churn duplicates on the page.
2741  *
2742  * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2743  * return at this point (or when we go on to try either or both of our
2744  * other strategies and they also fail). We do not bother expending a
2745  * separate write to clear it, however. Caller will definitely clear it
2746  * when it goes on to split the page (note also that the deduplication
2747  * process will clear the flag in passing, just to keep things tidy).
2748  */
2749  if (simpleonly || (checkingunique && !uniquedup))
2750  {
2751  Assert(!indexUnchanged);
2752  return;
2753  }
2754 
2755  /* Assume bounds about to be invalidated (this is almost certain now) */
2756  insertstate->bounds_valid = false;
2757 
2758  /*
2759  * Perform bottom-up index deletion pass when executor hint indicated that
2760  * incoming item is logically unchanged, or for a unique index that is
2761  * known to have physical duplicates for some other reason. (There is a
2762  * large overlap between these two cases for a unique index. It's worth
2763  * having both triggering conditions in order to apply the optimization in
2764  * the event of successive related INSERT and DELETE statements.)
2765  *
2766  * We'll go on to do a deduplication pass when a bottom-up pass fails to
2767  * delete an acceptable amount of free space (a significant fraction of
2768  * the page, or space for the new item, whichever is greater).
2769  *
2770  * Note: Bottom-up index deletion uses the same equality/equivalence
2771  * routines as deduplication internally. However, it does not merge
2772  * together index tuples, so the same correctness considerations do not
2773  * apply. We deliberately omit an index-is-allequalimage test here.
2774  */
2775  if ((indexUnchanged || uniquedup) &&
2776  _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2777  return;
2778 
2779  /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2780  if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2781  _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2782  (indexUnchanged || uniquedup));
2783 }
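/*
 * Restating the decision ladder above (a summary of the code, not extra
 * behavior): a scan for LP_DEAD items always comes first, and simple
 * deletion runs whenever any are found; we stop there if that frees enough
 * space for the incoming item, if the caller asked for simpleonly, or if
 * this is a checkingunique caller with no apparent duplicates.  Otherwise
 * a bottom-up deletion pass runs when the executor hinted that the indexed
 * values are unchanged (or the unique index is known to have duplicates),
 * and a deduplication pass runs last, when deduplicate_items is enabled
 * and the index is allequalimage.
 */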
2784 
2785 /*
2786  * _bt_simpledel_pass - Simple index tuple deletion pass.
2787  *
2788  * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2789  * of all such tuples are determined by caller (caller passes these to us as
2790  * its 'deletable' argument).
2791  *
2792  * We might also delete extra index tuples that turn out to be safe to delete
2793  * in passing (though they must be cheap to check in passing to begin with).
2794  * There is no certainty that any extra tuples will be deleted, though. The
2795  * high level goal of the approach we take is to get the most out of each call
2796  * here (without noticeably increasing the per-call overhead compared to what
2797  * we need to do just to be able to delete the page's LP_DEAD-marked index
2798  * tuples).
2799  *
2800  * The number of extra index tuples that turn out to be deletable might
2801  * greatly exceed the number of LP_DEAD-marked index tuples due to various
2802  * locality related effects. For example, it's possible that the total number
2803  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2804  * quite low, in which case we might end up checking if it's possible to
2805  * delete _most_ index tuples on the page (without the tableam needing to
2806  * access additional table blocks). The tableam will sometimes stumble upon
2807  * _many_ extra deletable index tuples in indexes where this pattern is
2808  * common.
2809  *
2810  * See nbtree/README for further details on simple index tuple deletion.
2811  */
2812 static void
2813 _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2814  OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2815  OffsetNumber minoff, OffsetNumber maxoff)
2816 {
2817  Page page = BufferGetPage(buffer);
2818  BlockNumber *deadblocks;
2819  int ndeadblocks;
2820  TM_IndexDeleteOp delstate;
2821  OffsetNumber offnum;
2822 
2823  /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2824  deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2825  &ndeadblocks);
2826 
2827  /* Initialize tableam state that describes index deletion operation */
2828  delstate.irel = rel;
2829  delstate.iblknum = BufferGetBlockNumber(buffer);
2830  delstate.bottomup = false;
2831  delstate.bottomupfreespace = 0;
2832  delstate.ndeltids = 0;
2833  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2834  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2835 
2836  for (offnum = minoff;
2837  offnum <= maxoff;
2838  offnum = OffsetNumberNext(offnum))
2839  {
2840  ItemId itemid = PageGetItemId(page, offnum);
2841  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2842  TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2843  TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2844  BlockNumber tidblock;
2845  void *match;
2846 
2847  if (!BTreeTupleIsPosting(itup))
2848  {
2849  tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2850  match = bsearch(&tidblock, deadblocks, ndeadblocks,
2851  sizeof(BlockNumber), _bt_blk_cmp);
2852 
2853  if (!match)
2854  {
2855  Assert(!ItemIdIsDead(itemid));
2856  continue;
2857  }
2858 
2859  /*
2860  * TID's table block is among those pointed to by the TIDs from
2861  * LP_DEAD-bit set tuples on page -- add TID to deltids
2862  */
2863  odeltid->tid = itup->t_tid;
2864  odeltid->id = delstate.ndeltids;
2865  ostatus->idxoffnum = offnum;
2866  ostatus->knowndeletable = ItemIdIsDead(itemid);
2867  ostatus->promising = false; /* unused */
2868  ostatus->freespace = 0; /* unused */
2869 
2870  delstate.ndeltids++;
2871  }
2872  else
2873  {
2874  int nitem = BTreeTupleGetNPosting(itup);
2875 
2876  for (int p = 0; p < nitem; p++)
2877  {
2878  ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2879 
2880  tidblock = ItemPointerGetBlockNumber(tid);
2881  match = bsearch(&tidblock, deadblocks, ndeadblocks,
2882  sizeof(BlockNumber), _bt_blk_cmp);
2883 
2884  if (!match)
2885  {
2886  Assert(!ItemIdIsDead(itemid));
2887  continue;
2888  }
2889 
2890  /*
2891  * TID's table block is among those pointed to by the TIDs
2892  * from LP_DEAD-bit set tuples on page -- add TID to deltids
2893  */
2894  odeltid->tid = *tid;
2895  odeltid->id = delstate.ndeltids;
2896  ostatus->idxoffnum = offnum;
2897  ostatus->knowndeletable = ItemIdIsDead(itemid);
2898  ostatus->promising = false; /* unused */
2899  ostatus->freespace = 0; /* unused */
2900 
2901  odeltid++;
2902  ostatus++;
2903  delstate.ndeltids++;
2904  }
2905  }
2906  }
2907 
2908  pfree(deadblocks);
2909 
2910  Assert(delstate.ndeltids >= ndeletable);
2911 
2912  /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2913  _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2914 
2915  pfree(delstate.deltids);
2916  pfree(delstate.status);
2917 }
2918 
2919 /*
2920  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2921  *
2922  * Builds sorted and unique-ified array of table block numbers from index
2923  * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2924  * block from incoming newitem just in case it isn't among the LP_DEAD-related
2925  * table blocks.
2926  *
2927  * Always counting the newitem's table block as an LP_DEAD related block makes
2928  * sense because the cost is consistently low; it is practically certain that
2929  * the table block will not incur a buffer miss in tableam. On the other hand
2930  * the benefit is often quite high. There is a decent chance that there will
2931  * be some deletable items from this block, since in general most garbage
2932  * tuples became garbage in the recent past (in many cases this won't be the
2933  * first logical row that core code added to/modified in table block
2934  * recently).
2935  *
2936  * Returns final array, and sets *nblocks to its final size for caller.
2937  */
2938 static BlockNumber *
2939 _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2940  IndexTuple newitem, int *nblocks)
2941 {
2942  int spacentids,
2943  ntids;
2944  BlockNumber *tidblocks;
2945 
2946  /*
2947  * Accumulate each TID's block in array whose initial size has space for
2948  * one table block per LP_DEAD-set tuple (plus space for the newitem table
2949  * block). Array will only need to grow when there are LP_DEAD-marked
2950  * posting list tuples (which is not that common).
2951  */
2952  spacentids = ndeletable + 1;
2953  ntids = 0;
2954  tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2955 
2956  /*
2957  * First add the table block for the incoming newitem. This is the one
2958  * case where simple deletion can visit a table block that doesn't have
2959  * any known deletable items.
2960  */
2961  Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2962  tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2963 
2964  for (int i = 0; i < ndeletable; i++)
2965  {
2966  ItemId itemid = PageGetItemId(page, deletable[i]);
2967  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2968 
2969  Assert(ItemIdIsDead(itemid));
2970 
2971  if (!BTreeTupleIsPosting(itup))
2972  {
2973  if (ntids + 1 > spacentids)
2974  {
2975  spacentids *= 2;
2976  tidblocks = (BlockNumber *)
2977  repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2978  }
2979 
2980  tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2981  }
2982  else
2983  {
2984  int nposting = BTreeTupleGetNPosting(itup);
2985 
2986  if (ntids + nposting > spacentids)
2987  {
2988  spacentids = Max(spacentids * 2, ntids + nposting);
2989  tidblocks = (BlockNumber *)
2990  repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2991  }
2992 
2993  for (int j = 0; j < nposting; j++)
2994  {
2995  ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2996 
2997  tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
2998  }
2999  }
3000  }
3001 
3002  qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3003  *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3004 
3005  return tidblocks;
3006 }
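/*
 * Illustrative example (hypothetical block numbers): if the LP_DEAD items
 * on the page point at table blocks 17, 4, 17 and 9, and newitem points at
 * block 4, the array built above starts out as {4, 17, 4, 17, 9}; after
 * the qsort() and qunique() calls it becomes {4, 9, 17} and *nblocks is
 * set to 3.
 */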
3007 
3008 /*
3009  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3010  */
3011 static inline int
3012 _bt_blk_cmp(const void *arg1, const void *arg2)
3013 {
3014  BlockNumber b1 = *((BlockNumber *) arg1);
3015  BlockNumber b2 = *((BlockNumber *) arg2);
3016 
3017  return pg_cmp_u32(b1, b2);
3018 }
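For readers without the PostgreSQL tree at hand, here is a minimal standalone sketch of the sort-and-unique idiom that _bt_deadblocks() builds on. It uses only standard C: qunique() is a PostgreSQL-internal helper (lib/qunique.h), so the in-place loop below is an assumed stand-in for it, and the sample block numbers are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint32_t BlockNumber;	/* stand-in for PostgreSQL's BlockNumber */

/* same ordering as _bt_blk_cmp() above */
static int
blk_cmp(const void *arg1, const void *arg2)
{
	BlockNumber b1 = *(const BlockNumber *) arg1;
	BlockNumber b2 = *(const BlockNumber *) arg2;

	return (b1 > b2) - (b1 < b2);
}

int
main(void)
{
	BlockNumber blocks[] = {4, 17, 4, 17, 9};
	size_t		nblocks = sizeof(blocks) / sizeof(blocks[0]);
	size_t		nunique = 0;

	qsort(blocks, nblocks, sizeof(BlockNumber), blk_cmp);

	/* manual stand-in for qunique(): keep the first copy of each value */
	for (size_t i = 0; i < nblocks; i++)
	{
		if (nunique == 0 || blocks[i] != blocks[nunique - 1])
			blocks[nunique++] = blocks[i];
	}

	for (size_t i = 0; i < nunique; i++)
		printf("%u\n", (unsigned) blocks[i]);	/* prints 4, 9, 17 */

	return 0;
}

Keeping the array sorted and duplicate-free is what lets _bt_simpledel_pass() probe it cheaply with bsearch() for every TID on the leaf page.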
Definition: nbtinsert.c:2813
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:103
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2631
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2445
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2100
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition: nbtinsert.c:2684
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2242
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition: nbtinsert.c:1468
static int _bt_blk_cmp(const void *arg1, const void *arg2)
Definition: nbtinsert.c:3012
static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:1028
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1106
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:409
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition: nbtinsert.c:2939
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1004
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:108
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:676
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1130
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:870
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:798
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1094
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:846
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1514
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1105
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define P_HIKEY
Definition: nbtree.h:367
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:226
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define BTP_ROOT
Definition: nbtree.h:77
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:562
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
#define P_ISROOT(opaque)
Definition: nbtree.h:221
#define P_NONE
Definition: nbtree.h:212
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:83
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
#define BT_READ
Definition: nbtree.h:719
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define P_IGNORE(opaque)
Definition: nbtree.h:225
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
#define BT_WRITE
Definition: nbtree.h:720
#define P_FIRSTKEY
Definition: nbtree.h:368
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:595
#define BTP_SPLIT_END
Definition: nbtree.h:81
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2360
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:96
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:468
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:682
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:130
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:2683
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:174
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:1994
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:82
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2257
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:32
#define SizeOfBtreeInsert
Definition: nbtxlog.h:84
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:31
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:27
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:28
#define SizeOfBtreeSplit
Definition: nbtxlog.h:158
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:29
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:30
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define INDEX_MAX_KEYS
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:191
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static char * buf
Definition: pg_test_fsync.c:73
#define qsort(a, b, c, d)
Definition: port.h:449
uintptr_t Datum
Definition: postgres.h:64
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3114
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4306
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21
#define RelationGetDescr(relation)
Definition: rel.h:530
#define RelationGetRelationName(relation)
Definition: rel.h:538
#define RelationGetTargetBlock(relation)
Definition: rel.h:600
#define RelationNeedsWAL(relation)
Definition: rel.h:627
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:607
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:516
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:523
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5981
#define SnapshotSelf
Definition: snapmgr.h:32
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:40
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
IndexTuple itup
Definition: nbtree.h:811
BTScanInsert itup_key
Definition: nbtree.h:813
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
BlockNumber btm_root
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint16 btpo_flags
Definition: nbtree.h:67
uint32 btpo_level
Definition: nbtree.h:66
BTCycleId btpo_cycleid
Definition: nbtree.h:68
ItemPointer scantid
Definition: nbtree.h:791
bool allequalimage
Definition: nbtree.h:787
bool heapkeyspace
Definition: nbtree.h:786
bool anynullkeys
Definition: nbtree.h:788
BlockNumber bts_blkno
Definition: nbtree.h:734
struct BTStackData * bts_parent
Definition: nbtree.h:736
OffsetNumber bts_offset
Definition: nbtree.h:735
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
TransactionId xmin
Definition: snapshot.h:157
TransactionId xmax
Definition: snapshot.h:158
uint32 speculativeToken
Definition: snapshot.h:193
TM_IndexStatus * status
Definition: tableam.h:247
int bottomupfreespace
Definition: tableam.h:242
Relation irel
Definition: tableam.h:239
TM_IndexDelete * deltids
Definition: tableam.h:246
BlockNumber iblknum
Definition: tableam.h:240
ItemPointerData tid
Definition: tableam.h:205
bool knowndeletable
Definition: tableam.h:212
bool promising
Definition: tableam.h:215
int16 freespace
Definition: tableam.h:216
OffsetNumber idxoffnum
Definition: tableam.h:211
OffsetNumber offnum
Definition: nbtxlog.h:78
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
uint16 postingoff
Definition: nbtxlog.h:155
OffsetNumber firstrightoff
Definition: nbtxlog.h:153
uint32 level
Definition: nbtxlog.h:152
OffsetNumber newitemoff
Definition: nbtxlog.h:154
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:218
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:365
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:475
void XLogRegisterBufData(uint8 block_id, char *data, uint32 len)
Definition: xloginsert.c:406
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33