nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/tableam.h"
21 #include "access/transam.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "storage/lmgr.h"
25 #include "storage/predicate.h"
26 #include "storage/smgr.h"
27 
28 /* Minimum tree height for application of fastpath optimization */
29 #define BTREE_FASTPATH_MIN_LEVEL 2
30 
31 
32 static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
33 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
34  Relation heapRel,
35  IndexUniqueCheck checkUnique, bool *is_unique,
36  uint32 *speculativeToken);
37 static OffsetNumber _bt_findinsertloc(Relation rel,
38  BTInsertState insertstate,
39  bool checkingunique,
40  BTStack stack,
41  Relation heapRel);
42 static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
43 static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
44  Buffer buf,
45  Buffer cbuf,
46  BTStack stack,
47  IndexTuple itup,
48  Size itemsz,
49  OffsetNumber newitemoff,
50  int postingoff,
51  bool split_only_page);
52 static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
53  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
54  IndexTuple newitem, IndexTuple orignewitem,
55  IndexTuple nposting, uint16 postingoff);
56 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
57  BTStack stack, bool isroot, bool isonly);
58 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
59 static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
60  OffsetNumber itup_off, bool newfirstdataitem);
61 static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
62  BTInsertState insertstate,
63  bool lpdeadonly, bool checkingunique,
64  bool uniquedup);
65 
66 /*
67  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
68  *
69  * This routine is called by the public interface routine, btinsert.
70  * By here, itup is filled in, including the TID.
71  *
72  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
73  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
74  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
75  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
76  * don't actually insert.
77  *
78  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
79  * it must be true if the entry is known unique, else false.
80  * (In the current implementation we'll also return true after a
81  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
82  * that's just a coding artifact.)
83  */
84 bool
85 _bt_doinsert(Relation rel, IndexTuple itup,
86  IndexUniqueCheck checkUnique, Relation heapRel)
87 {
88  bool is_unique = false;
89  BTInsertStateData insertstate;
90  BTScanInsert itup_key;
91  BTStack stack;
92  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
93 
94  /* we need an insertion scan key to do our search, so build one */
95  itup_key = _bt_mkscankey(rel, itup);
96 
97  if (checkingunique)
98  {
99  if (!itup_key->anynullkeys)
100  {
101  /* No (heapkeyspace) scantid until uniqueness established */
102  itup_key->scantid = NULL;
103  }
104  else
105  {
106  /*
107  * Scan key for new tuple contains NULL key values. Bypass
108  * checkingunique steps. They are unnecessary because core code
109  * considers NULL unequal to every value, including NULL.
110  *
111  * This optimization avoids O(N^2) behavior within the
112  * _bt_findinsertloc() heapkeyspace path when a unique index has a
113  * large number of "duplicates" with NULL key values.
114  */
115  checkingunique = false;
116  /* Tuple is unique in the sense that core code cares about */
117  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
118  is_unique = true;
119  }
120  }
121 
122  /*
123  * Fill in the BTInsertState working area, to track the current page and
124  * position within the page to insert on.
125  *
126  * Note that itemsz is passed down to lower level code that deals with
127  * inserting the item. It must be MAXALIGN()'d. This ensures that space
128  * accounting code consistently considers the alignment overhead that we
129  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
130  * already conservative about alignment, but we don't rely on that from
131  * this distance. Besides, preserving the "true" tuple size in index
132  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
133  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
134  */
135  insertstate.itup = itup;
136  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
137  insertstate.itup_key = itup_key;
138  insertstate.bounds_valid = false;
139  insertstate.buf = InvalidBuffer;
140  insertstate.postingoff = 0;
141 
142 search:
143 
144  /*
145  * Find and lock the leaf page that the tuple should be added to by
146  * searching from the root page. insertstate.buf will hold a buffer that
147  * is locked in exclusive mode afterwards.
148  */
149  stack = _bt_search_insert(rel, &insertstate);
150 
151  /*
152  * checkingunique inserts are not allowed to go ahead when two tuples with
153  * equal key attribute values would be visible to new MVCC snapshots once
154  * the xact commits. Check for conflicts in the locked page/buffer (if
155  * needed) here.
156  *
157  * It might be necessary to check a page to the right in _bt_check_unique,
158  * though that should be very rare. In practice the first page the value
159  * could be on (with scantid omitted) is almost always also the only page
160  * that a matching tuple might be found on. This is due to the behavior
161  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
162  * only be allowed to cross a page boundary when there is no candidate
163  * leaf page split point that avoids it. Also, _bt_check_unique can use
164  * the leaf page high key to determine that there will be no duplicates on
165  * the right sibling without actually visiting it (it uses the high key in
166  * cases where the new item happens to belong at the far right of the leaf
167  * page).
168  *
169  * NOTE: obviously, _bt_check_unique can only detect keys that are already
170  * in the index; so it cannot defend against concurrent insertions of the
171  * same key. We protect against that by means of holding a write lock on
172  * the first page the value could be on, with omitted/-inf value for the
173  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
174  * the same key must acquire a write lock on the same page, so only one
175  * would-be inserter can be making the check at one time. Furthermore,
176  * once we are past the check we hold write locks continuously until we
177  * have performed our insertion, so no later inserter can fail to see our
178  * insertion. (This requires some care in _bt_findinsertloc.)
179  *
180  * If we must wait for another xact, we release the lock while waiting,
181  * and then must perform a new search.
182  *
183  * For a partial uniqueness check, we don't wait for the other xact. Just
184  * let the tuple in and return false for possibly non-unique, or true for
185  * definitely unique.
186  */
187  if (checkingunique)
188  {
189  TransactionId xwait;
190  uint32 speculativeToken;
191 
192  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
193  &is_unique, &speculativeToken);
194 
195  if (unlikely(TransactionIdIsValid(xwait)))
196  {
197  /* Have to wait for the other guy ... */
198  _bt_relbuf(rel, insertstate.buf);
199  insertstate.buf = InvalidBuffer;
200 
201  /*
202  * If it's a speculative insertion, wait for it to finish (ie. to
203  * go ahead with the insertion, or kill the tuple). Otherwise
204  * wait for the transaction to finish as usual.
205  */
206  if (speculativeToken)
207  SpeculativeInsertionWait(xwait, speculativeToken);
208  else
209  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
210 
211  /* start over... */
212  if (stack)
213  _bt_freestack(stack);
214  goto search;
215  }
216 
217  /* Uniqueness is established -- restore heap tid as scantid */
218  if (itup_key->heapkeyspace)
219  itup_key->scantid = &itup->t_tid;
220  }
221 
222  if (checkUnique != UNIQUE_CHECK_EXISTING)
223  {
224  OffsetNumber newitemoff;
225 
226  /*
227  * The only conflict predicate locking cares about for indexes is when
228  * an index tuple insert conflicts with an existing lock. We don't
229  * know the actual page we're going to insert on for sure just yet in
230  * checkingunique and !heapkeyspace cases, but it's okay to use the
231  * first page the value could be on (with scantid omitted) instead.
232  */
233  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
234 
235  /*
236  * Do the insertion. Note that insertstate contains cached binary
237  * search bounds established within _bt_check_unique when insertion is
238  * checkingunique.
239  */
240  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
241  stack, heapRel);
242  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
243  itup, insertstate.itemsz, newitemoff,
244  insertstate.postingoff, false);
245  }
246  else
247  {
248  /* just release the buffer */
249  _bt_relbuf(rel, insertstate.buf);
250  }
251 
252  /* be tidy */
253  if (stack)
254  _bt_freestack(stack);
255  pfree(itup_key);
256 
257  return is_unique;
258 }
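/*
 * Editorial aside, not part of nbtinsert.c: _bt_doinsert() is reached via the
 * btree access method's btinsert() routine in nbtree.c.  The sketch below is
 * a simplified rendering of that caller, shown only for context; the
 * authoritative version lives in src/backend/access/nbtree/nbtree.c.
 */
#ifdef NOT_USED
static bool
btinsert_sketch(Relation rel, Datum *values, bool *isnull,
                ItemPointer ht_ctid, Relation heapRel,
                IndexUniqueCheck checkUnique)
{
    IndexTuple  itup;
    bool        result;

    /* build an index tuple from the caller's datums, stamp it with the heap TID */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);
    return result;
}
#endif                          /* NOT_USED */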
259 
260 /*
261  * _bt_search_insert() -- _bt_search() wrapper for inserts
262  *
263  * Search the tree for a particular scankey, or more precisely for the first
264  * leaf page it could be on. Try to make use of the fastpath optimization's
265  * rightmost leaf page cache before actually searching the tree from the root
266  * page, though.
267  *
268  * Return value is a stack of parent-page pointers (though see notes about
269  * fastpath optimization and page splits below). insertstate->buf is set to
270  * the address of the leaf-page buffer, which is write-locked and pinned in
271  * all cases (if necessary by creating a new empty root page for caller).
272  *
273  * The fastpath optimization avoids most of the work of searching the tree
274  * repeatedly when a single backend inserts successive new tuples on the
275  * rightmost leaf page of an index. A backend cache of the rightmost leaf
276  * page is maintained within _bt_insertonpg(), and used here. The cache is
277  * invalidated here when an insert of a non-pivot tuple must take place on a
278  * non-rightmost leaf page.
279  *
280  * The optimization helps with indexes on an auto-incremented field. It also
281  * helps with indexes on datetime columns, as well as indexes with lots of
282  * NULL values. (NULLs usually get inserted in the rightmost page for single
283  * column indexes, since they usually get treated as coming after everything
284  * else in the key space. Individual NULL tuples will generally be placed on
285  * the rightmost leaf page due to the influence of the heap TID column.)
286  *
287  * Note that we avoid applying the optimization when there is insufficient
288  * space on the rightmost page to fit caller's new item. This is necessary
289  * because we'll need to return a real descent stack when a page split is
290  * expected (actually, caller can cope with a leaf page split that uses a NULL
291  * stack, but that's very slow and so must be avoided). Note also that the
292  * fastpath optimization acquires the lock on the page conditionally as a way
293  * of reducing extra contention when there are concurrent insertions into the
294  * rightmost page (we give up if we'd have to wait for the lock). We assume
295  * that it isn't useful to apply the optimization when there is contention,
296  * since each per-backend cache won't stay valid for long.
297  */
298 static BTStack
299 _bt_search_insert(Relation rel, BTInsertState insertstate)
300 {
301  Assert(insertstate->buf == InvalidBuffer);
302  Assert(!insertstate->bounds_valid);
303  Assert(insertstate->postingoff == 0);
304 
305  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
306  {
307  /* Simulate a _bt_getbuf() call with conditional locking */
308  insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
309  if (_bt_conditionallockbuf(rel, insertstate->buf))
310  {
311  Page page;
312  BTPageOpaque opaque;
313 
314  _bt_checkpage(rel, insertstate->buf);
315  page = BufferGetPage(insertstate->buf);
316  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
317 
318  /*
319  * Check if the page is still the rightmost leaf page and has
320  * enough free space to accommodate the new tuple. Also check
321  * that the insertion scan key is strictly greater than the first
322  * non-pivot tuple on the page. (Note that we expect itup_key's
323  * scantid to be unset when our caller is a checkingunique
324  * inserter.)
325  */
326  if (P_RIGHTMOST(opaque) &&
327  P_ISLEAF(opaque) &&
328  !P_IGNORE(opaque) &&
329  PageGetFreeSpace(page) > insertstate->itemsz &&
330  PageGetMaxOffsetNumber(page) >= P_HIKEY &&
331  _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
332  {
333  /*
334  * Caller can use the fastpath optimization because cached
335  * block is still rightmost leaf page, which can fit caller's
336  * new tuple without splitting. Keep block in local cache for
337  * next insert, and have caller use NULL stack.
338  *
339  * Note that _bt_insert_parent() has an assertion that catches
340  * leaf page splits that somehow follow from a fastpath insert
341  * (it should only be passed a NULL stack when it must deal
342  * with a concurrent root page split, and never because a NULL
343  * stack was returned here).
344  */
345  return NULL;
346  }
347 
348  /* Page unsuitable for caller, drop lock and pin */
349  _bt_relbuf(rel, insertstate->buf);
350  }
351  else
352  {
353  /* Lock unavailable, drop pin */
354  ReleaseBuffer(insertstate->buf);
355  }
356 
357  /* Forget block, since cache doesn't appear to be useful */
358  RelationSetTargetBlock(rel, InvalidBlockNumber);
359  }
360 
361  /* Cannot use optimization -- descend tree, return proper descent stack */
362  return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
363  NULL);
364 }
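/*
 * Editorial aside, not part of nbtinsert.c: the "rightmost leaf page cache"
 * used by _bt_search_insert() is just the relation's per-backend target
 * block hint.  It is read above with RelationGetTargetBlock(), forgotten
 * with RelationSetTargetBlock(rel, InvalidBlockNumber) as soon as it stops
 * looking useful, and (re)established near the end of _bt_insertonpg(), but
 * only when _bt_getrootheight() reports at least BTREE_FASTPATH_MIN_LEVEL
 * levels.  The hint is purely advisory; a stale value is always caught by
 * the rightmost/free-space/high-key checks above before the fastpath is
 * taken.
 */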
365 
366 /*
367  * _bt_check_unique() -- Check for violation of unique index constraint
368  *
369  * Returns InvalidTransactionId if there is no conflict, else an xact ID
370  * we must wait for to see if it commits a conflicting tuple. If an actual
371  * conflict is detected, no return --- just ereport(). If an xact ID is
372  * returned, and the conflicting tuple still has a speculative insertion in
373  * progress, *speculativeToken is set to non-zero, and the caller can wait for
374  * the verdict on the insertion using SpeculativeInsertionWait().
375  *
376  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
377  * InvalidTransactionId because we don't want to wait. In this case we
378  * set *is_unique to false if there is a potential conflict, and the
379  * core code must redo the uniqueness check later.
380  *
381  * As a side-effect, sets state in insertstate that can later be used by
382  * _bt_findinsertloc() to reuse most of the binary search work we do
383  * here.
384  *
385  * Do not call here when there are NULL values in scan key. NULL should be
386  * considered unequal to NULL when checking for duplicates, but we are not
387  * prepared to handle that correctly.
388  */
389 static TransactionId
390 _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
391  IndexUniqueCheck checkUnique, bool *is_unique,
392  uint32 *speculativeToken)
393 {
394  IndexTuple itup = insertstate->itup;
395  IndexTuple curitup = NULL;
396  ItemId curitemid;
397  BTScanInsert itup_key = insertstate->itup_key;
398  SnapshotData SnapshotDirty;
399  OffsetNumber offset;
400  OffsetNumber maxoff;
401  Page page;
402  BTPageOpaque opaque;
403  Buffer nbuf = InvalidBuffer;
404  bool found = false;
405  bool inposting = false;
406  bool prevalldead = true;
407  int curposti = 0;
408 
409  /* Assume unique until we find a duplicate */
410  *is_unique = true;
411 
412  InitDirtySnapshot(SnapshotDirty);
413 
414  page = BufferGetPage(insertstate->buf);
415  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
416  maxoff = PageGetMaxOffsetNumber(page);
417 
418  /*
419  * Find the first tuple with the same key.
420  *
421  * This also saves the binary search bounds in insertstate. We use them
422  * in the fastpath below, but also in the _bt_findinsertloc() call later.
423  */
424  Assert(!insertstate->bounds_valid);
425  offset = _bt_binsrch_insert(rel, insertstate);
426 
427  /*
428  * Scan over all equal tuples, looking for live conflicts.
429  */
430  Assert(!insertstate->bounds_valid || insertstate->low == offset);
431  Assert(!itup_key->anynullkeys);
432  Assert(itup_key->scantid == NULL);
433  for (;;)
434  {
435  /*
436  * Each iteration of the loop processes one heap TID, not one index
437  * tuple. Current offset number for page isn't usually advanced on
438  * iterations that process heap TIDs from posting list tuples.
439  *
440  * "inposting" state is set when _inside_ a posting list --- not when
441  * we're at the start (or end) of a posting list. We advance curposti
442  * at the end of the iteration when inside a posting list tuple. In
443  * general, every loop iteration either advances the page offset or
444  * advances curposti --- an iteration that handles the rightmost/max
445  * heap TID in a posting list finally advances the page offset (and
446  * unsets "inposting").
447  *
448  * Make sure the offset points to an actual index tuple before trying
449  * to examine it...
450  */
451  if (offset <= maxoff)
452  {
453  /*
454  * Fastpath: In most cases, we can use cached search bounds to
455  * limit our consideration to items that are definitely
456  * duplicates. This fastpath doesn't apply when the original page
457  * is empty, or when initial offset is past the end of the
458  * original page, which may indicate that we need to examine a
459  * second or subsequent page.
460  *
461  * Note that this optimization allows us to avoid calling
462  * _bt_compare() directly when there are no duplicates, as long as
463  * the offset where the key will go is not at the end of the page.
464  */
465  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
466  {
467  Assert(insertstate->bounds_valid);
468  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
469  Assert(insertstate->low <= insertstate->stricthigh);
470  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
471  break;
472  }
473 
474  /*
475  * We can skip items that are already marked killed.
476  *
477  * In the presence of heavy update activity an index may contain
478  * many killed items with the same key; running _bt_compare() on
479  * each killed item gets expensive. Just advance over killed
480  * items as quickly as we can. We only apply _bt_compare() when
481  * we get to a non-killed item. We could reuse the bounds to
482  * avoid _bt_compare() calls for known equal tuples, but it
483  * doesn't seem worth it. Workloads with heavy update activity
484  * tend to have many deduplication passes, so we'll often avoid
485  * most of those comparisons, too (we call _bt_compare() when the
486  * posting list tuple is initially encountered, though not when
487  * processing later TIDs from the same tuple).
488  */
489  if (!inposting)
490  curitemid = PageGetItemId(page, offset);
491  if (inposting || !ItemIdIsDead(curitemid))
492  {
493  ItemPointerData htid;
494  bool all_dead = false;
495 
496  if (!inposting)
497  {
498  /* Plain tuple, or first TID in posting list tuple */
499  if (_bt_compare(rel, itup_key, page, offset) != 0)
500  break; /* we're past all the equal tuples */
501 
502  /* Advance curitup */
503  curitup = (IndexTuple) PageGetItem(page, curitemid);
504  Assert(!BTreeTupleIsPivot(curitup));
505  }
506 
507  /* okay, we gotta fetch the heap tuple using htid ... */
508  if (!BTreeTupleIsPosting(curitup))
509  {
510  /* ... htid is from simple non-pivot tuple */
511  Assert(!inposting);
512  htid = curitup->t_tid;
513  }
514  else if (!inposting)
515  {
516  /* ... htid is first TID in new posting list */
517  inposting = true;
518  prevalldead = true;
519  curposti = 0;
520  htid = *BTreeTupleGetPostingN(curitup, 0);
521  }
522  else
523  {
524  /* ... htid is second or subsequent TID in posting list */
525  Assert(curposti > 0);
526  htid = *BTreeTupleGetPostingN(curitup, curposti);
527  }
528 
529  /*
530  * If we are doing a recheck, we expect to find the tuple we
531  * are rechecking. It's not a duplicate, but we have to keep
532  * scanning.
533  */
534  if (checkUnique == UNIQUE_CHECK_EXISTING &&
535  ItemPointerCompare(&htid, &itup->t_tid) == 0)
536  {
537  found = true;
538  }
539 
540  /*
541  * Check if there are any table tuples for this index entry
542  * satisfying SnapshotDirty. This is necessary because for AMs
543  * with optimizations like heap's HOT, we have just a single
544  * index entry for the entire chain.
545  */
546  else if (table_index_fetch_tuple_check(heapRel, &htid,
547  &SnapshotDirty,
548  &all_dead))
549  {
550  TransactionId xwait;
551 
552  /*
553  * It is a duplicate. If we are only doing a partial
554  * check, then don't bother checking if the tuple is being
555  * updated in another transaction. Just return the fact
556  * that it is a potential conflict and leave the full
557  * check till later. Don't invalidate binary search
558  * bounds.
559  */
560  if (checkUnique == UNIQUE_CHECK_PARTIAL)
561  {
562  if (nbuf != InvalidBuffer)
563  _bt_relbuf(rel, nbuf);
564  *is_unique = false;
565  return InvalidTransactionId;
566  }
567 
568  /*
569  * If this tuple is being updated by other transaction
570  * then we have to wait for its commit/abort.
571  */
572  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
573  SnapshotDirty.xmin : SnapshotDirty.xmax;
574 
575  if (TransactionIdIsValid(xwait))
576  {
577  if (nbuf != InvalidBuffer)
578  _bt_relbuf(rel, nbuf);
579  /* Tell _bt_doinsert to wait... */
580  *speculativeToken = SnapshotDirty.speculativeToken;
581  /* Caller releases lock on buf immediately */
582  insertstate->bounds_valid = false;
583  return xwait;
584  }
585 
586  /*
587  * Otherwise we have a definite conflict. But before
588  * complaining, look to see if the tuple we want to insert
589  * is itself now committed dead --- if so, don't complain.
590  * This is a waste of time in normal scenarios but we must
591  * do it to support CREATE INDEX CONCURRENTLY.
592  *
593  * We must follow HOT-chains here because during
594  * concurrent index build, we insert the root TID though
595  * the actual tuple may be somewhere in the HOT-chain.
596  * While following the chain we might not stop at the
597  * exact tuple which triggered the insert, but that's OK
598  * because if we find a live tuple anywhere in this chain,
599  * we have a unique key conflict. The other live tuple is
600  * not part of this chain because it had a different index
601  * entry.
602  */
603  htid = itup->t_tid;
604  if (table_index_fetch_tuple_check(heapRel, &htid,
605  SnapshotSelf, NULL))
606  {
607  /* Normal case --- it's still live */
608  }
609  else
610  {
611  /*
612  * It's been deleted, so no error, and no need to
613  * continue searching
614  */
615  break;
616  }
617 
618  /*
619  * Check for a conflict-in as we would if we were going to
620  * write to this page. We aren't actually going to write,
621  * but we want a chance to report SSI conflicts that would
622  * otherwise be masked by this unique constraint
623  * violation.
624  */
625  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
626 
627  /*
628  * This is a definite conflict. Break the tuple down into
629  * datums and report the error. But first, make sure we
630  * release the buffer locks we're holding ---
631  * BuildIndexValueDescription could make catalog accesses,
632  * which in the worst case might touch this same index and
633  * cause deadlocks.
634  */
635  if (nbuf != InvalidBuffer)
636  _bt_relbuf(rel, nbuf);
637  _bt_relbuf(rel, insertstate->buf);
638  insertstate->buf = InvalidBuffer;
639  insertstate->bounds_valid = false;
640 
641  {
642  Datum values[INDEX_MAX_KEYS];
643  bool isnull[INDEX_MAX_KEYS];
644  char *key_desc;
645 
646  index_deform_tuple(itup, RelationGetDescr(rel),
647  values, isnull);
648 
649  key_desc = BuildIndexValueDescription(rel, values,
650  isnull);
651 
652  ereport(ERROR,
653  (errcode(ERRCODE_UNIQUE_VIOLATION),
654  errmsg("duplicate key value violates unique constraint \"%s\"",
655  RelationGetRelationName(rel)),
656  key_desc ? errdetail("Key %s already exists.",
657  key_desc) : 0,
658  errtableconstraint(heapRel,
659  RelationGetRelationName(rel))));
660  }
661  }
662  else if (all_dead && (!inposting ||
663  (prevalldead &&
664  curposti == BTreeTupleGetNPosting(curitup) - 1)))
665  {
666  /*
667  * The conflicting tuple (or all HOT chains pointed to by
668  * all posting list TIDs) is dead to everyone, so mark the
669  * index entry killed.
670  */
671  ItemIdMarkDead(curitemid);
672  opaque->btpo_flags |= BTP_HAS_GARBAGE;
673 
674  /*
675  * Mark buffer with a dirty hint, since state is not
676  * crucial. Be sure to mark the proper buffer dirty.
677  */
678  if (nbuf != InvalidBuffer)
679  MarkBufferDirtyHint(nbuf, true);
680  else
681  MarkBufferDirtyHint(insertstate->buf, true);
682  }
683 
684  /*
685  * Remember if posting list tuple has even a single HOT chain
686  * whose members are not all dead
687  */
688  if (!all_dead && inposting)
689  prevalldead = false;
690  }
691  }
692 
693  if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
694  {
695  /* Advance to next TID in same posting list */
696  curposti++;
697  continue;
698  }
699  else if (offset < maxoff)
700  {
701  /* Advance to next tuple */
702  curposti = 0;
703  inposting = false;
704  offset = OffsetNumberNext(offset);
705  }
706  else
707  {
708  int highkeycmp;
709 
710  /* If scankey == hikey we gotta check the next page too */
711  if (P_RIGHTMOST(opaque))
712  break;
713  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
714  Assert(highkeycmp <= 0);
715  if (highkeycmp != 0)
716  break;
717  /* Advance to next non-dead page --- there must be one */
718  for (;;)
719  {
720  BlockNumber nblkno = opaque->btpo_next;
721 
722  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
723  page = BufferGetPage(nbuf);
724  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
725  if (!P_IGNORE(opaque))
726  break;
727  if (P_RIGHTMOST(opaque))
728  elog(ERROR, "fell off the end of index \"%s\"",
729  RelationGetRelationName(rel));
730  }
731  /* Will also advance to next tuple */
732  curposti = 0;
733  inposting = false;
734  maxoff = PageGetMaxOffsetNumber(page);
735  offset = P_FIRSTDATAKEY(opaque);
736  /* Don't invalidate binary search bounds */
737  }
738  }
739 
740  /*
741  * If we are doing a recheck then we should have found the tuple we are
742  * checking. Otherwise there's something very wrong --- probably, the
743  * index is on a non-immutable expression.
744  */
745  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
746  ereport(ERROR,
747  (errcode(ERRCODE_INTERNAL_ERROR),
748  errmsg("failed to re-find tuple within index \"%s\"",
749  RelationGetRelationName(rel)),
750  errhint("This may be because of a non-immutable index expression."),
751  errtableconstraint(heapRel,
752  RelationGetRelationName(rel))));
753 
754  if (nbuf != InvalidBuffer)
755  _bt_relbuf(rel, nbuf);
756 
757  return InvalidTransactionId;
758 }
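/*
 * Editorial aside, not part of nbtinsert.c: from the caller's point of view,
 * _bt_check_unique() has four possible outcomes -- (a) ereport(ERROR) on a
 * live conflict, (b) a valid transaction ID returned, meaning "release the
 * buffer, wait for that xact, then retry the whole insertion", (c)
 * InvalidTransactionId with *is_unique set to false, which only happens for
 * UNIQUE_CHECK_PARTIAL, and (d) InvalidTransactionId with *is_unique still
 * true, the no-conflict case.  _bt_doinsert() above shows how (b) and (d)
 * are handled.
 */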
759 
760 
761 /*
762  * _bt_findinsertloc() -- Finds an insert location for a tuple
763  *
764  * On entry, insertstate buffer contains the page the new tuple belongs
765  * on. It is exclusive-locked and pinned by the caller.
766  *
767  * If 'checkingunique' is true, the buffer on entry is the first page
768  * that contains duplicates of the new key. If there are duplicates on
769  * multiple pages, the correct insertion position might be some page to
770  * the right, rather than the first page. In that case, this function
771  * moves right to the correct target page.
772  *
773  * (In a !heapkeyspace index, there can be multiple pages with the same
774  * high key, on which the new tuple could legitimately be placed. In
775  * that case, the caller passes the first page containing duplicates,
776  * just like when checkingunique=true. If that page doesn't have enough
777  * room for the new tuple, this function moves right, trying to find a
778  * legal page that does.)
779  *
780  * On exit, insertstate buffer contains the chosen insertion page, and
781  * the offset within that page is returned. If _bt_findinsertloc needed
782  * to move right, the lock and pin on the original page are released, and
783  * the new buffer is exclusively locked and pinned instead.
784  *
785  * If insertstate contains cached binary search bounds, we will take
786  * advantage of them. This avoids repeating comparisons that we made in
787  * _bt_check_unique() already.
788  *
789  * If there is not enough room on the page for the new tuple, we try to
790  * make room by removing any LP_DEAD tuples.
791  */
792 static OffsetNumber
793 _bt_findinsertloc(Relation rel,
794  BTInsertState insertstate,
795  bool checkingunique,
796  BTStack stack,
797  Relation heapRel)
798 {
799  BTScanInsert itup_key = insertstate->itup_key;
800  Page page = BufferGetPage(insertstate->buf);
801  BTPageOpaque opaque;
802  OffsetNumber newitemoff;
803 
804  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
805 
806  /* Check 1/3 of a page restriction */
807  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
808  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
809  insertstate->itup);
810 
811  Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
812  Assert(!insertstate->bounds_valid || checkingunique);
813  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
814  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
815  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
816 
817  if (itup_key->heapkeyspace)
818  {
819  /* Keep track of whether checkingunique duplicate seen */
820  bool uniquedup = false;
821 
822  /*
823  * If we're inserting into a unique index, we may have to walk right
824  * through leaf pages to find the one leaf page that we must insert on
825  * to.
826  *
827  * This is needed for checkingunique callers because a scantid was not
828  * used when we called _bt_search(). scantid can only be set after
829  * _bt_check_unique() has checked for duplicates. The buffer
830  * initially stored in insertstate->buf has the page where the first
831  * duplicate key might be found, which isn't always the page that new
832  * tuple belongs on. The heap TID attribute for new tuple (scantid)
833  * could force us to insert on a sibling page, though that should be
834  * very rare in practice.
835  */
836  if (checkingunique)
837  {
838  if (insertstate->low < insertstate->stricthigh)
839  {
840  /* Encountered a duplicate in _bt_check_unique() */
841  Assert(insertstate->bounds_valid);
842  uniquedup = true;
843  }
844 
845  for (;;)
846  {
847  /*
848  * Does the new tuple belong on this page?
849  *
850  * The earlier _bt_check_unique() call may well have
851  * established a strict upper bound on the offset for the new
852  * item. If it's not the last item of the page (i.e. if there
853  * is at least one tuple on the page that goes after the tuple
854  * we're inserting) then we know that the tuple belongs on
855  * this page. We can skip the high key check.
856  */
857  if (insertstate->bounds_valid &&
858  insertstate->low <= insertstate->stricthigh &&
859  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
860  break;
861 
862  /* Test '<=', not '!=', since scantid is set now */
863  if (P_RIGHTMOST(opaque) ||
864  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
865  break;
866 
867  _bt_stepright(rel, insertstate, stack);
868  /* Update local state after stepping right */
869  page = BufferGetPage(insertstate->buf);
870  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
871  /* Assume duplicates (if checkingunique) */
872  uniquedup = true;
873  }
874  }
875 
876  /*
877  * If the target page is full, see if we can obtain enough space using
878  * one or more strategies (e.g. erasing LP_DEAD items, deduplication).
879  * Page splits are expensive, and should only go ahead when truly
880  * necessary.
881  */
882  if (PageGetFreeSpace(page) < insertstate->itemsz)
883  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
884  checkingunique, uniquedup);
885  }
886  else
887  {
888  /*----------
889  * This is a !heapkeyspace (version 2 or 3) index. The current page
890  * is the first page that we could insert the new tuple to, but there
891  * may be other pages to the right that we could opt to use instead.
892  *
893  * If the new key is equal to one or more existing keys, we can
894  * legitimately place it anywhere in the series of equal keys. In
895  * fact, if the new key is equal to the page's "high key" we can place
896  * it on the next page. If it is equal to the high key, and there's
897  * not room to insert the new tuple on the current page without
898  * splitting, then we move right hoping to find more free space and
899  * avoid a split.
900  *
901  * Keep scanning right until we
902  * (a) find a page with enough free space,
903  * (b) reach the last page where the tuple can legally go, or
904  * (c) get tired of searching.
905  * (c) is not flippant; it is important because if there are many
906  * pages' worth of equal keys, it's better to split one of the early
907  * pages than to scan all the way to the end of the run of equal keys
908  * on every insert. We implement "get tired" as a random choice,
909  * since stopping after scanning a fixed number of pages wouldn't work
910  * well (we'd never reach the right-hand side of previously split
911  * pages). The probability of moving right is set at 0.99, which may
912  * seem too high to change the behavior much, but it does an excellent
913  * job of preventing O(N^2) behavior with many equal keys.
914  *----------
915  */
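/*
 * Editorial aside, not part of nbtinsert.c: the 0.99 figure above
 * corresponds to the random() test in the loop below.  The loop only "gets
 * tired" when random() <= MAX_RANDOM_VALUE / 100, i.e. with probability of
 * roughly 1/100, so each iteration keeps moving right with probability of
 * about 0.99.
 */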
916  while (PageGetFreeSpace(page) < insertstate->itemsz)
917  {
918  /*
919  * Before considering moving right, see if we can obtain enough
920  * space by erasing LP_DEAD items
921  */
922  if (P_HAS_GARBAGE(opaque))
923  {
924  /* Erase LP_DEAD items (won't deduplicate) */
925  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
926  checkingunique, false);
927 
928  if (PageGetFreeSpace(page) >= insertstate->itemsz)
929  break; /* OK, now we have enough space */
930  }
931 
932  /*
933  * Nope, so check conditions (b) and (c) enumerated above
934  *
935  * The earlier _bt_check_unique() call may well have established a
936  * strict upper bound on the offset for the new item. If it's not
937  * the last item of the page (i.e. if there is at least one tuple
938  * on the page that's greater than the tuple we're inserting to)
939  * then we know that the tuple belongs on this page. We can skip
940  * the high key check.
941  */
942  if (insertstate->bounds_valid &&
943  insertstate->low <= insertstate->stricthigh &&
944  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
945  break;
946 
947  if (P_RIGHTMOST(opaque) ||
948  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
949  random() <= (MAX_RANDOM_VALUE / 100))
950  break;
951 
952  _bt_stepright(rel, insertstate, stack);
953  /* Update local state after stepping right */
954  page = BufferGetPage(insertstate->buf);
955  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
956  }
957  }
958 
959  /*
960  * We should now be on the correct page. Find the offset within the page
961  * for the new tuple. (Possibly reusing earlier search bounds.)
962  */
963  Assert(P_RIGHTMOST(opaque) ||
964  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
965 
966  newitemoff = _bt_binsrch_insert(rel, insertstate);
967 
968  if (insertstate->postingoff == -1)
969  {
970  /*
971  * There is an overlapping posting list tuple with its LP_DEAD bit
972  * set. We don't want to unnecessarily unset its LP_DEAD bit while
973  * performing a posting list split, so delete all LP_DEAD items early.
974  * This is the only case where LP_DEAD deletes happen even though
975  * there is space for newitem on the page.
976  *
977  * This can only erase LP_DEAD items (it won't deduplicate).
978  */
979  _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
980  checkingunique, false);
981 
982  /*
983  * Do new binary search. New insert location cannot overlap with any
984  * posting list now.
985  */
986  Assert(!insertstate->bounds_valid);
987  insertstate->postingoff = 0;
988  newitemoff = _bt_binsrch_insert(rel, insertstate);
989  Assert(insertstate->postingoff == 0);
990  }
991 
992  return newitemoff;
993 }
994 
995 /*
996  * Step right to next non-dead page, during insertion.
997  *
998  * This is a bit more complicated than moving right in a search. We must
999  * write-lock the target page before releasing write lock on current page;
1000  * else someone else's _bt_check_unique scan could fail to see our insertion.
1001  * Write locks on intermediate dead pages won't do because we don't know when
1002  * they will get de-linked from the tree.
1003  *
1004  * This is more aggressive than it needs to be for non-unique !heapkeyspace
1005  * indexes.
1006  */
1007 static void
1008 _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
1009 {
1010  Page page;
1011  BTPageOpaque opaque;
1012  Buffer rbuf;
1013  BlockNumber rblkno;
1014 
1015  page = BufferGetPage(insertstate->buf);
1016  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1017 
1018  rbuf = InvalidBuffer;
1019  rblkno = opaque->btpo_next;
1020  for (;;)
1021  {
1022  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1023  page = BufferGetPage(rbuf);
1024  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1025 
1026  /*
1027  * If this page was incompletely split, finish the split now. We do
1028  * this while holding a lock on the left sibling, which is not good
1029  * because finishing the split could be a fairly lengthy operation.
1030  * But this should happen very seldom.
1031  */
1032  if (P_INCOMPLETE_SPLIT(opaque))
1033  {
1034  _bt_finish_split(rel, rbuf, stack);
1035  rbuf = InvalidBuffer;
1036  continue;
1037  }
1038 
1039  if (!P_IGNORE(opaque))
1040  break;
1041  if (P_RIGHTMOST(opaque))
1042  elog(ERROR, "fell off the end of index \"%s\"",
1043  RelationGetRelationName(rel));
1044 
1045  rblkno = opaque->btpo_next;
1046  }
1047  /* rbuf locked; unlock buf, update state for caller */
1048  _bt_relbuf(rel, insertstate->buf);
1049  insertstate->buf = rbuf;
1050  insertstate->bounds_valid = false;
1051 }
1052 
1053 /*----------
1054  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1055  *
1056  * This recursive procedure does the following things:
1057  *
1058  * + if postingoff != 0, splits existing posting list tuple
1059  * (since it overlaps with new 'itup' tuple).
1060  * + if necessary, splits the target page, using 'itup_key' for
1061  * suffix truncation on leaf pages (caller passes NULL for
1062  * non-leaf pages).
1063  * + inserts the new tuple (might be split from posting list).
1064  * + if the page was split, pops the parent stack, and finds the
1065  * right place to insert the new child pointer (by walking
1066  * right using information stored in the parent stack).
1067  * + invokes itself with the appropriate tuple for the right
1068  * child page on the parent.
1069  * + updates the metapage if a true root or fast root is split.
1070  *
1071  * On entry, we must have the correct buffer in which to do the
1072  * insertion, and the buffer must be pinned and write-locked. On return,
1073  * we will have dropped both the pin and the lock on the buffer.
1074  *
1075  * This routine only performs retail tuple insertions. 'itup' should
1076  * always be either a non-highkey leaf item, or a downlink (new high
1077  * key items are created indirectly, when a page is split). When
1078  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1079  * we're inserting the downlink for. This function will clear the
1080  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1081  *----------
1082  */
1083 static void
1084 _bt_insertonpg(Relation rel,
1085  BTScanInsert itup_key,
1086  Buffer buf,
1087  Buffer cbuf,
1088  BTStack stack,
1089  IndexTuple itup,
1090  Size itemsz,
1091  OffsetNumber newitemoff,
1092  int postingoff,
1093  bool split_only_page)
1094 {
1095  Page page;
1096  BTPageOpaque opaque;
1097  bool isleaf,
1098  isroot,
1099  isrightmost,
1100  isonly;
1101  IndexTuple oposting = NULL;
1102  IndexTuple origitup = NULL;
1103  IndexTuple nposting = NULL;
1104 
1105  page = BufferGetPage(buf);
1106  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1107  isleaf = P_ISLEAF(opaque);
1108  isroot = P_ISROOT(opaque);
1109  isrightmost = P_RIGHTMOST(opaque);
1110  isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1111 
1112  /* child buffer must be given iff inserting on an internal page */
1113  Assert(isleaf == !BufferIsValid(cbuf));
1114  /* tuple must have appropriate number of attributes */
1115  Assert(!isleaf ||
1116  BTreeTupleGetNAtts(itup, rel) ==
1117  IndexRelationGetNumberOfAttributes(rel));
1118  Assert(isleaf ||
1119  BTreeTupleGetNAtts(itup, rel) <=
1120  IndexRelationGetNumberOfKeyAttributes(rel));
1121  Assert(!BTreeTupleIsPosting(itup));
1122  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1123  /* Caller must always finish incomplete split for us */
1124  Assert(!P_INCOMPLETE_SPLIT(opaque));
1125 
1126  /*
1127  * Every internal page should have exactly one negative infinity item at
1128  * all times. Only _bt_split() and _bt_newroot() should add items that
1129  * become negative infinity items through truncation, since they're the
1130  * only routines that allocate new internal pages.
1131  */
1132  Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1133 
1134  /*
1135  * Do we need to split an existing posting list item?
1136  */
1137  if (postingoff != 0)
1138  {
1139  ItemId itemid = PageGetItemId(page, newitemoff);
1140 
1141  /*
1142  * The new tuple is a duplicate with a heap TID that falls inside the
1143  * range of an existing posting list tuple on a leaf page. Prepare to
1144  * split an existing posting list. Overwriting the posting list with
1145  * its post-split version is treated as an extra step in either the
1146  * insert or page split critical section.
1147  */
1148  Assert(isleaf && !ItemIdIsDead(itemid));
1149  Assert(itup_key->heapkeyspace && itup_key->allequalimage);
1150  oposting = (IndexTuple) PageGetItem(page, itemid);
1151 
1152  /* use a mutable copy of itup as our itup from here on */
1153  origitup = itup;
1154  itup = CopyIndexTuple(origitup);
1155  nposting = _bt_swap_posting(itup, oposting, postingoff);
1156  /* itup now contains rightmost/max TID from oposting */
1157 
1158  /* Alter offset so that newitem goes after posting list */
1159  newitemoff = OffsetNumberNext(newitemoff);
1160  }
1161 
1162  /*
1163  * Do we need to split the page to fit the item on it?
1164  *
1165  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1166  * so this comparison is correct even though we appear to be accounting
1167  * only for the item and not for its line pointer.
1168  */
1169  if (PageGetFreeSpace(page) < itemsz)
1170  {
1171  Buffer rbuf;
1172 
1173  Assert(!split_only_page);
1174 
1175  /* split the buffer into left and right halves */
1176  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
1177  origitup, nposting, postingoff);
1178  PredicateLockPageSplit(rel,
1179  BufferGetBlockNumber(buf),
1180  BufferGetBlockNumber(rbuf));
1181 
1182  /*----------
1183  * By here,
1184  *
1185  * + our target page has been split;
1186  * + the original tuple has been inserted;
1187  * + we have write locks on both the old (left half)
1188  * and new (right half) buffers, after the split; and
1189  * + we know the key we want to insert into the parent
1190  * (it's the "high key" on the left child page).
1191  *
1192  * We're ready to do the parent insertion. We need to hold onto the
1193  * locks for the child pages until we locate the parent, but we can
1194  * at least release the lock on the right child before doing the
1195  * actual insertion. The lock on the left child will be released
1196  * last of all by parent insertion, where it is the 'cbuf' of parent
1197  * page.
1198  *----------
1199  */
1200  _bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
1201  }
1202  else
1203  {
1204  Buffer metabuf = InvalidBuffer;
1205  Page metapg = NULL;
1206  BTMetaPageData *metad = NULL;
1207  BlockNumber blockcache;
1208 
1209  /*
1210  * If we are doing this insert because we split a page that was the
1211  * only one on its tree level, but was not the root, it may have been
1212  * the "fast root". We need to ensure that the fast root link points
1213  * at or above the current page. We can safely acquire a lock on the
1214  * metapage here --- see comments for _bt_newroot().
1215  */
1216  if (unlikely(split_only_page))
1217  {
1218  Assert(!isleaf);
1219  Assert(BufferIsValid(cbuf));
1220 
1221  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1222  metapg = BufferGetPage(metabuf);
1223  metad = BTPageGetMeta(metapg);
1224 
1225  if (metad->btm_fastlevel >= opaque->btpo.level)
1226  {
1227  /* no update wanted */
1228  _bt_relbuf(rel, metabuf);
1229  metabuf = InvalidBuffer;
1230  }
1231  }
1232 
1233  /* Do the update. No ereport(ERROR) until changes are logged */
1234  START_CRIT_SECTION();
1235 
1236  if (postingoff != 0)
1237  memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1238 
1239  if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1240  false) == InvalidOffsetNumber)
1241  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1242  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1243 
1244  MarkBufferDirty(buf);
1245 
1246  if (BufferIsValid(metabuf))
1247  {
1248  /* upgrade meta-page if needed */
1249  if (metad->btm_version < BTREE_NOVAC_VERSION)
1250  _bt_upgrademetapage(metapg);
1251  metad->btm_fastroot = BufferGetBlockNumber(buf);
1252  metad->btm_fastlevel = opaque->btpo.level;
1253  MarkBufferDirty(metabuf);
1254  }
1255 
1256  /*
1257  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1258  * finishes a split
1259  */
1260  if (!isleaf)
1261  {
1262  Page cpage = BufferGetPage(cbuf);
1263  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1264 
1265  Assert(P_INCOMPLETE_SPLIT(cpageop));
1266  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1267  MarkBufferDirty(cbuf);
1268  }
1269 
1270  /* XLOG stuff */
1271  if (RelationNeedsWAL(rel))
1272  {
1273  xl_btree_insert xlrec;
1274  xl_btree_metadata xlmeta;
1275  uint8 xlinfo;
1276  XLogRecPtr recptr;
1277  uint16 upostingoff;
1278 
1279  xlrec.offnum = newitemoff;
1280 
1281  XLogBeginInsert();
1282  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1283 
1284  if (isleaf && postingoff == 0)
1285  {
1286  /* Simple leaf insert */
1287  xlinfo = XLOG_BTREE_INSERT_LEAF;
1288  }
1289  else if (postingoff != 0)
1290  {
1291  /*
1292  * Leaf insert with posting list split. Must include
1293  * postingoff field before newitem/orignewitem.
1294  */
1295  Assert(isleaf);
1296  xlinfo = XLOG_BTREE_INSERT_POST;
1297  }
1298  else
1299  {
1300  /* Internal page insert, which finishes a split on cbuf */
1301  xlinfo = XLOG_BTREE_INSERT_UPPER;
1302  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1303 
1304  if (BufferIsValid(metabuf))
1305  {
1306  /* Actually, it's an internal page insert + meta update */
1307  xlinfo = XLOG_BTREE_INSERT_META;
1308 
1309  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1310  xlmeta.version = metad->btm_version;
1311  xlmeta.root = metad->btm_root;
1312  xlmeta.level = metad->btm_level;
1313  xlmeta.fastroot = metad->btm_fastroot;
1314  xlmeta.fastlevel = metad->btm_fastlevel;
1315  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1316  xlmeta.last_cleanup_num_heap_tuples =
1317  metad->btm_last_cleanup_num_heap_tuples;
1318  xlmeta.allequalimage = metad->btm_allequalimage;
1319 
1320  XLogRegisterBuffer(2, metabuf,
1321  REGBUF_WILL_INIT | REGBUF_STANDARD);
1322  XLogRegisterBufData(2, (char *) &xlmeta,
1323  sizeof(xl_btree_metadata));
1324  }
1325  }
1326 
1327  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1328  if (postingoff == 0)
1329  {
1330  /* Just log itup from caller */
1331  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1332  }
1333  else
1334  {
1335  /*
1336  * Insert with posting list split (XLOG_BTREE_INSERT_POST
1337  * record) case.
1338  *
1339  * Log postingoff. Also log origitup, not itup. REDO routine
1340  * must reconstruct final itup (as well as nposting) using
1341  * _bt_swap_posting().
1342  */
1343  upostingoff = postingoff;
1344 
1345  XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1346  XLogRegisterBufData(0, (char *) origitup,
1347  IndexTupleSize(origitup));
1348  }
1349 
1350  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1351 
1352  if (BufferIsValid(metabuf))
1353  PageSetLSN(metapg, recptr);
1354  if (!isleaf)
1355  PageSetLSN(BufferGetPage(cbuf), recptr);
1356 
1357  PageSetLSN(page, recptr);
1358  }
1359 
1360  END_CRIT_SECTION();
1361 
1362  /* Release subsidiary buffers */
1363  if (BufferIsValid(metabuf))
1364  _bt_relbuf(rel, metabuf);
1365  if (!isleaf)
1366  _bt_relbuf(rel, cbuf);
1367 
1368  /*
1369  * Cache the block number if this is the rightmost leaf page. Cache
1370  * may be used by a future inserter within _bt_search_insert().
1371  */
1372  blockcache = InvalidBlockNumber;
1373  if (isrightmost && isleaf && !isroot)
1374  blockcache = BufferGetBlockNumber(buf);
1375 
1376  /* Release buffer for insertion target block */
1377  _bt_relbuf(rel, buf);
1378 
1379  /*
1380  * If we decided to cache the insertion target block before releasing
1381  * its buffer lock, then cache it now. Check the height of the tree
1382  * first, though. We don't go for the optimization with small
1383  * indexes. Defer final check to this point to ensure that we don't
1384  * call _bt_getrootheight while holding a buffer lock.
1385  */
1386  if (BlockNumberIsValid(blockcache) &&
1387  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1388  RelationSetTargetBlock(rel, blockcache);
1389  }
1390 
1391  /* be tidy */
1392  if (postingoff != 0)
1393  {
1394  /* itup is actually a modified copy of caller's original */
1395  pfree(nposting);
1396  pfree(itup);
1397  }
1398 }
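/*
 * Editorial aside, not part of nbtinsert.c: to summarize the WAL cases in
 * the non-split path of _bt_insertonpg() -- XLOG_BTREE_INSERT_LEAF covers a
 * plain leaf insert; XLOG_BTREE_INSERT_POST is used when a posting list
 * split piggybacks on the insert (postingoff is logged together with
 * orignewitem so that REDO can rerun _bt_swap_posting()); and
 * XLOG_BTREE_INSERT_UPPER covers an internal page insert that completes a
 * child split, escalating to XLOG_BTREE_INSERT_META when that insert also
 * has to update the metapage's fast root.
 */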
1399 
1400 /*
1401  * _bt_split() -- split a page in the btree.
1402  *
1403  * On entry, buf is the page to split, and is pinned and write-locked.
1404  * newitemoff etc. tell us about the new item that must be inserted
1405  * along with the data from the original page.
1406  *
1407  * itup_key is used for suffix truncation on leaf pages (internal
1408  * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1409  * is the left-sibling of the page we're inserting the downlink for.
1410  * This function will clear the INCOMPLETE_SPLIT flag on it, and
1411  * release the buffer.
1412  *
1413  * orignewitem, nposting, and postingoff are needed when an insert of
1414  * orignewitem results in both a posting list split and a page split.
1415  * These extra posting list split details are used here in the same
1416  * way as they are used in the more common case where a posting list
1417  * split does not coincide with a page split. We need to deal with
1418  * posting list splits directly in order to ensure that everything
1419  * that follows from the insert of orignewitem is handled as a single
1420  * atomic operation (though caller's insert of a new pivot/downlink
1421  * into parent page will still be a separate operation). See
1422  * nbtree/README for details on the design of posting list splits.
1423  *
1424  * Returns the new right sibling of buf, pinned and write-locked.
1425  * The pin and lock on buf are maintained.
1426  */
1427 static Buffer
1428 _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
1429  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1430  IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1431 {
1432  Buffer rbuf;
1433  Page origpage;
1434  Page leftpage,
1435  rightpage;
1436  BlockNumber origpagenumber,
1437  rightpagenumber;
1438  BTPageOpaque ropaque,
1439  lopaque,
1440  oopaque;
1441  Buffer sbuf = InvalidBuffer;
1442  Page spage = NULL;
1443  BTPageOpaque sopaque = NULL;
1444  Size itemsz;
1445  ItemId itemid;
1446  IndexTuple firstright,
1447  lefthighkey;
1448  OffsetNumber firstrightoff;
1449  OffsetNumber afterleftoff,
1450  afterrightoff,
1451  minusinfoff;
1452  OffsetNumber origpagepostingoff;
1453  OffsetNumber maxoff;
1454  OffsetNumber i;
1455  bool newitemonleft,
1456  isleaf,
1457  isrightmost;
1458 
1459  /*
1460  * origpage is the original page to be split. leftpage is a temporary
1461  * buffer that receives the left-sibling data, which will be copied back
1462  * into origpage on success. rightpage is the new page that will receive
1463  * the right-sibling data.
1464  *
1465  * leftpage is allocated after choosing a split point. rightpage's new
1466  * buffer isn't acquired until after leftpage is initialized and has new
1467  * high key, the last point where splitting the page may fail (barring
1468  * corruption). Failing before acquiring new buffer won't have lasting
1469  * consequences, since origpage won't have been modified and leftpage is
1470  * only workspace.
1471  */
1472  origpage = BufferGetPage(buf);
1473  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1474  isleaf = P_ISLEAF(oopaque);
1475  isrightmost = P_RIGHTMOST(oopaque);
1476  maxoff = PageGetMaxOffsetNumber(origpage);
1477  origpagenumber = BufferGetBlockNumber(buf);
1478 
1479  /*
1480  * Choose a point to split origpage at.
1481  *
1482  * A split point can be thought of as a point _between_ two existing data
1483  * items on origpage (the lastleft and firstright tuples), provided you
1484  * pretend that the new item that didn't fit is already on origpage.
1485  *
1486  * Since origpage does not actually contain newitem, the representation of
1487  * split points needs to work with two boundary cases: splits where
1488  * newitem is lastleft, and splits where newitem is firstright.
1489  * newitemonleft resolves the ambiguity that would otherwise exist when
1490  * newitemoff == firstrightoff. In all other cases it's clear which side
1491  * of the split every tuple goes on from context. newitemonleft is
1492  * usually (but not always) redundant information.
1493  *
1494  * firstrightoff is supposed to be an origpage offset number, but it's
1495  * possible that its value will be maxoff+1, which is "past the end" of
1496  * origpage. This happens in the rare case where newitem goes after all
1497  * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1498  * origpage at the point that leaves newitem alone on new right page. Any
1499  * "!newitemonleft && newitemoff == firstrightoff" split point makes
1500  * newitem the firstright tuple, though, so this case isn't a special
1501  * case.
1502  */
1503  firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1504  newitem, &newitemonleft);
1505 
1506  /* Allocate temp buffer for leftpage */
1507  leftpage = PageGetTempPage(origpage);
1508  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1509  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1510 
1511  /*
1512  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1513  * and HAS_GARBAGE flags.
1514  */
1515  lopaque->btpo_flags = oopaque->btpo_flags;
1516  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1517  /* set flag in leftpage indicating that rightpage has no downlink yet */
1518  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1519  lopaque->btpo_prev = oopaque->btpo_prev;
1520  /* handle btpo_next after rightpage buffer acquired */
1521  lopaque->btpo.level = oopaque->btpo.level;
1522  /* handle btpo_cycleid after rightpage buffer acquired */
1523 
1524  /*
1525  * Copy the original page's LSN into leftpage, which will become the
1526  * updated version of the page. We need this because XLogInsert will
1527  * examine the LSN and possibly dump it in a page image.
1528  */
1529  PageSetLSN(leftpage, PageGetLSN(origpage));
1530 
1531  /*
1532  * Determine page offset number of existing overlapped-with-orignewitem
1533  * posting list when it is necessary to perform a posting list split in
1534  * passing. Note that newitem was already changed by caller (newitem no
1535  * longer has the orignewitem TID).
1536  *
1537  * This page offset number (origpagepostingoff) will be used to pretend
1538  * that the posting split has already taken place, even though the
1539  * required modifications to origpage won't occur until we reach the
1540  * critical section. The lastleft and firstright tuples of our page split
1541  * point should, in effect, come from an imaginary version of origpage
1542  * that has the nposting tuple instead of the original posting list tuple.
1543  *
1544  * Note: _bt_findsplitloc() should have compensated for coinciding posting
1545  * list splits in just the same way, at least in theory. It doesn't
1546  * bother with that, though. In practice it won't affect its choice of
1547  * split point.
1548  */
1549  origpagepostingoff = InvalidOffsetNumber;
1550  if (postingoff != 0)
1551  {
1552  Assert(isleaf);
1553  Assert(ItemPointerCompare(&orignewitem->t_tid,
1554  &newitem->t_tid) < 0);
1555  Assert(BTreeTupleIsPosting(nposting));
1556  origpagepostingoff = OffsetNumberPrev(newitemoff);
1557  }
1558 
1559  /*
1560  * The high key for the new left page is a possibly-truncated copy of
1561  * firstright on the leaf level (it's "firstright itself" on internal
1562  * pages; see !isleaf comments below). This may seem to be contrary to
1563  * Lehman & Yao's approach of using a copy of lastleft as the new high key
1564  * when splitting on the leaf level. It isn't, though.
1565  *
1566  * Suffix truncation will leave the left page's high key fully equal to
1567  * lastleft when lastleft and firstright are equal prior to heap TID (that
1568  * is, the tiebreaker TID value comes from lastleft). It isn't actually
1569  * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1570  * "subtree" invariant to hold. It's sufficient to make sure that the new
1571  * leaf high key is strictly less than firstright, and greater than or
1572  * equal to (not necessarily equal to) lastleft. In other words, when
1573  * suffix truncation isn't possible during a leaf page split, we take
1574  * L&Y's exact approach to generating a new high key for the left page.
1575  * (Actually, that is slightly inaccurate. We don't just use a copy of
1576  * lastleft. A tuple with all the keys from firstright but the max heap
1577  * TID from lastleft is used, to avoid introducing a special case.)
1578  */
1579  if (!newitemonleft && newitemoff == firstrightoff)
1580  {
1581  /* incoming tuple becomes firstright */
1582  itemsz = newitemsz;
1583  firstright = newitem;
1584  }
1585  else
1586  {
1587  /* existing item at firstrightoff becomes firstright */
1588  itemid = PageGetItemId(origpage, firstrightoff);
1589  itemsz = ItemIdGetLength(itemid);
1590  firstright = (IndexTuple) PageGetItem(origpage, itemid);
1591  if (firstrightoff == origpagepostingoff)
1592  firstright = nposting;
1593  }
1594 
1595  if (isleaf)
1596  {
1597  IndexTuple lastleft;
1598 
1599  /* Attempt suffix truncation for leaf page splits */
1600  if (newitemonleft && newitemoff == firstrightoff)
1601  {
1602  /* incoming tuple becomes lastleft */
1603  lastleft = newitem;
1604  }
1605  else
1606  {
1607  OffsetNumber lastleftoff;
1608 
1609  /* existing item before firstrightoff becomes lastleft */
1610  lastleftoff = OffsetNumberPrev(firstrightoff);
1611  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1612  itemid = PageGetItemId(origpage, lastleftoff);
1613  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1614  if (lastleftoff == origpagepostingoff)
1615  lastleft = nposting;
1616  }
1617 
1618  lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1619  itemsz = IndexTupleSize(lefthighkey);
1620  }
1621  else
1622  {
1623  /*
1624  * Don't perform suffix truncation on a copy of firstright to make
1625  * left page high key for internal page splits. Must use firstright
1626  * as new high key directly.
1627  *
1628  * Each distinct separator key value originates as a leaf level high
1629  * key; all other separator keys/pivot tuples are copied from one
1630  * level down. A separator key in a grandparent page must be
1631  * identical to high key in rightmost parent page of the subtree to
1632  * its left, which must itself be identical to high key in rightmost
1633  * child page of that same subtree (this even applies to separator
1634  * from grandparent's high key). There must always be an unbroken
1635  * "seam" of identical separator keys that guide index scans at every
1636  * level, starting from the grandparent. That's why suffix truncation
1637  * is unsafe here.
1638  *
1639  * Internal page splits will truncate firstright into a "negative
1640  * infinity" data item when it gets inserted on the new right page
1641  * below, though. This happens during the call to _bt_pgaddtup() for
1642  * the new first data item for right page. Do not confuse this
1643  * mechanism with suffix truncation. It is just a convenient way of
1644  * implementing page splits that split the internal page "inside"
1645  * firstright. The lefthighkey separator key cannot appear a second
1646  * time in the right page (only firstright's downlink goes in right
1647  * page).
1648  */
1649  lefthighkey = firstright;
1650  }
1651 
1652  /*
1653  * Add new high key to leftpage
1654  */
1655  afterleftoff = P_HIKEY;
1656 
1657  Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1658  Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1659  IndexRelationGetNumberOfKeyAttributes(rel));
1660  Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1661  if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1662  false) == InvalidOffsetNumber)
1663  elog(ERROR, "failed to add high key to the left sibling"
1664  " while splitting block %u of index \"%s\"",
1665  origpagenumber, RelationGetRelationName(rel));
1666  afterleftoff = OffsetNumberNext(afterleftoff);
1667 
1668  /*
1669  * Acquire a new right page to split into, now that left page has a new
1670  * high key. From here on, it's not okay to throw an error without
1671  * zeroing rightpage first. This coding rule ensures that we won't
1672  * confuse future VACUUM operations, which might otherwise try to re-find
1673  * a downlink to a leftover junk page as the page undergoes deletion.
1674  *
1675  * It would be reasonable to start the critical section just after the new
1676  * rightpage buffer is acquired instead; that would allow us to avoid
1677  * leftover junk pages without bothering to zero rightpage. We do it this
1678  * way because it avoids an unnecessary PANIC when either origpage or its
1679  * existing sibling page is corrupt.
1680  */
1681  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1682  rightpage = BufferGetPage(rbuf);
1683  rightpagenumber = BufferGetBlockNumber(rbuf);
1684  /* rightpage was initialized by _bt_getbuf */
1685  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1686 
1687  /*
1688  * Finish off remaining leftpage special area fields. They cannot be set
1689  * before both origpage (leftpage) and rightpage buffers are acquired and
1690  * locked.
1691  *
1692  * btpo_cycleid is only used with leaf pages, though we set it here in all
1693  * cases just to be consistent.
1694  */
1695  lopaque->btpo_next = rightpagenumber;
1696  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1697 
1698  /*
1699  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1700  * and HAS_GARBAGE flags.
1701  */
1702  ropaque->btpo_flags = oopaque->btpo_flags;
1703  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1704  ropaque->btpo_prev = origpagenumber;
1705  ropaque->btpo_next = oopaque->btpo_next;
1706  ropaque->btpo.level = oopaque->btpo.level;
1707  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1708 
1709  /*
1710  * Add new high key to rightpage where necessary.
1711  *
1712  * If the page we're splitting is not the rightmost page at its level in
1713  * the tree, then the first entry on the page is the high key from
1714  * origpage.
1715  */
1716  afterrightoff = P_HIKEY;
1717 
1718  if (!isrightmost)
1719  {
1720  IndexTuple righthighkey;
1721 
1722  itemid = PageGetItemId(origpage, P_HIKEY);
1723  itemsz = ItemIdGetLength(itemid);
1724  righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1725  Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1726  Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1727  IndexRelationGetNumberOfKeyAttributes(rel));
1728  if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1729  false, false) == InvalidOffsetNumber)
1730  {
1731  memset(rightpage, 0, BufferGetPageSize(rbuf));
1732  elog(ERROR, "failed to add high key to the right sibling"
1733  " while splitting block %u of index \"%s\"",
1734  origpagenumber, RelationGetRelationName(rel));
1735  }
1736  afterrightoff = OffsetNumberNext(afterrightoff);
1737  }
1738 
1739  /*
1740  * Internal page splits truncate first data item on right page -- it
1741  * becomes "minus infinity" item for the page. Set this up here.
1742  */
1743  minusinfoff = InvalidOffsetNumber;
1744  if (!isleaf)
1745  minusinfoff = afterrightoff;
1746 
1747  /*
1748  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1749  * additional pivot tuples in !isleaf case) to the appropriate page.
1750  *
1751  * Note: we *must* insert at least the right page's items in item-number
1752  * order, for the benefit of _bt_restore_page().
1753  */
1754  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1755  {
1756  IndexTuple dataitem;
1757 
1758  itemid = PageGetItemId(origpage, i);
1759  itemsz = ItemIdGetLength(itemid);
1760  dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1761 
1762  /* replace original item with nposting due to posting split? */
1763  if (i == origpagepostingoff)
1764  {
1765  Assert(BTreeTupleIsPosting(dataitem));
1766  Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1767  dataitem = nposting;
1768  }
1769 
1770  /* does new item belong before this one? */
1771  else if (i == newitemoff)
1772  {
1773  if (newitemonleft)
1774  {
1775  Assert(newitemoff <= firstrightoff);
1776  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1777  false))
1778  {
1779  memset(rightpage, 0, BufferGetPageSize(rbuf));
1780  elog(ERROR, "failed to add new item to the left sibling"
1781  " while splitting block %u of index \"%s\"",
1782  origpagenumber, RelationGetRelationName(rel));
1783  }
1784  afterleftoff = OffsetNumberNext(afterleftoff);
1785  }
1786  else
1787  {
1788  Assert(newitemoff >= firstrightoff);
1789  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1790  afterrightoff == minusinfoff))
1791  {
1792  memset(rightpage, 0, BufferGetPageSize(rbuf));
1793  elog(ERROR, "failed to add new item to the right sibling"
1794  " while splitting block %u of index \"%s\"",
1795  origpagenumber, RelationGetRelationName(rel));
1796  }
1797  afterrightoff = OffsetNumberNext(afterrightoff);
1798  }
1799  }
1800 
1801  /* decide which page to put it on */
1802  if (i < firstrightoff)
1803  {
1804  if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1805  {
1806  memset(rightpage, 0, BufferGetPageSize(rbuf));
1807  elog(ERROR, "failed to add old item to the left sibling"
1808  " while splitting block %u of index \"%s\"",
1809  origpagenumber, RelationGetRelationName(rel));
1810  }
1811  afterleftoff = OffsetNumberNext(afterleftoff);
1812  }
1813  else
1814  {
1815  if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1816  afterrightoff == minusinfoff))
1817  {
1818  memset(rightpage, 0, BufferGetPageSize(rbuf));
1819  elog(ERROR, "failed to add old item to the right sibling"
1820  " while splitting block %u of index \"%s\"",
1821  origpagenumber, RelationGetRelationName(rel));
1822  }
1823  afterrightoff = OffsetNumberNext(afterrightoff);
1824  }
1825  }
1826 
1827  /* Handle case where newitem goes at the end of rightpage */
1828  if (i <= newitemoff)
1829  {
1830  /*
1831  * Can't have newitemonleft here; that would imply we were told to put
1832  * *everything* on the left page, which cannot fit (if it could, we'd
1833  * not be splitting the page).
1834  */
1835  Assert(!newitemonleft && newitemoff == maxoff + 1);
1836  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1837  afterrightoff == minusinfoff))
1838  {
1839  memset(rightpage, 0, BufferGetPageSize(rbuf));
1840  elog(ERROR, "failed to add new item to the right sibling"
1841  " while splitting block %u of index \"%s\"",
1842  origpagenumber, RelationGetRelationName(rel));
1843  }
1844  afterrightoff = OffsetNumberNext(afterrightoff);
1845  }
1846 
1847  /*
1848  * We have to grab the original right sibling (if any) and update its prev
1849  * link. We are guaranteed that this is deadlock-free, since we couple
1850  * the locks in the standard order: left to right.
1851  */
1852  if (!isrightmost)
1853  {
1854  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1855  spage = BufferGetPage(sbuf);
1856  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1857  if (sopaque->btpo_prev != origpagenumber)
1858  {
1859  memset(rightpage, 0, BufferGetPageSize(rbuf));
1860  ereport(ERROR,
1861  (errcode(ERRCODE_INDEX_CORRUPTED),
1862  errmsg_internal("right sibling's left-link doesn't match: "
1863  "block %u links to %u instead of expected %u in index \"%s\"",
1864  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1865  RelationGetRelationName(rel))));
1866  }
1867 
1868  /*
1869  * Check to see if we can set the SPLIT_END flag in the right-hand
1870  * split page; this can save some I/O for vacuum since it need not
1871  * proceed to the right sibling. We can set the flag if the right
1872  * sibling has a different cycleid: that means it could not be part of
1873  * a group of pages that were all split off from the same ancestor
1874  * page. If you're confused, imagine that page A splits to A B and
1875  * then again, yielding A C B, while vacuum is in progress. Tuples
1876  * originally in A could now be in either B or C, hence vacuum must
1877  * examine both pages. But if D, our right sibling, has a different
1878  * cycleid then it could not contain any tuples that were in A when
1879  * the vacuum started.
1880  */
1881  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1882  ropaque->btpo_flags |= BTP_SPLIT_END;
1883  }
1884 
1885  /*
1886  * Right sibling is locked, new siblings are prepared, but original page
1887  * is not updated yet.
1888  *
1889  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1890  * not starting the critical section till here because we haven't been
1891  * scribbling on the original page yet; see comments above.
1892  */
1893  START_CRIT_SECTION();
1894 
1895  /*
1896  * By here, the original data page has been split into two new halves, and
1897  * these are correct. The algorithm requires that the left page never
1898  * move during a split, so we copy the new left page back on top of the
1899  * original. We need to do this before writing the WAL record, so that
1900  * XLogInsert can WAL log an image of the page if necessary.
1901  */
1902  PageRestoreTempPage(leftpage, origpage);
1903  /* leftpage, lopaque must not be used below here */
1904 
1905  MarkBufferDirty(buf);
1906  MarkBufferDirty(rbuf);
1907 
1908  if (!isrightmost)
1909  {
1910  sopaque->btpo_prev = rightpagenumber;
1911  MarkBufferDirty(sbuf);
1912  }
1913 
1914  /*
1915  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1916  * a split
1917  */
1918  if (!isleaf)
1919  {
1920  Page cpage = BufferGetPage(cbuf);
1921  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1922 
1923  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1924  MarkBufferDirty(cbuf);
1925  }
1926 
1927  /* XLOG stuff */
1928  if (RelationNeedsWAL(rel))
1929  {
1930  xl_btree_split xlrec;
1931  uint8 xlinfo;
1932  XLogRecPtr recptr;
1933 
1934  xlrec.level = ropaque->btpo.level;
1935  /* See comments below on newitem, orignewitem, and posting lists */
1936  xlrec.firstrightoff = firstrightoff;
1937  xlrec.newitemoff = newitemoff;
1938  xlrec.postingoff = 0;
1939  if (postingoff != 0 && origpagepostingoff < firstrightoff)
1940  xlrec.postingoff = postingoff;
1941 
1942  XLogBeginInsert();
1943  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1944 
1945  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1946  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1947  /* Log original right sibling, since we've changed its prev-pointer */
1948  if (!isrightmost)
1949  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1950  if (!isleaf)
1951  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1952 
1953  /*
1954  * Log the new item, if it was inserted on the left page. (If it was
1955  * put on the right page, we don't need to explicitly WAL log it
1956  * because it's included with all the other items on the right page.)
1957  * Show the new item as belonging to the left page buffer, so that it
1958  * is not stored if XLogInsert decides it needs a full-page image of
1959  * the left page. We always store newitemoff in the record, though.
1960  *
1961  * The details are sometimes slightly different for page splits that
1962  * coincide with a posting list split. If both the replacement
1963  * posting list and newitem go on the right page, then we don't need
1964  * to log anything extra, just like the simple !newitemonleft
1965  * no-posting-split case (postingoff is set to zero in the WAL record,
1966  * so recovery doesn't need to process a posting list split at all).
1967  * Otherwise, we set postingoff and log orignewitem instead of
1968  * newitem, despite having actually inserted newitem. REDO routine
1969  * must reconstruct nposting and newitem using _bt_swap_posting().
1970  *
1971  * Note: It's possible that our page split point is the point that
1972  * makes the posting list lastleft and newitem firstright. This is
1973  * the only case where we log orignewitem/newitem despite newitem
1974  * going on the right page. If XLogInsert decides that it can omit
1975  * orignewitem due to logging a full-page image of the left page,
1976  * everything still works out, since recovery only needs to log
1977  * orignewitem for items on the left page (just like the regular
1978  * newitem-logged case).
1979  */
1980  if (newitemonleft && xlrec.postingoff == 0)
1981  XLogRegisterBufData(0, (char *) newitem, newitemsz);
1982  else if (xlrec.postingoff != 0)
1983  {
1984  Assert(isleaf);
1985  Assert(newitemonleft || firstrightoff == newitemoff);
1986  Assert(newitemsz == IndexTupleSize(orignewitem));
1987  XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
1988  }
1989 
1990  /* Log the left page's new high key */
1991  if (!isleaf)
1992  {
1993  /* lefthighkey isn't local copy, get current pointer */
1994  itemid = PageGetItemId(origpage, P_HIKEY);
1995  lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1996  }
1997  XLogRegisterBufData(0, (char *) lefthighkey,
1998  MAXALIGN(IndexTupleSize(lefthighkey)));
1999 
2000  /*
2001  * Log the contents of the right page in the format understood by
2002  * _bt_restore_page(). The whole right page will be recreated.
2003  *
2004  * Direct access to page is not good but faster - we should implement
2005  * some new func in page API. Note we only store the tuples
2006  * themselves, knowing that they were inserted in item-number order
2007  * and so the line pointers can be reconstructed. See comments for
2008  * _bt_restore_page().
2009  */
2010  XLogRegisterBufData(1,
2011  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2012  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2013 
2014  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2015  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2016 
2017  PageSetLSN(origpage, recptr);
2018  PageSetLSN(rightpage, recptr);
2019  if (!isrightmost)
2020  PageSetLSN(spage, recptr);
2021  if (!isleaf)
2022  PageSetLSN(BufferGetPage(cbuf), recptr);
2023  }
2024 
2025  END_CRIT_SECTION();
2026 
2027  /* release the old right sibling */
2028  if (!isrightmost)
2029  _bt_relbuf(rel, sbuf);
2030 
2031  /* release the child */
2032  if (!isleaf)
2033  _bt_relbuf(rel, cbuf);
2034 
2035  /* be tidy */
2036  if (isleaf)
2037  pfree(lefthighkey);
2038 
2039  /* split's done */
2040  return rbuf;
2041 }
2042 
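
The destination of each tuple during the transfer loop above falls out of just three values: firstrightoff, newitemoff, and newitemonleft. The following is a minimal standalone sketch of that rule (plain C with invented values; not part of nbtinsert.c, and greatly simplified -- high keys and posting-list splits are ignored):

#include <stdio.h>

int
main(void)
{
    int         maxoff = 5;         /* existing data items at offsets 1..5 */
    int         newitemoff = 4;     /* offset the incoming tuple would occupy */
    int         firstrightoff = 4;  /* split point chosen by _bt_findsplitloc() */
    int         newitemonleft = 0;  /* breaks the tie when newitemoff == firstrightoff */

    for (int i = 1; i <= maxoff; i++)
    {
        /* incoming tuple is placed just before existing item i */
        if (i == newitemoff)
            printf("new item -> %s page\n", newitemonleft ? "left" : "right");
        /* existing items strictly before the split point stay on the left */
        printf("item %d   -> %s page\n", i, i < firstrightoff ? "left" : "right");
    }
    /* incoming tuple goes after every existing item: it always lands on the right */
    if (newitemoff > maxoff)
        printf("new item -> right page\n");
    return 0;
}
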
2043 /*
2044  * _bt_insert_parent() -- Insert downlink into parent, completing split.
2045  *
2046  * On entry, buf and rbuf are the left and right split pages, which we
2047  * still hold write locks on. Both locks will be released here. We
2048  * release the rbuf lock once we have a write lock on the page that we
2049  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2050  * The lock on buf is released at the same point as the lock on the parent
2051  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2052  * atomic operation that completes the split by inserting a new downlink.
2053  *
2054  * stack - stack showing how we got here. Will be NULL when splitting true
2055  * root, or during concurrent root split, where we can be inefficient
2056  * isroot - we split the true root
2057  * isonly - we split a page alone on its level (might have been fast root)
2058  */
2059 static void
2060 _bt_insert_parent(Relation rel,
2061  Buffer buf,
2062  Buffer rbuf,
2063  BTStack stack,
2064  bool isroot,
2065  bool isonly)
2066 {
2067  /*
2068  * Here we have to do something Lehman and Yao don't talk about: deal with
2069  * a root split and construction of a new root. If our stack is empty
2070  * then we have just split a node on what had been the root level when we
2071  * descended the tree. If it was still the root then we perform a
2072  * new-root construction. If it *wasn't* the root anymore, search to find
2073  * the next higher level that someone constructed meanwhile, and find the
2074  * right place to insert as for the normal case.
2075  *
2076  * If we have to search for the parent level, we do so by re-descending
2077  * from the root. This is not super-efficient, but it's rare enough not
2078  * to matter.
2079  */
2080  if (isroot)
2081  {
2082  Buffer rootbuf;
2083 
2084  Assert(stack == NULL);
2085  Assert(isonly);
2086  /* create a new root node and update the metapage */
2087  rootbuf = _bt_newroot(rel, buf, rbuf);
2088  /* release the split buffers */
2089  _bt_relbuf(rel, rootbuf);
2090  _bt_relbuf(rel, rbuf);
2091  _bt_relbuf(rel, buf);
2092  }
2093  else
2094  {
2095  BlockNumber bknum = BufferGetBlockNumber(buf);
2096  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2097  Page page = BufferGetPage(buf);
2098  IndexTuple new_item;
2099  BTStackData fakestack;
2100  IndexTuple ritem;
2101  Buffer pbuf;
2102 
2103  if (stack == NULL)
2104  {
2105  BTPageOpaque opaque;
2106 
2107  elog(DEBUG2, "concurrent ROOT page split");
2108  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2109 
2110  /*
2111  * We should never reach here when a leaf page split takes place
2112  * despite the insert of newitem being able to apply the fastpath
2113  * optimization. Make sure of that with an assertion.
2114  *
2115  * This is more of a performance issue than a correctness issue.
2116  * The fastpath won't have a descent stack. Using a phony stack
2117  * here works, but never rely on that. The fastpath should be
2118  * rejected within _bt_search_insert() when the rightmost leaf
2119  * page will split, since it's faster to go through _bt_search()
2120  * and get a stack in the usual way.
2121  */
2122  Assert(!(P_ISLEAF(opaque) &&
2123  BlockNumberIsValid(RelationGetTargetBlock(rel))));
2124 
2125  /* Find the leftmost page at the next level up */
2126  pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL);
2127  /* Set up a phony stack entry pointing there */
2128  stack = &fakestack;
2129  stack->bts_blkno = BufferGetBlockNumber(pbuf);
2130  stack->bts_offset = InvalidOffsetNumber;
2131  stack->bts_parent = NULL;
2132  _bt_relbuf(rel, pbuf);
2133  }
2134 
2135  /* get high key from left, a strict lower bound for new right page */
2136  ritem = (IndexTuple) PageGetItem(page,
2137  PageGetItemId(page, P_HIKEY));
2138 
2139  /* form an index tuple that points at the new right page */
2140  new_item = CopyIndexTuple(ritem);
2141  BTreeTupleSetDownLink(new_item, rbknum);
2142 
2143  /*
2144  * Re-find and write lock the parent of buf.
2145  *
2146  * It's possible that the location of buf's downlink has changed since
2147  * our initial _bt_search() descent. _bt_getstackbuf() will detect
2148  * and recover from this, updating the stack, which ensures that the
2149  * new downlink will be inserted at the correct offset. Even buf's
2150  * parent may have changed.
2151  */
2152  pbuf = _bt_getstackbuf(rel, stack, bknum);
2153 
2154  /*
2155  * Unlock the right child. The left child will be unlocked in
2156  * _bt_insertonpg().
2157  *
2158  * Unlocking the right child must be delayed until here to ensure that
2159  * no concurrent VACUUM operation can become confused. Page deletion
2160  * cannot be allowed to fail to re-find a downlink for the rbuf page.
2161  * (Actually, this is just a vestige of how things used to work. The
2162  * page deletion code is expected to check for the INCOMPLETE_SPLIT
2163  * flag on the left child. It won't attempt deletion of the right
2164  * child until the split is complete. Despite all this, we opt to
2165  * conservatively delay unlocking the right child until here.)
2166  */
2167  _bt_relbuf(rel, rbuf);
2168 
2169  if (pbuf == InvalidBuffer)
2170  ereport(ERROR,
2171  (errcode(ERRCODE_INDEX_CORRUPTED),
2172  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2173  RelationGetRelationName(rel), bknum, rbknum)));
2174 
2175  /* Recursively insert into the parent */
2176  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
2177  new_item, MAXALIGN(IndexTupleSize(new_item)),
2178  stack->bts_offset + 1, 0, isonly);
2179 
2180  /* be tidy */
2181  pfree(new_item);
2182  }
2183 }
2184 
2185 /*
2186  * _bt_finish_split() -- Finish an incomplete split
2187  *
2188  * A crash or other failure can leave a split incomplete. The insertion
2189  * routines won't allow insertion on a page that is incompletely split.
2190  * Before inserting on such a page, call _bt_finish_split().
2191  *
2192  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2193  * and unpinned.
2194  */
2195 void
2196 _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
2197 {
2198  Page lpage = BufferGetPage(lbuf);
2199  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2200  Buffer rbuf;
2201  Page rpage;
2202  BTPageOpaque rpageop;
2203  bool wasroot;
2204  bool wasonly;
2205 
2206  Assert(P_INCOMPLETE_SPLIT(lpageop));
2207 
2208  /* Lock right sibling, the one missing the downlink */
2209  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2210  rpage = BufferGetPage(rbuf);
2211  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2212 
2213  /* Could this be a root split? */
2214  if (!stack)
2215  {
2216  Buffer metabuf;
2217  Page metapg;
2218  BTMetaPageData *metad;
2219 
2220  /* acquire lock on the metapage */
2221  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2222  metapg = BufferGetPage(metabuf);
2223  metad = BTPageGetMeta(metapg);
2224 
2225  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2226 
2227  _bt_relbuf(rel, metabuf);
2228  }
2229  else
2230  wasroot = false;
2231 
2232  /* Was this the only page on the level before split? */
2233  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2234 
2235  elog(DEBUG1, "finishing incomplete split of %u/%u",
2236  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2237 
2238  _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
2239 }
2240 
2241 /*
2242  * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2243  * tuple whose downlink points to child page.
2244  *
2245  * Caller passes child's block number, which is used to identify
2246  * associated pivot tuple in parent page using a linear search that
2247  * matches on pivot's downlink/block number. The expected location of
2248  * the pivot tuple is taken from the stack one level above the child
2249  * page. This is used as a starting point. Insertions into the
2250  * parent level could cause the pivot tuple to move right; deletions
2251  * could cause it to move left, but not left of the page we previously
2252  * found it on.
2253  *
2254  * Caller can use its stack to relocate the pivot tuple/downlink for
2255  * any same-level page to the right of the page found by its initial
2256  * descent. This is necessary because of the possibility that caller
2257  * moved right to recover from a concurrent page split. It's also
2258  * convenient for certain callers to be able to step right when there
2259  * wasn't a concurrent page split, while still using their original
2260  * stack. For example, the checkingunique _bt_doinsert() case may
2261  * have to step right when there are many physical duplicates, and its
2262  * scantid forces an insertion to the right of the "first page the
2263  * value could be on". (This is also relied on by all of our callers
2264  * when dealing with !heapkeyspace indexes.)
2265  *
2266  * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2267  * tuple not found (should not happen). Adjusts bts_blkno &
2268  * bts_offset if changed. Page split caller should insert its new
2269  * pivot tuple for its new right sibling page on parent page, at the
2270  * offset number bts_offset + 1.
2271  */
2272 Buffer
2273 _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
2274 {
2275  BlockNumber blkno;
2276  OffsetNumber start;
2277 
2278  blkno = stack->bts_blkno;
2279  start = stack->bts_offset;
2280 
2281  for (;;)
2282  {
2283  Buffer buf;
2284  Page page;
2285  BTPageOpaque opaque;
2286 
2287  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2288  page = BufferGetPage(buf);
2289  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2290 
2291  if (P_INCOMPLETE_SPLIT(opaque))
2292  {
2293  _bt_finish_split(rel, buf, stack->bts_parent);
2294  continue;
2295  }
2296 
2297  if (!P_IGNORE(opaque))
2298  {
2299  OffsetNumber offnum,
2300  minoff,
2301  maxoff;
2302  ItemId itemid;
2303  IndexTuple item;
2304 
2305  minoff = P_FIRSTDATAKEY(opaque);
2306  maxoff = PageGetMaxOffsetNumber(page);
2307 
2308  /*
2309  * start = InvalidOffsetNumber means "search the whole page". We
2310  * need this test anyway due to possibility that page has a high
2311  * key now when it didn't before.
2312  */
2313  if (start < minoff)
2314  start = minoff;
2315 
2316  /*
2317  * Need this check too, to guard against possibility that page
2318  * split since we visited it originally.
2319  */
2320  if (start > maxoff)
2321  start = OffsetNumberNext(maxoff);
2322 
2323  /*
2324  * These loops will check every item on the page --- but in an
2325  * order that's attuned to the probability of where it actually
2326  * is. Scan to the right first, then to the left.
2327  */
2328  for (offnum = start;
2329  offnum <= maxoff;
2330  offnum = OffsetNumberNext(offnum))
2331  {
2332  itemid = PageGetItemId(page, offnum);
2333  item = (IndexTuple) PageGetItem(page, itemid);
2334 
2335  if (BTreeTupleGetDownLink(item) == child)
2336  {
2337  /* Return accurate pointer to where link is now */
2338  stack->bts_blkno = blkno;
2339  stack->bts_offset = offnum;
2340  return buf;
2341  }
2342  }
2343 
2344  for (offnum = OffsetNumberPrev(start);
2345  offnum >= minoff;
2346  offnum = OffsetNumberPrev(offnum))
2347  {
2348  itemid = PageGetItemId(page, offnum);
2349  item = (IndexTuple) PageGetItem(page, itemid);
2350 
2351  if (BTreeTupleGetDownLink(item) == child)
2352  {
2353  /* Return accurate pointer to where link is now */
2354  stack->bts_blkno = blkno;
2355  stack->bts_offset = offnum;
2356  return buf;
2357  }
2358  }
2359  }
2360 
2361  /*
2362  * The item we're looking for moved right at least one page.
2363  *
2364  * Lehman and Yao couple/chain locks when moving right here, which we
2365  * can avoid. See nbtree/README.
2366  */
2367  if (P_RIGHTMOST(opaque))
2368  {
2369  _bt_relbuf(rel, buf);
2370  return InvalidBuffer;
2371  }
2372  blkno = opaque->btpo_next;
2373  start = InvalidOffsetNumber;
2374  _bt_relbuf(rel, buf);
2375  }
2376 }
2377 
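
The search order used inside each parent page is worth seeing in isolation. Below is a minimal standalone sketch (plain C over an ordinary array; find_downlink and the sample values are invented for illustration, not PostgreSQL APIs): probe the remembered offset and everything to its right first, then fall back to scanning leftwards, and report failure so the caller can move right to the next page.

#include <stdio.h>

/* return the offset whose downlink matches child, or -1 if not on this "page" */
static int
find_downlink(const int *downlinks, int minoff, int maxoff, int start, int child)
{
    if (start < minoff)
        start = minoff;
    if (start > maxoff)
        start = maxoff + 1;

    for (int off = start; off <= maxoff; off++)     /* scan to the right first */
        if (downlinks[off] == child)
            return off;
    for (int off = start - 1; off >= minoff; off--) /* then scan to the left */
        if (downlinks[off] == child)
            return off;
    return -1;                                      /* caller steps right one page */
}

int
main(void)
{
    int         downlinks[8] = {0, 10, 11, 12, 13, 14, 15, 16}; /* offsets 1..7 */

    /* the stack remembered offset 5, but the pivot for child 12 moved left to 3 */
    printf("child 12 found at offset %d\n",
           find_downlink(downlinks, 1, 7, 5, 12));
    return 0;
}
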
2378 /*
2379  * _bt_newroot() -- Create a new root page for the index.
2380  *
2381  * We've just split the old root page and need to create a new one.
2382  * In order to do this, we add a new root page to the file, then lock
2383  * the metadata page and update it. This is guaranteed to be deadlock-
2384  * free, because all readers release their locks on the metadata page
2385  * before trying to lock the root, and all writers lock the root before
2386  * trying to lock the metadata page. We have a write lock on the old
2387  * root page, so we have not introduced any cycles into the waits-for
2388  * graph.
2389  *
2390  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2391  * locked. On exit, a new root page exists with entries for the
2392  * two new children, metapage is updated and unlocked/unpinned.
2393  * The new root buffer is returned to caller which has to unlock/unpin
2394  * lbuf, rbuf & rootbuf.
2395  */
2396 static Buffer
2397 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
2398 {
2399  Buffer rootbuf;
2400  Page lpage,
2401  rootpage;
2402  BlockNumber lbkno,
2403  rbkno;
2404  BlockNumber rootblknum;
2405  BTPageOpaque rootopaque;
2406  BTPageOpaque lopaque;
2407  ItemId itemid;
2408  IndexTuple item;
2409  IndexTuple left_item;
2410  Size left_item_sz;
2411  IndexTuple right_item;
2412  Size right_item_sz;
2413  Buffer metabuf;
2414  Page metapg;
2415  BTMetaPageData *metad;
2416 
2417  lbkno = BufferGetBlockNumber(lbuf);
2418  rbkno = BufferGetBlockNumber(rbuf);
2419  lpage = BufferGetPage(lbuf);
2420  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2421 
2422  /* get a new root page */
2423  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2424  rootpage = BufferGetPage(rootbuf);
2425  rootblknum = BufferGetBlockNumber(rootbuf);
2426 
2427  /* acquire lock on the metapage */
2428  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2429  metapg = BufferGetPage(metabuf);
2430  metad = BTPageGetMeta(metapg);
2431 
2432  /*
2433  * Create downlink item for left page (old root). The key value used is
2434  * "minus infinity", a sentinel value that's reliably less than any real
2435  * key value that could appear in the left page.
2436  */
2437  left_item_sz = sizeof(IndexTupleData);
2438  left_item = (IndexTuple) palloc(left_item_sz);
2439  left_item->t_info = left_item_sz;
2440  BTreeTupleSetDownLink(left_item, lbkno);
2441  BTreeTupleSetNAtts(left_item, 0, false);
2442 
2443  /*
2444  * Create downlink item for right page. The key for it is obtained from
2445  * the "high key" position in the left page.
2446  */
2447  itemid = PageGetItemId(lpage, P_HIKEY);
2448  right_item_sz = ItemIdGetLength(itemid);
2449  item = (IndexTuple) PageGetItem(lpage, itemid);
2450  right_item = CopyIndexTuple(item);
2451  BTreeTupleSetDownLink(right_item, rbkno);
2452 
2453  /* NO EREPORT(ERROR) from here till newroot op is logged */
2454  START_CRIT_SECTION();
2455 
2456  /* upgrade metapage if needed */
2457  if (metad->btm_version < BTREE_NOVAC_VERSION)
2458  _bt_upgrademetapage(metapg);
2459 
2460  /* set btree special data */
2461  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2462  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2463  rootopaque->btpo_flags = BTP_ROOT;
2464  rootopaque->btpo.level =
2465  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2466  rootopaque->btpo_cycleid = 0;
2467 
2468  /* update metapage data */
2469  metad->btm_root = rootblknum;
2470  metad->btm_level = rootopaque->btpo.level;
2471  metad->btm_fastroot = rootblknum;
2472  metad->btm_fastlevel = rootopaque->btpo.level;
2473 
2474  /*
2475  * Insert the left page pointer into the new root page. The root page is
2476  * the rightmost page on its level so there is no "high key" in it; the
2477  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2478  *
2479  * Note: we *must* insert the two items in item-number order, for the
2480  * benefit of _bt_restore_page().
2481  */
2482  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2483  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2484  false, false) == InvalidOffsetNumber)
2485  elog(PANIC, "failed to add leftkey to new root page"
2486  " while splitting block %u of index \"%s\"",
2487  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2488 
2489  /*
2490  * insert the right page pointer into the new root page.
2491  */
2492  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2493  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2494  IndexRelationGetNumberOfKeyAttributes(rel));
2495  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2496  false, false) == InvalidOffsetNumber)
2497  elog(PANIC, "failed to add rightkey to new root page"
2498  " while splitting block %u of index \"%s\"",
2499  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2500 
2501  /* Clear the incomplete-split flag in the left child */
2502  Assert(P_INCOMPLETE_SPLIT(lopaque));
2503  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2504  MarkBufferDirty(lbuf);
2505 
2506  MarkBufferDirty(rootbuf);
2507  MarkBufferDirty(metabuf);
2508 
2509  /* XLOG stuff */
2510  if (RelationNeedsWAL(rel))
2511  {
2512  xl_btree_newroot xlrec;
2513  XLogRecPtr recptr;
2514  xl_btree_metadata md;
2515 
2516  xlrec.rootblk = rootblknum;
2517  xlrec.level = metad->btm_level;
2518 
2519  XLogBeginInsert();
2520  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2521 
2522  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2523  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2524  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2525 
2526  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2527  md.version = metad->btm_version;
2528  md.root = rootblknum;
2529  md.level = metad->btm_level;
2530  md.fastroot = rootblknum;
2531  md.fastlevel = metad->btm_level;
2532  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2533  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2534  md.allequalimage = metad->btm_allequalimage;
2535 
2536  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2537 
2538  /*
2539  * Direct access to page is not good but faster - we should implement
2540  * some new func in page API.
2541  */
2542  XLogRegisterBufData(0,
2543  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2544  ((PageHeader) rootpage)->pd_special -
2545  ((PageHeader) rootpage)->pd_upper);
2546 
2547  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2548 
2549  PageSetLSN(lpage, recptr);
2550  PageSetLSN(rootpage, recptr);
2551  PageSetLSN(metapg, recptr);
2552  }
2553 
2554  END_CRIT_SECTION();
2555 
2556  /* done with metapage */
2557  _bt_relbuf(rel, metabuf);
2558 
2559  pfree(left_item);
2560  pfree(right_item);
2561 
2562  return rootbuf;
2563 }
2564 
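
The shape of the new root built above -- a "minus infinity" downlink to the old root followed by a downlink keyed by the old root's high key -- can be modelled with a couple of plain structs. This is only an illustrative sketch (the struct and values are invented, not PostgreSQL types):

#include <stdio.h>

struct toy_pivot
{
    int         natts;      /* key attributes kept; 0 means "minus infinity" */
    int         key;        /* single integer key, meaningful only when natts > 0 */
    unsigned    downlink;   /* child block number */
};

int
main(void)
{
    unsigned    leftblk = 3;        /* old root, now the left half */
    unsigned    rightblk = 9;       /* its new right sibling */
    int         lefthighkey = 42;   /* high key copied from the left page */

    /* every key sorts after "minus infinity", so smaller keys descend left */
    struct toy_pivot left_item = {0, 0, leftblk};
    /* keys >= the left page's high key descend to the right child */
    struct toy_pivot right_item = {1, lefthighkey, rightblk};

    printf("new root: [-inf -> block %u] [%d -> block %u]\n",
           left_item.downlink, right_item.key, right_item.downlink);
    return 0;
}
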
2565 /*
2566  * _bt_pgaddtup() -- add a data item to a particular page during split.
2567  *
2568  * The difference between this routine and a bare PageAddItem call is
2569  * that this code can deal with the first data item on an internal btree
2570  * page in passing. This data item (which is called "firstright" within
2571  * _bt_split()) has a key that must be treated as minus infinity after
2572  * the split. Therefore, we truncate away all attributes when caller
2573  * specifies it's the first data item on page (downlink is not changed,
2574  * though). This extra step is only needed for the right page of an
2575  * internal page split. There is no need to do this for the first data
2576  * item on the existing/left page, since that will already have been
2577  * truncated during an earlier page split.
2578  *
2579  * See _bt_split() for a high level explanation of why we truncate here.
2580  * Note that this routine has nothing to do with suffix truncation,
2581  * despite using some of the same infrastructure.
2582  */
2583 static inline bool
2584 _bt_pgaddtup(Page page,
2585  Size itemsize,
2586  IndexTuple itup,
2587  OffsetNumber itup_off,
2588  bool newfirstdataitem)
2589 {
2590  IndexTupleData trunctuple;
2591 
2592  if (newfirstdataitem)
2593  {
2594  trunctuple = *itup;
2595  trunctuple.t_info = sizeof(IndexTupleData);
2596  BTreeTupleSetNAtts(&trunctuple, 0, false);
2597  itup = &trunctuple;
2598  itemsize = sizeof(IndexTupleData);
2599  }
2600 
2601  if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2602  false) == InvalidOffsetNumber))
2603  return false;
2604 
2605  return true;
2606 }
2607 
2608 /*
2609  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split by attempting
2610  * a variety of operations.
2611  *
2612  * There are two operations performed here: deleting items already marked
2613  * LP_DEAD, and deduplication. If both operations fail to free enough space
2614  * for the incoming item then caller will go on to split the page. We always
2615  * attempt our preferred strategy (which is to delete items whose LP_DEAD bit
2616  * are set) first. If that doesn't work out we move on to deduplication.
2617  *
2618  * Caller's checkingunique and uniquedup arguments help us decide if we should
2619  * perform deduplication, which is primarily useful with low cardinality data,
2620  * but can sometimes absorb version churn.
2621  *
2622  * Callers that only want us to look for/delete LP_DEAD items can ask for that
2623  * directly by passing a true 'lpdeadonly' argument.
2624  *
2625  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2626  * level flag was found set. The flag was useful back when there wasn't
2627  * necessarily one single page for a duplicate tuple to go on (before heap TID
2628  * became a part of the key space in version 4 indexes). But we don't
2629  * actually look at the flag anymore (it's not a gating condition for our
2630  * caller). That would cause us to miss tuples that are safe to delete,
2631  * without getting any benefit in return. We know that the alternative is to
2632  * split the page; scanning the line pointer array in passing won't have
2633  * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2634  * all this because !heapkeyspace indexes must still do a "getting tired"
2635  * linear search, and so are likely to get some benefit from using it as a
2636  * gating condition.)
2637  */
2638 static void
2639 _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2640  BTInsertState insertstate,
2641  bool lpdeadonly, bool checkingunique,
2642  bool uniquedup)
2643 {
2644  OffsetNumber deletable[MaxIndexTuplesPerPage];
2645  int ndeletable = 0;
2646  OffsetNumber offnum,
2647  maxoff;
2648  Buffer buffer = insertstate->buf;
2649  BTScanInsert itup_key = insertstate->itup_key;
2650  Page page = BufferGetPage(buffer);
2651  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2652 
2653  Assert(P_ISLEAF(opaque));
2654  Assert(lpdeadonly || itup_key->heapkeyspace);
2655 
2656  /*
2657  * Scan over all items to see which ones need to be deleted according to
2658  * LP_DEAD flags.
2659  */
2660  maxoff = PageGetMaxOffsetNumber(page);
2661  for (offnum = P_FIRSTDATAKEY(opaque);
2662  offnum <= maxoff;
2663  offnum = OffsetNumberNext(offnum))
2664  {
2665  ItemId itemId = PageGetItemId(page, offnum);
2666 
2667  if (ItemIdIsDead(itemId))
2668  deletable[ndeletable++] = offnum;
2669  }
2670 
2671  if (ndeletable > 0)
2672  {
2673  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2674  insertstate->bounds_valid = false;
2675 
2676  /* Return when a page split has already been avoided */
2677  if (PageGetFreeSpace(page) >= insertstate->itemsz)
2678  return;
2679 
2680  /* Might as well assume duplicates (if checkingunique) */
2681  uniquedup = true;
2682  }
2683 
2684  /*
2685  * Some callers only want to delete LP_DEAD items. Return early for these
2686  * callers.
2687  *
2688  * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2689  * return at this point (or when we go on to try either or both of our
2690  * other strategies and they also fail). We do not bother expending a
2691  * separate write to clear it, however. Caller will definitely clear it
2692  * when it goes on to split the page (plus deduplication knows to clear
2693  * the flag when it actually modifies the page).
2694  */
2695  if (lpdeadonly)
2696  return;
2697 
2698  /*
2699  * We can get called in the checkingunique case when there is no reason to
2700  * believe that there are any duplicates on the page; we should at least
2701  * still check for LP_DEAD items. If that didn't work out, give up and
2702  * let caller split the page. Deduplication cannot be justified given
2703  * there is no reason to think that there are duplicates.
2704  */
2705  if (checkingunique && !uniquedup)
2706  return;
2707 
2708  /* Assume bounds about to be invalidated (this is almost certain now) */
2709  insertstate->bounds_valid = false;
2710 
2711  /*
2712  * Perform deduplication pass, though only when it is enabled for the
2713  * index and known to be safe (it must be an allequalimage index).
2714  */
2715  if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2716  _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
2717  insertstate->itemsz, checkingunique);
2718 }
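
To make the ordering of strategies above concrete, here is a minimal standalone sketch of the first pass only, using an invented toy_lp struct and made-up sizes rather than real line pointers: collect everything already marked dead, "delete" it, and only fall through to deduplication (and ultimately a page split) if the reclaimed space still cannot fit the incoming tuple.

#include <stdbool.h>
#include <stdio.h>

struct toy_lp
{
    int         size;       /* bytes the item occupies */
    bool        dead;       /* analogue of the LP_DEAD hint bit */
};

int
main(void)
{
    struct toy_lp lps[] = {{40, false}, {40, true}, {40, false}, {40, true}};
    int         deletable[4];
    int         ndeletable = 0;
    int         freespace = 20;     /* free space currently left on the "page" */
    int         newitemsz = 48;     /* size of the tuple we still need to fit */

    /* first strategy: gather items already marked dead */
    for (int off = 0; off < 4; off++)
        if (lps[off].dead)
            deletable[ndeletable++] = off;

    /* "delete" them and credit their space back to the page */
    for (int i = 0; i < ndeletable; i++)
        freespace += lps[deletable[i]].size;

    if (freespace >= newitemsz)
        printf("page split avoided: %d bytes now free\n", freespace);
    else
        printf("not enough: try deduplication next, else split the page\n");
    return 0;
}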