nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/tableam.h"
21 #include "access/transam.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "storage/lmgr.h"
25 #include "storage/predicate.h"
26 #include "storage/smgr.h"
27 
28 /* Minimum tree height for application of fastpath optimization */
29 #define BTREE_FASTPATH_MIN_LEVEL 2
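/*
 * (The rightmost-leaf cache is only worth maintaining once
 * _bt_getrootheight() reports at least this many levels; in a smaller
 * index the root page is the leaf page, or nearly so, and there is
 * nothing to gain.  See the cachedBlock handling at the end of
 * _bt_insertonpg().)
 */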
30 
31 
32 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
33 
34 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
35  Relation heapRel,
36  IndexUniqueCheck checkUnique, bool *is_unique,
37  uint32 *speculativeToken);
38 static OffsetNumber _bt_findinsertloc(Relation rel,
39  BTInsertState insertstate,
40  bool checkingunique,
41  BTStack stack,
42  Relation heapRel);
43 static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
44 static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
45  Buffer buf,
46  Buffer cbuf,
47  BTStack stack,
48  IndexTuple itup,
49  OffsetNumber newitemoff,
50  bool split_only_page);
51 static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
52  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
53  IndexTuple newitem);
54 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
55  BTStack stack, bool is_root, bool is_only);
56 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
57  OffsetNumber itup_off);
58 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
59 
60 /*
61  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
62  *
63  * This routine is called by the public interface routine, btinsert.
64  * By here, itup is filled in, including the TID.
65  *
66  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
67  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
68  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
69  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
70  * don't actually insert.
71  *
72  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
73  * it must be true if the entry is known unique, else false.
74  * (In the current implementation we'll also return true after a
75  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
76  * that's just a coding artifact.)
77  */
78 bool
79 _bt_doinsert(Relation rel, IndexTuple itup,
80  IndexUniqueCheck checkUnique, Relation heapRel)
81 {
82  bool is_unique = false;
83  BTInsertStateData insertstate;
84  BTScanInsert itup_key;
85  BTStack stack = NULL;
86  Buffer buf;
87  bool fastpath;
88  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
89 
90  /* we need an insertion scan key to do our search, so build one */
91  itup_key = _bt_mkscankey(rel, itup);
92 
93  if (checkingunique)
94  {
95  if (!itup_key->anynullkeys)
96  {
97  /* No (heapkeyspace) scantid until uniqueness established */
98  itup_key->scantid = NULL;
99  }
100  else
101  {
102  /*
103  * Scan key for new tuple contains NULL key values. Bypass
104  * checkingunique steps. They are unnecessary because core code
105  * considers NULL unequal to every value, including NULL.
106  *
107  * This optimization avoids O(N^2) behavior within the
108  * _bt_findinsertloc() heapkeyspace path when a unique index has a
109  * large number of "duplicates" with NULL key values.
110  */
111  checkingunique = false;
112  /* Tuple is unique in the sense that core code cares about */
113  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
114  is_unique = true;
115  }
116  }
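 /*
  * (Example of the NULL bypass above: in a unique index on a single
  * column, the entries 1, NULL, and NULL can all coexist, because NULL
  * is considered unequal to every value, including another NULL; only a
  * second 1 would raise a unique violation.)
  */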
117 
118  /*
119  * Fill in the BTInsertState working area, to track the current page and
120  * position within the page to insert on
121  */
122  insertstate.itup = itup;
123  /* PageAddItem will MAXALIGN(), but be consistent */
124  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
125  insertstate.itup_key = itup_key;
126  insertstate.bounds_valid = false;
127  insertstate.buf = InvalidBuffer;
128 
129  /*
130  * It's very common to have an index on an auto-incremented or
131  * monotonically increasing value. In such cases, every insertion happens
132  * towards the end of the index. We try to optimize that case by caching
133  * the right-most leaf of the index. If our cached block is still the
134  * rightmost leaf, has enough free space to accommodate a new entry and
135  * the insertion key is strictly greater than the first key in this page,
136  * then we can safely conclude that the new key will be inserted in the
137  * cached block. So we simply search within the cached block and insert
138  * the key at the appropriate location. We call it a fastpath.
139  *
140  * Testing has revealed, though, that the fastpath can result in increased
141  * contention on the exclusive-lock on the rightmost leaf page. So we
142  * conditionally check if the lock is available. If it's not available
143  * then we simply abandon the fastpath and take the regular path. This
144  * makes sense because unavailability of the lock also signals that some
145  * other backend might be concurrently inserting into the page, thus
 146  * reducing our chances of finding an insertion place in this page.
147  */
148 top:
149  fastpath = false;
 150  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
151  {
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change,
159  * as far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170 
171  /*
172  * Check if the page is still the rightmost leaf page, has enough
173  * free space to accommodate the new tuple, and the insertion scan
174  * key is strictly greater than the first key on the page.
175  */
176  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
177  !P_IGNORE(lpageop) &&
178  (PageGetFreeSpace(page) > insertstate.itemsz) &&
179  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
180  _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
181  {
182  fastpath = true;
183  }
184  else
185  {
186  _bt_relbuf(rel, buf);
187 
188  /*
189  * Something did not work out. Just forget about the cached
190  * block and follow the normal path. It might be set again if
191  * the conditions are favourable.
192  */
 193  RelationSetTargetBlock(rel, InvalidBlockNumber);
194  }
195  }
196  else
197  {
198  ReleaseBuffer(buf);
199 
200  /*
201  * If someone's holding a lock, it's likely to change anyway, so
202  * don't try again until we get an updated rightmost leaf.
203  */
 204  RelationSetTargetBlock(rel, InvalidBlockNumber);
205  }
206  }
207 
208  if (!fastpath)
209  {
210  /*
211  * Find the first page containing this key. Buffer returned by
212  * _bt_search() is locked in exclusive mode.
213  */
214  stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
215  }
216 
217  insertstate.buf = buf;
218  buf = InvalidBuffer; /* insertstate.buf now owns the buffer */
219 
220  /*
221  * If we're not allowing duplicates, make sure the key isn't already in
222  * the index.
223  *
224  * NOTE: obviously, _bt_check_unique can only detect keys that are already
225  * in the index; so it cannot defend against concurrent insertions of the
226  * same key. We protect against that by means of holding a write lock on
227  * the first page the value could be on, with omitted/-inf value for the
228  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
229  * the same key must acquire a write lock on the same page, so only one
230  * would-be inserter can be making the check at one time. Furthermore,
231  * once we are past the check we hold write locks continuously until we
232  * have performed our insertion, so no later inserter can fail to see our
233  * insertion. (This requires some care in _bt_findinsertloc.)
234  *
235  * If we must wait for another xact, we release the lock while waiting,
236  * and then must start over completely.
237  *
238  * For a partial uniqueness check, we don't wait for the other xact. Just
239  * let the tuple in and return false for possibly non-unique, or true for
240  * definitely unique.
241  */
242  if (checkingunique)
243  {
244  TransactionId xwait;
245  uint32 speculativeToken;
246 
247  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
248  &is_unique, &speculativeToken);
249 
250  if (TransactionIdIsValid(xwait))
251  {
252  /* Have to wait for the other guy ... */
253  _bt_relbuf(rel, insertstate.buf);
254  insertstate.buf = InvalidBuffer;
255 
256  /*
257  * If it's a speculative insertion, wait for it to finish (ie. to
258  * go ahead with the insertion, or kill the tuple). Otherwise
259  * wait for the transaction to finish as usual.
260  */
261  if (speculativeToken)
262  SpeculativeInsertionWait(xwait, speculativeToken);
263  else
264  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
265 
266  /* start over... */
267  if (stack)
268  _bt_freestack(stack);
269  goto top;
270  }
271 
272  /* Uniqueness is established -- restore heap tid as scantid */
273  if (itup_key->heapkeyspace)
274  itup_key->scantid = &itup->t_tid;
275  }
276 
277  if (checkUnique != UNIQUE_CHECK_EXISTING)
278  {
279  OffsetNumber newitemoff;
280 
281  /*
282  * The only conflict predicate locking cares about for indexes is when
283  * an index tuple insert conflicts with an existing lock. We don't
284  * know the actual page we're going to insert on for sure just yet in
285  * checkingunique and !heapkeyspace cases, but it's okay to use the
286  * first page the value could be on (with scantid omitted) instead.
287  */
 288  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
289 
290  /*
291  * Do the insertion. Note that insertstate contains cached binary
292  * search bounds established within _bt_check_unique when insertion is
293  * checkingunique.
294  */
295  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
296  stack, heapRel);
297  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
298  itup, newitemoff, false);
299  }
300  else
301  {
302  /* just release the buffer */
303  _bt_relbuf(rel, insertstate.buf);
304  }
305 
306  /* be tidy */
307  if (stack)
308  _bt_freestack(stack);
309  pfree(itup_key);
310 
311  return is_unique;
312 }
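
/*
 * For context, a minimal sketch of the caller side (paraphrased from
 * btinsert() in src/backend/access/nbtree/nbtree.c; not part of this file,
 * and the helper name below is invented for illustration): the public
 * entry point forms the index tuple, stamps it with the heap TID, and
 * hands the uniqueness-checking mode straight to _bt_doinsert().
 */
static bool
btinsert_sketch(Relation rel, Datum *values, bool *isnull,
                ItemPointer ht_ctid, Relation heapRel,
                IndexUniqueCheck checkUnique)
{
    IndexTuple  itup;
    bool        result;

    /* build an index tuple from the datum array and attach the heap TID */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    /* _bt_doinsert() handles uniqueness checking and the actual insertion */
    result = _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);
    return result;
}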
313 
314 /*
315  * _bt_check_unique() -- Check for violation of unique index constraint
316  *
317  * Returns InvalidTransactionId if there is no conflict, else an xact ID
318  * we must wait for to see if it commits a conflicting tuple. If an actual
319  * conflict is detected, no return --- just ereport(). If an xact ID is
320  * returned, and the conflicting tuple still has a speculative insertion in
321  * progress, *speculativeToken is set to non-zero, and the caller can wait for
322  * the verdict on the insertion using SpeculativeInsertionWait().
323  *
324  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
325  * InvalidTransactionId because we don't want to wait. In this case we
326  * set *is_unique to false if there is a potential conflict, and the
327  * core code must redo the uniqueness check later.
328  *
329  * As a side-effect, sets state in insertstate that can later be used by
330  * _bt_findinsertloc() to reuse most of the binary search work we do
331  * here.
332  *
333  * Do not call here when there are NULL values in scan key. NULL should be
334  * considered unequal to NULL when checking for duplicates, but we are not
335  * prepared to handle that correctly.
336  */
337 static TransactionId
 338 _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
339  IndexUniqueCheck checkUnique, bool *is_unique,
340  uint32 *speculativeToken)
341 {
342  IndexTuple itup = insertstate->itup;
343  BTScanInsert itup_key = insertstate->itup_key;
344  SnapshotData SnapshotDirty;
345  OffsetNumber offset;
346  OffsetNumber maxoff;
347  Page page;
348  BTPageOpaque opaque;
349  Buffer nbuf = InvalidBuffer;
350  bool found = false;
351 
352  /* Assume unique until we find a duplicate */
353  *is_unique = true;
354 
355  InitDirtySnapshot(SnapshotDirty);
356 
357  page = BufferGetPage(insertstate->buf);
358  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
359  maxoff = PageGetMaxOffsetNumber(page);
360 
361  /*
362  * Find the first tuple with the same key.
363  *
364  * This also saves the binary search bounds in insertstate. We use them
365  * in the fastpath below, but also in the _bt_findinsertloc() call later.
366  */
367  Assert(!insertstate->bounds_valid);
368  offset = _bt_binsrch_insert(rel, insertstate);
369 
370  /*
371  * Scan over all equal tuples, looking for live conflicts.
372  */
373  Assert(!insertstate->bounds_valid || insertstate->low == offset);
374  Assert(!itup_key->anynullkeys);
375  Assert(itup_key->scantid == NULL);
376  for (;;)
377  {
378  ItemId curitemid;
379  IndexTuple curitup;
380  BlockNumber nblkno;
381 
382  /*
383  * make sure the offset points to an actual item before trying to
384  * examine it...
385  */
386  if (offset <= maxoff)
387  {
388  /*
389  * Fastpath: In most cases, we can use cached search bounds to
390  * limit our consideration to items that are definitely
391  * duplicates. This fastpath doesn't apply when the original page
392  * is empty, or when initial offset is past the end of the
393  * original page, which may indicate that we need to examine a
394  * second or subsequent page.
395  *
396  * Note that this optimization allows us to avoid calling
397  * _bt_compare() directly when there are no duplicates, as long as
398  * the offset where the key will go is not at the end of the page.
399  */
400  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
401  {
402  Assert(insertstate->bounds_valid);
403  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
404  Assert(insertstate->low <= insertstate->stricthigh);
405  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
406  break;
407  }
408 
409  curitemid = PageGetItemId(page, offset);
410 
411  /*
412  * We can skip items that are marked killed.
413  *
414  * In the presence of heavy update activity an index may contain
415  * many killed items with the same key; running _bt_compare() on
416  * each killed item gets expensive. Just advance over killed
417  * items as quickly as we can. We only apply _bt_compare() when
418  * we get to a non-killed item. Even those comparisons could be
419  * avoided (in the common case where there is only one page to
420  * visit) by reusing bounds, but just skipping dead items is fast
421  * enough.
422  */
423  if (!ItemIdIsDead(curitemid))
424  {
425  ItemPointerData htid;
426  bool all_dead;
427 
428  if (_bt_compare(rel, itup_key, page, offset) != 0)
429  break; /* we're past all the equal tuples */
430 
431  /* okay, we gotta fetch the heap tuple ... */
432  curitup = (IndexTuple) PageGetItem(page, curitemid);
433  htid = curitup->t_tid;
434 
435  /*
436  * If we are doing a recheck, we expect to find the tuple we
437  * are rechecking. It's not a duplicate, but we have to keep
438  * scanning.
439  */
440  if (checkUnique == UNIQUE_CHECK_EXISTING &&
441  ItemPointerCompare(&htid, &itup->t_tid) == 0)
442  {
443  found = true;
444  }
445 
446  /*
 447  * Check if there are any table tuples for this index entry
448  * satisfying SnapshotDirty. This is necessary because for AMs
449  * with optimizations like heap's HOT, we have just a single
450  * index entry for the entire chain.
451  */
452  else if (table_index_fetch_tuple_check(heapRel, &htid,
453  &SnapshotDirty,
454  &all_dead))
455  {
456  TransactionId xwait;
457 
458  /*
459  * It is a duplicate. If we are only doing a partial
460  * check, then don't bother checking if the tuple is being
461  * updated in another transaction. Just return the fact
462  * that it is a potential conflict and leave the full
463  * check till later. Don't invalidate binary search
464  * bounds.
465  */
466  if (checkUnique == UNIQUE_CHECK_PARTIAL)
467  {
468  if (nbuf != InvalidBuffer)
469  _bt_relbuf(rel, nbuf);
470  *is_unique = false;
471  return InvalidTransactionId;
472  }
473 
474  /*
475  * If this tuple is being updated by other transaction
476  * then we have to wait for its commit/abort.
477  */
478  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
479  SnapshotDirty.xmin : SnapshotDirty.xmax;
480 
481  if (TransactionIdIsValid(xwait))
482  {
483  if (nbuf != InvalidBuffer)
484  _bt_relbuf(rel, nbuf);
485  /* Tell _bt_doinsert to wait... */
486  *speculativeToken = SnapshotDirty.speculativeToken;
487  /* Caller releases lock on buf immediately */
488  insertstate->bounds_valid = false;
489  return xwait;
490  }
491 
492  /*
493  * Otherwise we have a definite conflict. But before
494  * complaining, look to see if the tuple we want to insert
495  * is itself now committed dead --- if so, don't complain.
496  * This is a waste of time in normal scenarios but we must
497  * do it to support CREATE INDEX CONCURRENTLY.
498  *
499  * We must follow HOT-chains here because during
500  * concurrent index build, we insert the root TID though
501  * the actual tuple may be somewhere in the HOT-chain.
502  * While following the chain we might not stop at the
503  * exact tuple which triggered the insert, but that's OK
504  * because if we find a live tuple anywhere in this chain,
505  * we have a unique key conflict. The other live tuple is
506  * not part of this chain because it had a different index
507  * entry.
508  */
509  htid = itup->t_tid;
510  if (table_index_fetch_tuple_check(heapRel, &htid,
511  SnapshotSelf, NULL))
512  {
513  /* Normal case --- it's still live */
514  }
515  else
516  {
517  /*
518  * It's been deleted, so no error, and no need to
519  * continue searching
520  */
521  break;
522  }
523 
524  /*
525  * Check for a conflict-in as we would if we were going to
526  * write to this page. We aren't actually going to write,
527  * but we want a chance to report SSI conflicts that would
528  * otherwise be masked by this unique constraint
529  * violation.
530  */
531  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
532 
533  /*
534  * This is a definite conflict. Break the tuple down into
535  * datums and report the error. But first, make sure we
536  * release the buffer locks we're holding ---
537  * BuildIndexValueDescription could make catalog accesses,
538  * which in the worst case might touch this same index and
539  * cause deadlocks.
540  */
541  if (nbuf != InvalidBuffer)
542  _bt_relbuf(rel, nbuf);
543  _bt_relbuf(rel, insertstate->buf);
544  insertstate->buf = InvalidBuffer;
545  insertstate->bounds_valid = false;
546 
547  {
 548  Datum values[INDEX_MAX_KEYS];
549  bool isnull[INDEX_MAX_KEYS];
550  char *key_desc;
551 
 552  index_deform_tuple(itup, RelationGetDescr(rel),
553  values, isnull);
554 
555  key_desc = BuildIndexValueDescription(rel, values,
556  isnull);
557 
558  ereport(ERROR,
559  (errcode(ERRCODE_UNIQUE_VIOLATION),
560  errmsg("duplicate key value violates unique constraint \"%s\"",
 561  RelationGetRelationName(rel)),
562  key_desc ? errdetail("Key %s already exists.",
563  key_desc) : 0,
564  errtableconstraint(heapRel,
565  RelationGetRelationName(rel))));
566  }
567  }
568  else if (all_dead)
569  {
570  /*
571  * The conflicting tuple (or whole HOT chain) is dead to
572  * everyone, so we may as well mark the index entry
573  * killed.
574  */
575  ItemIdMarkDead(curitemid);
576  opaque->btpo_flags |= BTP_HAS_GARBAGE;
577 
578  /*
579  * Mark buffer with a dirty hint, since state is not
580  * crucial. Be sure to mark the proper buffer dirty.
581  */
582  if (nbuf != InvalidBuffer)
583  MarkBufferDirtyHint(nbuf, true);
584  else
585  MarkBufferDirtyHint(insertstate->buf, true);
586  }
587  }
588  }
589 
590  /*
591  * Advance to next tuple to continue checking.
592  */
593  if (offset < maxoff)
594  offset = OffsetNumberNext(offset);
595  else
596  {
597  int highkeycmp;
598 
599  /* If scankey == hikey we gotta check the next page too */
600  if (P_RIGHTMOST(opaque))
601  break;
602  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
603  Assert(highkeycmp <= 0);
604  if (highkeycmp != 0)
605  break;
606  /* Advance to next non-dead page --- there must be one */
607  for (;;)
608  {
609  nblkno = opaque->btpo_next;
610  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
611  page = BufferGetPage(nbuf);
612  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
613  if (!P_IGNORE(opaque))
614  break;
615  if (P_RIGHTMOST(opaque))
616  elog(ERROR, "fell off the end of index \"%s\"",
 617  RelationGetRelationName(rel));
618  }
619  maxoff = PageGetMaxOffsetNumber(page);
620  offset = P_FIRSTDATAKEY(opaque);
621  /* Don't invalidate binary search bounds */
622  }
623  }
624 
625  /*
626  * If we are doing a recheck then we should have found the tuple we are
627  * checking. Otherwise there's something very wrong --- probably, the
628  * index is on a non-immutable expression.
629  */
630  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
631  ereport(ERROR,
632  (errcode(ERRCODE_INTERNAL_ERROR),
633  errmsg("failed to re-find tuple within index \"%s\"",
 634  RelationGetRelationName(rel)),
635  errhint("This may be because of a non-immutable index expression."),
636  errtableconstraint(heapRel,
637  RelationGetRelationName(rel))));
638 
639  if (nbuf != InvalidBuffer)
640  _bt_relbuf(rel, nbuf);
641 
642  return InvalidTransactionId;
643 }
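
/*
 * (Summary of the contract above: InvalidTransactionId means no live
 * conflicting tuple was found; a valid xid with *speculativeToken == 0
 * means the caller must wait for that transaction and retry; a valid xid
 * with a nonzero token means the caller need only wait for the verdict on
 * the speculative insertion.  A definite duplicate never returns here at
 * all: it ereports a unique violation instead, unless checkUnique is
 * UNIQUE_CHECK_PARTIAL.)
 */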
644 
645 
646 /*
647  * _bt_findinsertloc() -- Finds an insert location for a tuple
648  *
649  * On entry, insertstate buffer contains the page the new tuple belongs
650  * on. It is exclusive-locked and pinned by the caller.
651  *
652  * If 'checkingunique' is true, the buffer on entry is the first page
653  * that contains duplicates of the new key. If there are duplicates on
654  * multiple pages, the correct insertion position might be some page to
655  * the right, rather than the first page. In that case, this function
656  * moves right to the correct target page.
657  *
658  * (In a !heapkeyspace index, there can be multiple pages with the same
 659  * high key, on which the new tuple could legitimately be placed. In
660  * that case, the caller passes the first page containing duplicates,
661  * just like when checkingunique=true. If that page doesn't have enough
662  * room for the new tuple, this function moves right, trying to find a
663  * legal page that does.)
664  *
665  * On exit, insertstate buffer contains the chosen insertion page, and
666  * the offset within that page is returned. If _bt_findinsertloc needed
667  * to move right, the lock and pin on the original page are released, and
668  * the new buffer is exclusively locked and pinned instead.
669  *
670  * If insertstate contains cached binary search bounds, we will take
671  * advantage of them. This avoids repeating comparisons that we made in
672  * _bt_check_unique() already.
673  *
674  * If there is not enough room on the page for the new tuple, we try to
675  * make room by removing any LP_DEAD tuples.
676  */
677 static OffsetNumber
 678 _bt_findinsertloc(Relation rel,
679  BTInsertState insertstate,
680  bool checkingunique,
681  BTStack stack,
682  Relation heapRel)
683 {
684  BTScanInsert itup_key = insertstate->itup_key;
685  Page page = BufferGetPage(insertstate->buf);
686  BTPageOpaque lpageop;
687 
688  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
689 
690  /* Check 1/3 of a page restriction */
691  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
692  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
693  insertstate->itup);
694 
695  Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
696  Assert(!insertstate->bounds_valid || checkingunique);
697  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
698  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
699 
700  if (itup_key->heapkeyspace)
701  {
702  /*
703  * If we're inserting into a unique index, we may have to walk right
704  * through leaf pages to find the one leaf page that we must insert on
705  * to.
706  *
707  * This is needed for checkingunique callers because a scantid was not
708  * used when we called _bt_search(). scantid can only be set after
709  * _bt_check_unique() has checked for duplicates. The buffer
710  * initially stored in insertstate->buf has the page where the first
711  * duplicate key might be found, which isn't always the page that new
712  * tuple belongs on. The heap TID attribute for new tuple (scantid)
713  * could force us to insert on a sibling page, though that should be
714  * very rare in practice.
715  */
716  if (checkingunique)
717  {
718  for (;;)
719  {
720  /*
721  * Does the new tuple belong on this page?
722  *
723  * The earlier _bt_check_unique() call may well have
724  * established a strict upper bound on the offset for the new
725  * item. If it's not the last item of the page (i.e. if there
726  * is at least one tuple on the page that goes after the tuple
727  * we're inserting) then we know that the tuple belongs on
728  * this page. We can skip the high key check.
729  */
730  if (insertstate->bounds_valid &&
731  insertstate->low <= insertstate->stricthigh &&
732  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
733  break;
734 
735  /* Test '<=', not '!=', since scantid is set now */
736  if (P_RIGHTMOST(lpageop) ||
737  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
738  break;
739 
740  _bt_stepright(rel, insertstate, stack);
741  /* Update local state after stepping right */
742  page = BufferGetPage(insertstate->buf);
743  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
744  }
745  }
746 
747  /*
748  * If the target page is full, see if we can obtain enough space by
749  * erasing LP_DEAD items
750  */
751  if (PageGetFreeSpace(page) < insertstate->itemsz &&
752  P_HAS_GARBAGE(lpageop))
753  {
754  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
755  insertstate->bounds_valid = false;
756  }
757  }
758  else
759  {
760  /*----------
761  * This is a !heapkeyspace (version 2 or 3) index. The current page
762  * is the first page that we could insert the new tuple to, but there
763  * may be other pages to the right that we could opt to use instead.
764  *
765  * If the new key is equal to one or more existing keys, we can
766  * legitimately place it anywhere in the series of equal keys. In
767  * fact, if the new key is equal to the page's "high key" we can place
768  * it on the next page. If it is equal to the high key, and there's
769  * not room to insert the new tuple on the current page without
770  * splitting, then we move right hoping to find more free space and
771  * avoid a split.
772  *
773  * Keep scanning right until we
774  * (a) find a page with enough free space,
775  * (b) reach the last page where the tuple can legally go, or
776  * (c) get tired of searching.
777  * (c) is not flippant; it is important because if there are many
778  * pages' worth of equal keys, it's better to split one of the early
779  * pages than to scan all the way to the end of the run of equal keys
780  * on every insert. We implement "get tired" as a random choice,
781  * since stopping after scanning a fixed number of pages wouldn't work
782  * well (we'd never reach the right-hand side of previously split
783  * pages). The probability of moving right is set at 0.99, which may
784  * seem too high to change the behavior much, but it does an excellent
785  * job of preventing O(N^2) behavior with many equal keys.
786  *----------
787  */
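  /*
   * (Sanity check on the 0.99 figure: the number of pages visited before
   * "getting tired" is geometrically distributed, so even a very long run
   * of equal keys is expected to be walked for only about
   * 1 / (1 - 0.99) = 100 pages per insertion.)
   */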
788  while (PageGetFreeSpace(page) < insertstate->itemsz)
789  {
790  /*
791  * Before considering moving right, see if we can obtain enough
792  * space by erasing LP_DEAD items
793  */
794  if (P_HAS_GARBAGE(lpageop))
795  {
796  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
797  insertstate->bounds_valid = false;
798 
799  if (PageGetFreeSpace(page) >= insertstate->itemsz)
800  break; /* OK, now we have enough space */
801  }
802 
803  /*
804  * Nope, so check conditions (b) and (c) enumerated above
805  *
806  * The earlier _bt_check_unique() call may well have established a
807  * strict upper bound on the offset for the new item. If it's not
808  * the last item of the page (i.e. if there is at least one tuple
809  * on the page that's greater than the tuple we're inserting to)
810  * then we know that the tuple belongs on this page. We can skip
811  * the high key check.
812  */
813  if (insertstate->bounds_valid &&
814  insertstate->low <= insertstate->stricthigh &&
815  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
816  break;
817 
818  if (P_RIGHTMOST(lpageop) ||
819  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
820  random() <= (MAX_RANDOM_VALUE / 100))
821  break;
822 
823  _bt_stepright(rel, insertstate, stack);
824  /* Update local state after stepping right */
825  page = BufferGetPage(insertstate->buf);
826  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
827  }
828  }
829 
830  /*
831  * We should now be on the correct page. Find the offset within the page
832  * for the new tuple. (Possibly reusing earlier search bounds.)
833  */
834  Assert(P_RIGHTMOST(lpageop) ||
835  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
836 
837  return _bt_binsrch_insert(rel, insertstate);
838 }
839 
840 /*
841  * Step right to next non-dead page, during insertion.
842  *
843  * This is a bit more complicated than moving right in a search. We must
844  * write-lock the target page before releasing write lock on current page;
845  * else someone else's _bt_check_unique scan could fail to see our insertion.
846  * Write locks on intermediate dead pages won't do because we don't know when
847  * they will get de-linked from the tree.
848  *
849  * This is more aggressive than it needs to be for non-unique !heapkeyspace
850  * indexes.
851  */
852 static void
853 _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
854 {
855  Page page;
856  BTPageOpaque lpageop;
857  Buffer rbuf;
858  BlockNumber rblkno;
859 
860  page = BufferGetPage(insertstate->buf);
861  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
862 
863  rbuf = InvalidBuffer;
864  rblkno = lpageop->btpo_next;
865  for (;;)
866  {
867  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
868  page = BufferGetPage(rbuf);
869  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
870 
871  /*
872  * If this page was incompletely split, finish the split now. We do
873  * this while holding a lock on the left sibling, which is not good
874  * because finishing the split could be a fairly lengthy operation.
875  * But this should happen very seldom.
876  */
877  if (P_INCOMPLETE_SPLIT(lpageop))
878  {
879  _bt_finish_split(rel, rbuf, stack);
880  rbuf = InvalidBuffer;
881  continue;
882  }
883 
884  if (!P_IGNORE(lpageop))
885  break;
886  if (P_RIGHTMOST(lpageop))
887  elog(ERROR, "fell off the end of index \"%s\"",
 888  RelationGetRelationName(rel));
889 
890  rblkno = lpageop->btpo_next;
891  }
892  /* rbuf locked; unlock buf, update state for caller */
893  _bt_relbuf(rel, insertstate->buf);
894  insertstate->buf = rbuf;
895  insertstate->bounds_valid = false;
896 }
897 
898 /*----------
899  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
900  *
901  * This recursive procedure does the following things:
902  *
903  * + if necessary, splits the target page, using 'itup_key' for
904  * suffix truncation on leaf pages (caller passes NULL for
905  * non-leaf pages).
906  * + inserts the tuple.
907  * + if the page was split, pops the parent stack, and finds the
908  * right place to insert the new child pointer (by walking
909  * right using information stored in the parent stack).
910  * + invokes itself with the appropriate tuple for the right
911  * child page on the parent.
912  * + updates the metapage if a true root or fast root is split.
913  *
914  * On entry, we must have the correct buffer in which to do the
915  * insertion, and the buffer must be pinned and write-locked. On return,
916  * we will have dropped both the pin and the lock on the buffer.
917  *
918  * This routine only performs retail tuple insertions. 'itup' should
919  * always be either a non-highkey leaf item, or a downlink (new high
920  * key items are created indirectly, when a page is split). When
921  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
922  * we're inserting the downlink for. This function will clear the
923  * INCOMPLETE_SPLIT flag on it, and release the buffer.
924  *----------
925  */
926 static void
 927 _bt_insertonpg(Relation rel,
928  BTScanInsert itup_key,
929  Buffer buf,
930  Buffer cbuf,
931  BTStack stack,
932  IndexTuple itup,
933  OffsetNumber newitemoff,
934  bool split_only_page)
935 {
936  Page page;
937  BTPageOpaque lpageop;
938  Size itemsz;
939 
940  page = BufferGetPage(buf);
941  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
942 
943  /* child buffer must be given iff inserting on an internal page */
944  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
945  /* tuple must have appropriate number of attributes */
946  Assert(!P_ISLEAF(lpageop) ||
947  BTreeTupleGetNAtts(itup, rel) ==
 948  IndexRelationGetNumberOfAttributes(rel));
949  Assert(P_ISLEAF(lpageop) ||
950  BTreeTupleGetNAtts(itup, rel) <=
 951  IndexRelationGetNumberOfKeyAttributes(rel));
952 
953  /* The caller should've finished any incomplete splits already. */
954  if (P_INCOMPLETE_SPLIT(lpageop))
955  elog(ERROR, "cannot insert to incompletely split page %u",
956  BufferGetBlockNumber(buf));
957 
958  itemsz = IndexTupleSize(itup);
959  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
960  * need to be consistent */
961 
962  /*
963  * Do we need to split the page to fit the item on it?
964  *
965  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
966  * so this comparison is correct even though we appear to be accounting
967  * only for the item and not for its line pointer.
968  */
969  if (PageGetFreeSpace(page) < itemsz)
970  {
971  bool is_root = P_ISROOT(lpageop);
972  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
973  Buffer rbuf;
974 
975  /*
976  * If we're here then a pagesplit is needed. We should never reach
977  * here if we're using the fastpath since we should have checked for
978  * all the required conditions, including the fact that this page has
979  * enough freespace. Note that this routine can in theory deal with
980  * the situation where a NULL stack pointer is passed (that's what
981  * would happen if the fastpath is taken). But that path is much
982  * slower, defeating the very purpose of the optimization. The
983  * following assertion should protect us from any future code changes
984  * that invalidate those assumptions.
985  *
986  * Note that whenever we fail to take the fastpath, we clear the
987  * cached block. Checking for a valid cached block at this point is
988  * enough to decide whether we're in a fastpath or not.
989  */
990  Assert(!(P_ISLEAF(lpageop) &&
 991  BlockNumberIsValid(RelationGetTargetBlock(rel))));
992 
993  /* split the buffer into left and right halves */
994  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
 995  PredicateLockPageSplit(rel,
 996  BufferGetBlockNumber(buf),
997  BufferGetBlockNumber(rbuf));
998 
999  /*----------
1000  * By here,
1001  *
1002  * + our target page has been split;
1003  * + the original tuple has been inserted;
1004  * + we have write locks on both the old (left half)
1005  * and new (right half) buffers, after the split; and
1006  * + we know the key we want to insert into the parent
1007  * (it's the "high key" on the left child page).
1008  *
1009  * We're ready to do the parent insertion. We need to hold onto the
1010  * locks for the child pages until we locate the parent, but we can
1011  * at least release the lock on the right child before doing the
1012  * actual insertion. The lock on the left child will be released
1013  * last of all by parent insertion, where it is the 'cbuf' of parent
1014  * page.
1015  *----------
1016  */
1017  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
1018  }
1019  else
1020  {
1021  Buffer metabuf = InvalidBuffer;
1022  Page metapg = NULL;
1023  BTMetaPageData *metad = NULL;
1024  OffsetNumber itup_off;
1025  BlockNumber itup_blkno;
1026  BlockNumber cachedBlock = InvalidBlockNumber;
1027 
1028  itup_off = newitemoff;
1029  itup_blkno = BufferGetBlockNumber(buf);
1030 
1031  /*
1032  * If we are doing this insert because we split a page that was the
1033  * only one on its tree level, but was not the root, it may have been
1034  * the "fast root". We need to ensure that the fast root link points
1035  * at or above the current page. We can safely acquire a lock on the
1036  * metapage here --- see comments for _bt_newroot().
1037  */
1038  if (split_only_page)
1039  {
1040  Assert(!P_ISLEAF(lpageop));
1041 
1042  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1043  metapg = BufferGetPage(metabuf);
1044  metad = BTPageGetMeta(metapg);
1045 
1046  if (metad->btm_fastlevel >= lpageop->btpo.level)
1047  {
1048  /* no update wanted */
1049  _bt_relbuf(rel, metabuf);
1050  metabuf = InvalidBuffer;
1051  }
1052  }
1053 
1054  /*
1055  * Every internal page should have exactly one negative infinity item
1056  * at all times. Only _bt_split() and _bt_newroot() should add items
1057  * that become negative infinity items through truncation, since
1058  * they're the only routines that allocate new internal pages. Do not
1059  * allow a retail insertion of a new item at the negative infinity
1060  * offset.
1061  */
1062  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
1063  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
1064  itup_blkno, RelationGetRelationName(rel));
1065 
1066  /* Do the update. No ereport(ERROR) until changes are logged */
 1067  START_CRIT_SECTION();
1068 
1069  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
1070  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1071  itup_blkno, RelationGetRelationName(rel));
1072 
1073  MarkBufferDirty(buf);
1074 
1075  if (BufferIsValid(metabuf))
1076  {
1077  /* upgrade meta-page if needed */
1078  if (metad->btm_version < BTREE_NOVAC_VERSION)
1079  _bt_upgrademetapage(metapg);
1080  metad->btm_fastroot = itup_blkno;
1081  metad->btm_fastlevel = lpageop->btpo.level;
1082  MarkBufferDirty(metabuf);
1083  }
1084 
1085  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
1086  if (BufferIsValid(cbuf))
1087  {
1088  Page cpage = BufferGetPage(cbuf);
1089  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1090 
1091  Assert(P_INCOMPLETE_SPLIT(cpageop));
1092  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1093  MarkBufferDirty(cbuf);
1094  }
1095 
1096  /*
1097  * Cache the block information if we just inserted into the rightmost
 1098  * leaf page of the index and it's not the root page. For a very small
 1099  * index where the root is also the leaf, there is no point trying for any
1100  * optimization.
1101  */
1102  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1103  cachedBlock = BufferGetBlockNumber(buf);
1104 
1105  /* XLOG stuff */
1106  if (RelationNeedsWAL(rel))
1107  {
1108  xl_btree_insert xlrec;
1109  xl_btree_metadata xlmeta;
1110  uint8 xlinfo;
1111  XLogRecPtr recptr;
1112 
1113  xlrec.offnum = itup_off;
1114 
1115  XLogBeginInsert();
1116  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1117 
1118  if (P_ISLEAF(lpageop))
1119  xlinfo = XLOG_BTREE_INSERT_LEAF;
1120  else
1121  {
1122  /*
1123  * Register the left child whose INCOMPLETE_SPLIT flag was
1124  * cleared.
1125  */
 1126  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1127 
1128  xlinfo = XLOG_BTREE_INSERT_UPPER;
1129  }
1130 
1131  if (BufferIsValid(metabuf))
1132  {
 1133  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1134  xlmeta.version = metad->btm_version;
1135  xlmeta.root = metad->btm_root;
1136  xlmeta.level = metad->btm_level;
1137  xlmeta.fastroot = metad->btm_fastroot;
1138  xlmeta.fastlevel = metad->btm_fastlevel;
1139  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
 1140  xlmeta.last_cleanup_num_heap_tuples =
 1141  metad->btm_last_cleanup_num_heap_tuples;
1142 
 1143  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
1144  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1145 
1146  xlinfo = XLOG_BTREE_INSERT_META;
1147  }
1148 
 1149  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1150  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1151 
1152  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1153 
1154  if (BufferIsValid(metabuf))
1155  {
1156  PageSetLSN(metapg, recptr);
1157  }
1158  if (BufferIsValid(cbuf))
1159  {
1160  PageSetLSN(BufferGetPage(cbuf), recptr);
1161  }
1162 
1163  PageSetLSN(page, recptr);
1164  }
1165 
1166  END_CRIT_SECTION();
1167 
1168  /* release buffers */
1169  if (BufferIsValid(metabuf))
1170  _bt_relbuf(rel, metabuf);
1171  if (BufferIsValid(cbuf))
1172  _bt_relbuf(rel, cbuf);
1173  _bt_relbuf(rel, buf);
1174 
1175  /*
1176  * If we decided to cache the insertion target block, then set it now.
1177  * But before that, check for the height of the tree and don't go for
1178  * the optimization for small indexes. We defer that check to this
1179  * point to ensure that we don't call _bt_getrootheight while holding
1180  * lock on any other block.
1181  *
1182  * We do this after dropping locks on all buffers. So the information
1183  * about whether the insertion block is still the rightmost block or
1184  * not may have changed in between. But we will deal with that during
1185  * next insert operation. No special care is required while setting
1186  * it.
1187  */
1188  if (BlockNumberIsValid(cachedBlock) &&
 1189  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1190  RelationSetTargetBlock(rel, cachedBlock);
1191  }
1192 }
1193 
1194 /*
1195  * _bt_split() -- split a page in the btree.
1196  *
1197  * On entry, buf is the page to split, and is pinned and write-locked.
1198  * newitemoff etc. tell us about the new item that must be inserted
1199  * along with the data from the original page.
1200  *
1201  * itup_key is used for suffix truncation on leaf pages (internal
1202  * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1203  * is the left-sibling of the page we're inserting the downlink for.
1204  * This function will clear the INCOMPLETE_SPLIT flag on it, and
1205  * release the buffer.
1206  *
1207  * Returns the new right sibling of buf, pinned and write-locked.
1208  * The pin and lock on buf are maintained.
1209  */
1210 static Buffer
 1211 _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
1212  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
1213 {
1214  Buffer rbuf;
1215  Page origpage;
1216  Page leftpage,
1217  rightpage;
1218  BlockNumber origpagenumber,
1219  rightpagenumber;
1220  BTPageOpaque ropaque,
1221  lopaque,
1222  oopaque;
1223  Buffer sbuf = InvalidBuffer;
1224  Page spage = NULL;
1225  BTPageOpaque sopaque = NULL;
1226  Size itemsz;
1227  ItemId itemid;
1228  IndexTuple item;
1229  OffsetNumber leftoff,
1230  rightoff;
1231  OffsetNumber firstright;
1232  OffsetNumber maxoff;
1233  OffsetNumber i;
1234  bool newitemonleft,
1235  isleaf;
1236  IndexTuple lefthikey;
1237  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1238  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1239 
1240  /*
1241  * origpage is the original page to be split. leftpage is a temporary
1242  * buffer that receives the left-sibling data, which will be copied back
1243  * into origpage on success. rightpage is the new page that will receive
1244  * the right-sibling data.
1245  *
1246  * leftpage is allocated after choosing a split point. rightpage's new
1247  * buffer isn't acquired until after leftpage is initialized and has new
1248  * high key, the last point where splitting the page may fail (barring
1249  * corruption). Failing before acquiring new buffer won't have lasting
1250  * consequences, since origpage won't have been modified and leftpage is
1251  * only workspace.
1252  */
1253  origpage = BufferGetPage(buf);
1254  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1255  origpagenumber = BufferGetBlockNumber(buf);
1256 
1257  /*
1258  * Choose a point to split origpage at.
1259  *
1260  * A split point can be thought of as a point _between_ two existing
1261  * tuples on origpage (lastleft and firstright tuples), provided you
1262  * pretend that the new item that didn't fit is already on origpage.
1263  *
1264  * Since origpage does not actually contain newitem, the representation of
1265  * split points needs to work with two boundary cases: splits where
1266  * newitem is lastleft, and splits where newitem is firstright.
1267  * newitemonleft resolves the ambiguity that would otherwise exist when
1268  * newitemoff == firstright. In all other cases it's clear which side of
1269  * the split every tuple goes on from context. newitemonleft is usually
1270  * (but not always) redundant information.
1271  */
1272  firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1273  newitem, &newitemonleft);
1274 
1275  /* Allocate temp buffer for leftpage */
1276  leftpage = PageGetTempPage(origpage);
1277  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1278  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1279 
1280  /*
1281  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1282  * and HAS_GARBAGE flags.
1283  */
1284  lopaque->btpo_flags = oopaque->btpo_flags;
1285  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1286  /* set flag in leftpage indicating that rightpage has no downlink yet */
1287  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1288  lopaque->btpo_prev = oopaque->btpo_prev;
1289  /* handle btpo_next after rightpage buffer acquired */
1290  lopaque->btpo.level = oopaque->btpo.level;
1291  /* handle btpo_cycleid after rightpage buffer acquired */
1292 
1293  /*
1294  * Copy the original page's LSN into leftpage, which will become the
1295  * updated version of the page. We need this because XLogInsert will
1296  * examine the LSN and possibly dump it in a page image.
1297  */
1298  PageSetLSN(leftpage, PageGetLSN(origpage));
1299  isleaf = P_ISLEAF(oopaque);
1300 
1301  /*
1302  * The "high key" for the new left page will be the first key that's going
1303  * to go into the new right page, or a truncated version if this is a leaf
1304  * page split.
1305  *
1306  * The high key for the left page is formed using the first item on the
1307  * right page, which may seem to be contrary to Lehman & Yao's approach of
1308  * using the left page's last item as its new high key when splitting on
1309  * the leaf level. It isn't, though: suffix truncation will leave the
1310  * left page's high key fully equal to the last item on the left page when
1311  * two tuples with equal key values (excluding heap TID) enclose the split
1312  * point. It isn't actually necessary for a new leaf high key to be equal
1313  * to the last item on the left for the L&Y "subtree" invariant to hold.
1314  * It's sufficient to make sure that the new leaf high key is strictly
1315  * less than the first item on the right leaf page, and greater than or
1316  * equal to (not necessarily equal to) the last item on the left leaf
1317  * page.
1318  *
1319  * In other words, when suffix truncation isn't possible, L&Y's exact
1320  * approach to leaf splits is taken. (Actually, even that is slightly
1321  * inaccurate. A tuple with all the keys from firstright but the heap TID
1322  * from lastleft will be used as the new high key, since the last left
1323  * tuple could be physically larger despite being opclass-equal in respect
1324  * of all attributes prior to the heap TID attribute.)
1325  */
1326  if (!newitemonleft && newitemoff == firstright)
1327  {
1328  /* incoming tuple will become first on right page */
1329  itemsz = newitemsz;
1330  item = newitem;
1331  }
1332  else
1333  {
1334  /* existing item at firstright will become first on right page */
1335  itemid = PageGetItemId(origpage, firstright);
1336  itemsz = ItemIdGetLength(itemid);
1337  item = (IndexTuple) PageGetItem(origpage, itemid);
1338  }
1339 
1340  /*
1341  * Truncate unneeded key and non-key attributes of the high key item
1342  * before inserting it on the left page. This can only happen at the leaf
1343  * level, since in general all pivot tuple values originate from leaf
1344  * level high keys. A pivot tuple in a grandparent page must guide a
1345  * search not only to the correct parent page, but also to the correct
1346  * leaf page.
1347  */
1348  if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
1349  {
1350  IndexTuple lastleft;
1351 
1352  /*
1353  * Determine which tuple will become the last on the left page. This
1354  * is needed to decide how many attributes from the first item on the
1355  * right page must remain in new high key for left page.
1356  */
1357  if (newitemonleft && newitemoff == firstright)
1358  {
1359  /* incoming tuple will become last on left page */
1360  lastleft = newitem;
1361  }
1362  else
1363  {
1364  OffsetNumber lastleftoff;
1365 
1366  /* item just before firstright will become last on left page */
1367  lastleftoff = OffsetNumberPrev(firstright);
1368  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1369  itemid = PageGetItemId(origpage, lastleftoff);
1370  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1371  }
1372 
1373  Assert(lastleft != item);
1374  lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
1375  itemsz = IndexTupleSize(lefthikey);
1376  itemsz = MAXALIGN(itemsz);
1377  }
1378  else
1379  lefthikey = item;
1380 
1381  /*
1382  * Add new high key to leftpage
1383  */
1384  leftoff = P_HIKEY;
1385 
1386  Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
1387  Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
1388  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1389  false, false) == InvalidOffsetNumber)
1390  elog(ERROR, "failed to add hikey to the left sibling"
1391  " while splitting block %u of index \"%s\"",
1392  origpagenumber, RelationGetRelationName(rel));
1393  leftoff = OffsetNumberNext(leftoff);
1394  /* be tidy */
1395  if (lefthikey != item)
1396  pfree(lefthikey);
1397 
1398  /*
1399  * Acquire a new right page to split into, now that left page has a new
1400  * high key. From here on, it's not okay to throw an error without
1401  * zeroing rightpage first. This coding rule ensures that we won't
1402  * confuse future VACUUM operations, which might otherwise try to re-find
1403  * a downlink to a leftover junk page as the page undergoes deletion.
1404  *
1405  * It would be reasonable to start the critical section just after the new
1406  * rightpage buffer is acquired instead; that would allow us to avoid
1407  * leftover junk pages without bothering to zero rightpage. We do it this
1408  * way because it avoids an unnecessary PANIC when either origpage or its
 1409  * existing sibling page is corrupt.
1410  */
1411  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1412  rightpage = BufferGetPage(rbuf);
1413  rightpagenumber = BufferGetBlockNumber(rbuf);
1414  /* rightpage was initialized by _bt_getbuf */
1415  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1416 
1417  /*
1418  * Finish off remaining leftpage special area fields. They cannot be set
1419  * before both origpage (leftpage) and rightpage buffers are acquired and
1420  * locked.
1421  */
1422  lopaque->btpo_next = rightpagenumber;
1423  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1424 
1425  /*
1426  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1427  * and HAS_GARBAGE flags.
1428  */
1429  ropaque->btpo_flags = oopaque->btpo_flags;
1430  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1431  ropaque->btpo_prev = origpagenumber;
1432  ropaque->btpo_next = oopaque->btpo_next;
1433  ropaque->btpo.level = oopaque->btpo.level;
1434  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1435 
1436  /*
1437  * Add new high key to rightpage where necessary.
1438  *
1439  * If the page we're splitting is not the rightmost page at its level in
1440  * the tree, then the first entry on the page is the high key from
1441  * origpage.
1442  */
1443  rightoff = P_HIKEY;
1444 
1445  if (!P_RIGHTMOST(oopaque))
1446  {
1447  itemid = PageGetItemId(origpage, P_HIKEY);
1448  itemsz = ItemIdGetLength(itemid);
1449  item = (IndexTuple) PageGetItem(origpage, itemid);
1450  Assert(BTreeTupleGetNAtts(item, rel) > 0);
1451  Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
1452  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1453  false, false) == InvalidOffsetNumber)
1454  {
1455  memset(rightpage, 0, BufferGetPageSize(rbuf));
1456  elog(ERROR, "failed to add hikey to the right sibling"
1457  " while splitting block %u of index \"%s\"",
1458  origpagenumber, RelationGetRelationName(rel));
1459  }
1460  rightoff = OffsetNumberNext(rightoff);
1461  }
1462 
1463  /*
1464  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1465  * additional pivot tuples in !isleaf case) to the appropriate page.
1466  *
1467  * Note: we *must* insert at least the right page's items in item-number
1468  * order, for the benefit of _bt_restore_page().
1469  */
1470  maxoff = PageGetMaxOffsetNumber(origpage);
1471 
1472  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1473  {
1474  itemid = PageGetItemId(origpage, i);
1475  itemsz = ItemIdGetLength(itemid);
1476  item = (IndexTuple) PageGetItem(origpage, itemid);
1477 
1478  /* does new item belong before this one? */
1479  if (i == newitemoff)
1480  {
1481  if (newitemonleft)
1482  {
1483  Assert(newitemoff <= firstright);
1484  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1485  {
1486  memset(rightpage, 0, BufferGetPageSize(rbuf));
1487  elog(ERROR, "failed to add new item to the left sibling"
1488  " while splitting block %u of index \"%s\"",
1489  origpagenumber, RelationGetRelationName(rel));
1490  }
1491  leftoff = OffsetNumberNext(leftoff);
1492  }
1493  else
1494  {
1495  Assert(newitemoff >= firstright);
1496  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1497  {
1498  memset(rightpage, 0, BufferGetPageSize(rbuf));
1499  elog(ERROR, "failed to add new item to the right sibling"
1500  " while splitting block %u of index \"%s\"",
1501  origpagenumber, RelationGetRelationName(rel));
1502  }
1503  rightoff = OffsetNumberNext(rightoff);
1504  }
1505  }
1506 
1507  /* decide which page to put it on */
1508  if (i < firstright)
1509  {
1510  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1511  {
1512  memset(rightpage, 0, BufferGetPageSize(rbuf));
1513  elog(ERROR, "failed to add old item to the left sibling"
1514  " while splitting block %u of index \"%s\"",
1515  origpagenumber, RelationGetRelationName(rel));
1516  }
1517  leftoff = OffsetNumberNext(leftoff);
1518  }
1519  else
1520  {
1521  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1522  {
1523  memset(rightpage, 0, BufferGetPageSize(rbuf));
1524  elog(ERROR, "failed to add old item to the right sibling"
1525  " while splitting block %u of index \"%s\"",
1526  origpagenumber, RelationGetRelationName(rel));
1527  }
1528  rightoff = OffsetNumberNext(rightoff);
1529  }
1530  }
1531 
1532  /* cope with possibility that newitem goes at the end */
1533  if (i <= newitemoff)
1534  {
1535  /*
1536  * Can't have newitemonleft here; that would imply we were told to put
1537  * *everything* on the left page, which cannot fit (if it could, we'd
1538  * not be splitting the page).
1539  */
1540  Assert(!newitemonleft);
1541  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1542  {
1543  memset(rightpage, 0, BufferGetPageSize(rbuf));
1544  elog(ERROR, "failed to add new item to the right sibling"
1545  " while splitting block %u of index \"%s\"",
1546  origpagenumber, RelationGetRelationName(rel));
1547  }
1548  rightoff = OffsetNumberNext(rightoff);
1549  }
1550 
1551  /*
1552  * We have to grab the right sibling (if any) and fix the prev pointer
1553  * there. We are guaranteed that this is deadlock-free since no other
1554  * writer will be holding a lock on that page and trying to move left, and
1555  * all readers release locks on a page before trying to fetch its
1556  * neighbors.
1557  */
1558  if (!P_RIGHTMOST(oopaque))
1559  {
1560  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1561  spage = BufferGetPage(sbuf);
1562  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1563  if (sopaque->btpo_prev != origpagenumber)
1564  {
1565  memset(rightpage, 0, BufferGetPageSize(rbuf));
1566  ereport(ERROR,
1567  (errcode(ERRCODE_INDEX_CORRUPTED),
1568  errmsg_internal("right sibling's left-link doesn't match: "
1569  "block %u links to %u instead of expected %u in index \"%s\"",
1570  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1571  RelationGetRelationName(rel))));
1572  }
1573 
1574  /*
1575  * Check to see if we can set the SPLIT_END flag in the right-hand
1576  * split page; this can save some I/O for vacuum since it need not
1577  * proceed to the right sibling. We can set the flag if the right
1578  * sibling has a different cycleid: that means it could not be part of
1579  * a group of pages that were all split off from the same ancestor
1580  * page. If you're confused, imagine that page A splits to A B and
1581  * then again, yielding A C B, while vacuum is in progress. Tuples
1582  * originally in A could now be in either B or C, hence vacuum must
1583  * examine both pages. But if D, our right sibling, has a different
1584  * cycleid then it could not contain any tuples that were in A when
1585  * the vacuum started.
1586  */
1587  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1588  ropaque->btpo_flags |= BTP_SPLIT_END;
1589  }
1590 
1591  /*
1592  * Right sibling is locked, new siblings are prepared, but original page
1593  * is not updated yet.
1594  *
1595  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1596  * not starting the critical section till here because we haven't been
1597  * scribbling on the original page yet; see comments above.
1598  */
1599  START_CRIT_SECTION();
1600 
1601  /*
1602  * By here, the original data page has been split into two new halves, and
1603  * these are correct. The algorithm requires that the left page never
1604  * move during a split, so we copy the new left page back on top of the
1605  * original. Note that this is not a waste of time, since we also require
1606  * (in the page management code) that the center of a page always be
1607  * clean, and the most efficient way to guarantee this is just to compact
1608  * the data by reinserting it into a new left page. (XXX the latter
1609  * comment is probably obsolete; but in any case it's good to not scribble
1610  * on the original page until we enter the critical section.)
1611  *
1612  * We need to do this before writing the WAL record, so that XLogInsert
1613  * can WAL log an image of the page if necessary.
1614  */
1615  PageRestoreTempPage(leftpage, origpage);
1616  /* leftpage, lopaque must not be used below here */
1617 
1618  MarkBufferDirty(buf);
1619  MarkBufferDirty(rbuf);
1620 
1621  if (!P_RIGHTMOST(ropaque))
1622  {
1623  sopaque->btpo_prev = rightpagenumber;
1624  MarkBufferDirty(sbuf);
1625  }
1626 
1627  /*
1628  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1629  * a split.
1630  */
1631  if (!isleaf)
1632  {
1633  Page cpage = BufferGetPage(cbuf);
1634  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1635 
1636  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1637  MarkBufferDirty(cbuf);
1638  }
1639 
1640  /* XLOG stuff */
1641  if (RelationNeedsWAL(rel))
1642  {
1643  xl_btree_split xlrec;
1644  uint8 xlinfo;
1645  XLogRecPtr recptr;
1646 
1647  xlrec.level = ropaque->btpo.level;
1648  xlrec.firstright = firstright;
1649  xlrec.newitemoff = newitemoff;
1650 
1651  XLogBeginInsert();
1652  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1653 
1654  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1655  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1656  /* Log the right sibling, because we've changed its prev-pointer. */
1657  if (!P_RIGHTMOST(ropaque))
1658  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1659  if (BufferIsValid(cbuf))
1660  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1661 
1662  /*
1663  * Log the new item, if it was inserted on the left page. (If it was
1664  * put on the right page, we don't need to explicitly WAL log it
1665  * because it's included with all the other items on the right page.)
1666  * Show the new item as belonging to the left page buffer, so that it
1667  * is not stored if XLogInsert decides it needs a full-page image of
1668  * the left page. We store the offset anyway, though, to support
1669  * archive compression of these records.
1670  */
1671  if (newitemonleft)
1672  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1673 
1674  /* Log the left page's new high key */
1675  itemid = PageGetItemId(origpage, P_HIKEY);
1676  item = (IndexTuple) PageGetItem(origpage, itemid);
1677  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1678 
1679  /*
1680  * Log the contents of the right page in the format understood by
1681  * _bt_restore_page(). The whole right page will be recreated.
1682  *
1683  * Direct access to the page is not ideal, but it is faster; ideally the
1684  * page API would provide a function for this. Note we only store the tuples
1685  * themselves, knowing that they were inserted in item-number order
1686  * and so the line pointers can be reconstructed. See comments for
1687  * _bt_restore_page().
1688  */
1689  XLogRegisterBufData(1,
1690  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1691  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1692 
1693  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
1694  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1695 
1696  PageSetLSN(origpage, recptr);
1697  PageSetLSN(rightpage, recptr);
1698  if (!P_RIGHTMOST(ropaque))
1699  {
1700  PageSetLSN(spage, recptr);
1701  }
1702  if (!isleaf)
1703  {
1704  PageSetLSN(BufferGetPage(cbuf), recptr);
1705  }
1706  }
1707 
1708  END_CRIT_SECTION();
1709 
1710  /* release the old right sibling */
1711  if (!P_RIGHTMOST(ropaque))
1712  _bt_relbuf(rel, sbuf);
1713 
1714  /* release the child */
1715  if (!isleaf)
1716  _bt_relbuf(rel, cbuf);
1717 
1718  /* split's done */
1719  return rbuf;
1720 }
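/*
 * Illustrative sketch (standalone, not part of nbtinsert.c): the
 * XLogRegisterBufData() call above logs only the tuple bodies of the right
 * page, i.e. the byte range [pd_upper, pd_special).  The MiniPageHeader type
 * and the offset values below are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct MiniPageHeader
{
    uint16_t    pd_lower;       /* end of line pointer array */
    uint16_t    pd_upper;       /* start of tuple data */
    uint16_t    pd_special;     /* start of special space at page end */
} MiniPageHeader;

int
main(void)
{
    union
    {
        char        bytes[8192];
        MiniPageHeader hdr;
    }           page;

    memset(page.bytes, 0, sizeof(page.bytes));
    page.hdr.pd_lower = 64;     /* hypothetical values */
    page.hdr.pd_upper = 7800;
    page.hdr.pd_special = 8176;

    /* the byte range that would be registered for the right page */
    size_t      len = page.hdr.pd_special - page.hdr.pd_upper;

    printf("log %zu bytes starting at page offset %u\n",
           len, (unsigned) page.hdr.pd_upper);
    return 0;
}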
1721 
1722 /*
1723  * _bt_insert_parent() -- Insert downlink into parent, completing split.
1724  *
1725  * On entry, buf and rbuf are the left and right split pages, which we
1726  * still hold write locks on. Both locks will be released here. We
1727  * release the rbuf lock once we have a write lock on the page that we
1728  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
1729  * The lock on buf is released at the same point as the lock on the parent
1730  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
1731  * atomic operation that completes the split by inserting a new downlink.
1732  *
1733  * stack - stack showing how we got here. Will be NULL when splitting the true
1734  * root, or during a concurrent root split, where we can afford to be inefficient
1735  * is_root - we split the true root
1736  * is_only - we split a page alone on its level (might have been fast root)
1737  */
1738 static void
1739 _bt_insert_parent(Relation rel,
1740  Buffer buf,
1741  Buffer rbuf,
1742  BTStack stack,
1743  bool is_root,
1744  bool is_only)
1745 {
1746  /*
1747  * Here we have to do something Lehman and Yao don't talk about: deal with
1748  * a root split and construction of a new root. If our stack is empty
1749  * then we have just split a node on what had been the root level when we
1750  * descended the tree. If it was still the root then we perform a
1751  * new-root construction. If it *wasn't* the root anymore, search to find
1752  * the next higher level that someone constructed meanwhile, and find the
1753  * right place to insert as for the normal case.
1754  *
1755  * If we have to search for the parent level, we do so by re-descending
1756  * from the root. This is not super-efficient, but it's rare enough not
1757  * to matter.
1758  */
1759  if (is_root)
1760  {
1761  Buffer rootbuf;
1762 
1763  Assert(stack == NULL);
1764  Assert(is_only);
1765  /* create a new root node and update the metapage */
1766  rootbuf = _bt_newroot(rel, buf, rbuf);
1767  /* release the split buffers */
1768  _bt_relbuf(rel, rootbuf);
1769  _bt_relbuf(rel, rbuf);
1770  _bt_relbuf(rel, buf);
1771  }
1772  else
1773  {
1774  BlockNumber bknum = BufferGetBlockNumber(buf);
1775  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1776  Page page = BufferGetPage(buf);
1777  IndexTuple new_item;
1778  BTStackData fakestack;
1779  IndexTuple ritem;
1780  Buffer pbuf;
1781 
1782  if (stack == NULL)
1783  {
1784  BTPageOpaque lpageop;
1785 
1786  elog(DEBUG2, "concurrent ROOT page split");
1787  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1788  /* Find the leftmost page at the next level up */
1789  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1790  NULL);
1791  /* Set up a phony stack entry pointing there */
1792  stack = &fakestack;
1793  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1794  stack->bts_offset = InvalidOffsetNumber;
1795  stack->bts_parent = NULL;
1796  _bt_relbuf(rel, pbuf);
1797  }
1798 
1799  /* get high key from left, a strict lower bound for new right page */
1800  ritem = (IndexTuple) PageGetItem(page,
1801  PageGetItemId(page, P_HIKEY));
1802 
1803  /* form an index tuple that points at the new right page */
1804  new_item = CopyIndexTuple(ritem);
1805  BTreeTupleSetDownLink(new_item, rbknum);
1806 
1807  /*
1808  * Re-find and write lock the parent of buf.
1809  *
1810  * It's possible that the location of buf's downlink has changed since
1811  * our initial _bt_search() descent. _bt_getstackbuf() will detect
1812  * and recover from this, updating the stack, which ensures that the
1813  * new downlink will be inserted at the correct offset. Even buf's
1814  * parent may have changed.
1815  */
1816  pbuf = _bt_getstackbuf(rel, stack, bknum);
1817 
1818  /*
1819  * Now we can unlock the right child. The left child will be unlocked
1820  * by _bt_insertonpg().
1821  */
1822  _bt_relbuf(rel, rbuf);
1823 
1824  if (pbuf == InvalidBuffer)
1825  ereport(ERROR,
1826  (errcode(ERRCODE_INDEX_CORRUPTED),
1827  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
1828  RelationGetRelationName(rel), bknum, rbknum)));
1829 
1830  /* Recursively insert into the parent */
1831  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
1832  new_item, stack->bts_offset + 1,
1833  is_only);
1834 
1835  /* be tidy */
1836  pfree(new_item);
1837  }
1838 }
1839 
1840 /*
1841  * _bt_finish_split() -- Finish an incomplete split
1842  *
1843  * A crash or other failure can leave a split incomplete. The insertion
1844  * routines won't allow insertion on a page that is incompletely split.
1845  * Before inserting on such a page, call _bt_finish_split().
1846  *
1847  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
1848  * and unpinned.
1849  */
1850 void
1851 _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
1852 {
1853  Page lpage = BufferGetPage(lbuf);
1854  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1855  Buffer rbuf;
1856  Page rpage;
1857  BTPageOpaque rpageop;
1858  bool was_root;
1859  bool was_only;
1860 
1861  Assert(P_INCOMPLETE_SPLIT(lpageop));
1862 
1863  /* Lock right sibling, the one missing the downlink */
1864  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1865  rpage = BufferGetPage(rbuf);
1866  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1867 
1868  /* Could this be a root split? */
1869  if (!stack)
1870  {
1871  Buffer metabuf;
1872  Page metapg;
1873  BTMetaPageData *metad;
1874 
1875  /* acquire lock on the metapage */
1876  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1877  metapg = BufferGetPage(metabuf);
1878  metad = BTPageGetMeta(metapg);
1879 
1880  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1881 
1882  _bt_relbuf(rel, metabuf);
1883  }
1884  else
1885  was_root = false;
1886 
1887  /* Was this the only page on the level before split? */
1888  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1889 
1890  elog(DEBUG1, "finishing incomplete split of %u/%u",
1891  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1892 
1893  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1894 }
1895 
1896 /*
1897  * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
1898  * tuple whose downlink points to child page.
1899  *
1900  * Caller passes child's block number, which is used to identify
1901  * associated pivot tuple in parent page using a linear search that
1902  * matches on pivot's downlink/block number. The expected location of
1903  * the pivot tuple is taken from the stack one level above the child
1904  * page. This is used as a starting point. Insertions into the
1905  * parent level could cause the pivot tuple to move right; deletions
1906  * could cause it to move left, but not left of the page we previously
1907  * found it on.
1908  *
1909  * Caller can use its stack to relocate the pivot tuple/downlink for
1910  * any same-level page to the right of the page found by its initial
1911  * descent. This is necessary because of the possibility that caller
1912  * moved right to recover from a concurrent page split. It's also
1913  * convenient for certain callers to be able to step right when there
1914  * wasn't a concurrent page split, while still using their original
1915  * stack. For example, the checkingunique _bt_doinsert() case may
1916  * have to step right when there are many physical duplicates, and its
1917  * scantid forces an insertion to the right of the "first page the
1918  * value could be on".
1919  *
1920  * Returns write-locked parent page buffer, or InvalidBuffer if pivot
1921  * tuple not found (should not happen). Adjusts bts_blkno &
1922  * bts_offset if changed. Page split caller should insert its new
1923  * pivot tuple for its new right sibling page on parent page, at the
1924  * offset number bts_offset + 1.
1925  */
1926 Buffer
1927 _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
1928 {
1929  BlockNumber blkno;
1930  OffsetNumber start;
1931 
1932  blkno = stack->bts_blkno;
1933  start = stack->bts_offset;
1934 
1935  for (;;)
1936  {
1937  Buffer buf;
1938  Page page;
1939  BTPageOpaque opaque;
1940 
1941  buf = _bt_getbuf(rel, blkno, BT_WRITE);
1942  page = BufferGetPage(buf);
1943  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1944 
1945  if (P_INCOMPLETE_SPLIT(opaque))
1946  {
1947  _bt_finish_split(rel, buf, stack->bts_parent);
1948  continue;
1949  }
1950 
1951  if (!P_IGNORE(opaque))
1952  {
1953  OffsetNumber offnum,
1954  minoff,
1955  maxoff;
1956  ItemId itemid;
1957  IndexTuple item;
1958 
1959  minoff = P_FIRSTDATAKEY(opaque);
1960  maxoff = PageGetMaxOffsetNumber(page);
1961 
1962  /*
1963  * start = InvalidOffsetNumber means "search the whole page". We
1964  * need this test anyway due to the possibility that the page has a high
1965  * key now when it didn't before.
1966  */
1967  if (start < minoff)
1968  start = minoff;
1969 
1970  /*
1971  * Need this check too, to guard against the possibility that the page
1972  * split since we visited it originally.
1973  */
1974  if (start > maxoff)
1975  start = OffsetNumberNext(maxoff);
1976 
1977  /*
1978  * These loops will check every item on the page --- but in an
1979  * order that's attuned to the probability of where it actually
1980  * is. Scan to the right first, then to the left.
1981  */
1982  for (offnum = start;
1983  offnum <= maxoff;
1984  offnum = OffsetNumberNext(offnum))
1985  {
1986  itemid = PageGetItemId(page, offnum);
1987  item = (IndexTuple) PageGetItem(page, itemid);
1988 
1989  if (BTreeTupleGetDownLink(item) == child)
1990  {
1991  /* Return accurate pointer to where link is now */
1992  stack->bts_blkno = blkno;
1993  stack->bts_offset = offnum;
1994  return buf;
1995  }
1996  }
1997 
1998  for (offnum = OffsetNumberPrev(start);
1999  offnum >= minoff;
2000  offnum = OffsetNumberPrev(offnum))
2001  {
2002  itemid = PageGetItemId(page, offnum);
2003  item = (IndexTuple) PageGetItem(page, itemid);
2004 
2005  if (BTreeTupleGetDownLink(item) == child)
2006  {
2007  /* Return accurate pointer to where link is now */
2008  stack->bts_blkno = blkno;
2009  stack->bts_offset = offnum;
2010  return buf;
2011  }
2012  }
2013  }
2014 
2015  /*
2016  * The item we're looking for moved right at least one page.
2017  *
2018  * Lehman and Yao couple/chain locks when moving right here, which we
2019  * can avoid. See nbtree/README.
2020  */
2021  if (P_RIGHTMOST(opaque))
2022  {
2023  _bt_relbuf(rel, buf);
2024  return InvalidBuffer;
2025  }
2026  blkno = opaque->btpo_next;
2027  start = InvalidOffsetNumber;
2028  _bt_relbuf(rel, buf);
2029  }
2030 }
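/*
 * Illustrative sketch (standalone, not part of nbtinsert.c): the same
 * right-then-left probe order that _bt_getstackbuf() uses above, applied to
 * a plain array of child block numbers.  find_downlink() and the sample
 * values are invented for the example.
 */
#include <stdio.h>

static int
find_downlink(const unsigned *downlinks, int nitems, int start, unsigned child)
{
    /* scan right from the remembered position first ... */
    for (int i = start; i < nitems; i++)
        if (downlinks[i] == child)
            return i;
    /* ... then fall back to scanning left of it */
    for (int i = start - 1; i >= 0; i--)
        if (downlinks[i] == child)
            return i;
    return -1;                  /* not on this "page" */
}

int
main(void)
{
    unsigned    downlinks[] = {10, 42, 7, 99, 3};

    /* remembered offset was 3, but the downlink has since moved left */
    printf("found at index %d\n", find_downlink(downlinks, 5, 3, 42));
    return 0;
}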
2031 
2032 /*
2033  * _bt_newroot() -- Create a new root page for the index.
2034  *
2035  * We've just split the old root page and need to create a new one.
2036  * In order to do this, we add a new root page to the file, then lock
2037  * the metadata page and update it. This is guaranteed to be deadlock-
2038  * free, because all readers release their locks on the metadata page
2039  * before trying to lock the root, and all writers lock the root before
2040  * trying to lock the metadata page. We have a write lock on the old
2041  * root page, so we have not introduced any cycles into the waits-for
2042  * graph.
2043  *
2044  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2045  * locked. On exit, a new root page exists with entries for the
2046  * two new children, metapage is updated and unlocked/unpinned.
2047  * The new root buffer is returned to the caller, which must unlock/unpin
2048  * lbuf, rbuf & rootbuf.
2049  */
2050 static Buffer
2051 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
2052 {
2053  Buffer rootbuf;
2054  Page lpage,
2055  rootpage;
2056  BlockNumber lbkno,
2057  rbkno;
2058  BlockNumber rootblknum;
2059  BTPageOpaque rootopaque;
2060  BTPageOpaque lopaque;
2061  ItemId itemid;
2062  IndexTuple item;
2063  IndexTuple left_item;
2064  Size left_item_sz;
2065  IndexTuple right_item;
2066  Size right_item_sz;
2067  Buffer metabuf;
2068  Page metapg;
2069  BTMetaPageData *metad;
2070 
2071  lbkno = BufferGetBlockNumber(lbuf);
2072  rbkno = BufferGetBlockNumber(rbuf);
2073  lpage = BufferGetPage(lbuf);
2074  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2075 
2076  /* get a new root page */
2077  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2078  rootpage = BufferGetPage(rootbuf);
2079  rootblknum = BufferGetBlockNumber(rootbuf);
2080 
2081  /* acquire lock on the metapage */
2082  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2083  metapg = BufferGetPage(metabuf);
2084  metad = BTPageGetMeta(metapg);
2085 
2086  /*
2087  * Create downlink item for left page (old root). Since this will be the
2088  * first item in a non-leaf page, it implicitly has minus-infinity key
2089  * value, so we need not store any actual key in it.
2090  */
2091  left_item_sz = sizeof(IndexTupleData);
2092  left_item = (IndexTuple) palloc(left_item_sz);
2093  left_item->t_info = left_item_sz;
2094  BTreeTupleSetDownLink(left_item, lbkno);
2095  BTreeTupleSetNAtts(left_item, 0);
2096 
2097  /*
2098  * Create downlink item for right page. The key for it is obtained from
2099  * the "high key" position in the left page.
2100  */
2101  itemid = PageGetItemId(lpage, P_HIKEY);
2102  right_item_sz = ItemIdGetLength(itemid);
2103  item = (IndexTuple) PageGetItem(lpage, itemid);
2104  right_item = CopyIndexTuple(item);
2105  BTreeTupleSetDownLink(right_item, rbkno);
2106 
2107  /* NO EREPORT(ERROR) from here till newroot op is logged */
2108  START_CRIT_SECTION();
2109 
2110  /* upgrade metapage if needed */
2111  if (metad->btm_version < BTREE_NOVAC_VERSION)
2112  _bt_upgrademetapage(metapg);
2113 
2114  /* set btree special data */
2115  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2116  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2117  rootopaque->btpo_flags = BTP_ROOT;
2118  rootopaque->btpo.level =
2119  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2120  rootopaque->btpo_cycleid = 0;
2121 
2122  /* update metapage data */
2123  metad->btm_root = rootblknum;
2124  metad->btm_level = rootopaque->btpo.level;
2125  metad->btm_fastroot = rootblknum;
2126  metad->btm_fastlevel = rootopaque->btpo.level;
2127 
2128  /*
2129  * Insert the left page pointer into the new root page. The root page is
2130  * the rightmost page on its level so there is no "high key" in it; the
2131  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2132  *
2133  * Note: we *must* insert the two items in item-number order, for the
2134  * benefit of _bt_restore_page().
2135  */
2136  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2137  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2138  false, false) == InvalidOffsetNumber)
2139  elog(PANIC, "failed to add leftkey to new root page"
2140  " while splitting block %u of index \"%s\"",
2141  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2142 
2143  /*
2144  * insert the right page pointer into the new root page.
2145  */
2146  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2147  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2148  IndexRelationGetNumberOfKeyAttributes(rel));
2149  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2150  false, false) == InvalidOffsetNumber)
2151  elog(PANIC, "failed to add rightkey to new root page"
2152  " while splitting block %u of index \"%s\"",
2153  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2154 
2155  /* Clear the incomplete-split flag in the left child */
2156  Assert(P_INCOMPLETE_SPLIT(lopaque));
2157  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2158  MarkBufferDirty(lbuf);
2159 
2160  MarkBufferDirty(rootbuf);
2161  MarkBufferDirty(metabuf);
2162 
2163  /* XLOG stuff */
2164  if (RelationNeedsWAL(rel))
2165  {
2166  xl_btree_newroot xlrec;
2167  XLogRecPtr recptr;
2168  xl_btree_metadata md;
2169 
2170  xlrec.rootblk = rootblknum;
2171  xlrec.level = metad->btm_level;
2172 
2173  XLogBeginInsert();
2174  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2175 
2176  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2177  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2178  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2179 
2180  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2181  md.version = metad->btm_version;
2182  md.root = rootblknum;
2183  md.level = metad->btm_level;
2184  md.fastroot = rootblknum;
2185  md.fastlevel = metad->btm_level;
2186  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2187  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2188 
2189  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2190 
2191  /*
2192  * Direct access to the page is not ideal, but it is faster; ideally the
2193  * page API would provide a function for this.
2194  */
2195  XLogRegisterBufData(0,
2196  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2197  ((PageHeader) rootpage)->pd_special -
2198  ((PageHeader) rootpage)->pd_upper);
2199 
2200  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2201 
2202  PageSetLSN(lpage, recptr);
2203  PageSetLSN(rootpage, recptr);
2204  PageSetLSN(metapg, recptr);
2205  }
2206 
2207  END_CRIT_SECTION();
2208 
2209  /* done with metapage */
2210  _bt_relbuf(rel, metabuf);
2211 
2212  pfree(left_item);
2213  pfree(right_item);
2214 
2215  return rootbuf;
2216 }
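/*
 * Illustrative sketch (standalone, not part of nbtinsert.c): why the left
 * downlink in the new root needs no key.  Under a simplified descent rule
 * (follow the last pivot whose separator key does not exceed the search
 * key), a first pivot treated as minus infinity always qualifies.  ToyPivot,
 * choose_child() and the sample keys are invented for the example.
 */
#include <stdio.h>

typedef struct ToyPivot
{
    int         haskey;         /* 0 => implicit minus-infinity key */
    int         key;            /* separator key, valid only if haskey */
    unsigned    child;          /* downlink: child block number */
} ToyPivot;

static unsigned
choose_child(const ToyPivot *pivots, int n, int searchkey)
{
    int         best = 0;       /* minus-infinity pivot always qualifies */

    for (int i = 1; i < n; i++)
        if (pivots[i].haskey && pivots[i].key <= searchkey)
            best = i;
    return pivots[best].child;
}

int
main(void)
{
    /*
     * A freshly built root: block 1 is the old root (left child), block 2 is
     * its new right sibling, and the left page's high key was 100.
     */
    ToyPivot    root[] = {
        {0, 0, 1},              /* minus-infinity downlink to left child */
        {1, 100, 2},            /* left page's high key, downlink to right */
    };

    printf("key 40 -> block %u\n", choose_child(root, 2, 40));
    printf("key 250 -> block %u\n", choose_child(root, 2, 250));
    return 0;
}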
2217 
2218 /*
2219  * _bt_pgaddtup() -- add a tuple to a particular page in the index.
2220  *
2221  * This routine adds the tuple to the page as requested. It does
2222  * not affect pin/lock status, but you'd better have a write lock
2223  * and pin on the target buffer! Don't forget to write and release
2224  * the buffer afterwards, either.
2225  *
2226  * The main difference between this routine and a bare PageAddItem call
2227  * is that this code knows that the leftmost index tuple on a non-leaf
2228  * btree page has a key that must be treated as minus infinity.
2229  * Therefore, it truncates away all attributes. CAUTION: this works
2230  * ONLY if we insert the tuples in order, so that the given itup_off
2231  * does represent the final position of the tuple!
2232  */
2233 static bool
2234 _bt_pgaddtup(Page page,
2235  Size itemsize,
2236  IndexTuple itup,
2237  OffsetNumber itup_off)
2238 {
2239  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2240  IndexTupleData trunctuple;
2241 
2242  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2243  {
2244  trunctuple = *itup;
2245  trunctuple.t_info = sizeof(IndexTupleData);
2246  BTreeTupleSetNAtts(&trunctuple, 0);
2247  itup = &trunctuple;
2248  itemsize = sizeof(IndexTupleData);
2249  }
2250 
2251  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2252  false, false) == InvalidOffsetNumber)
2253  return false;
2254 
2255  return true;
2256 }
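/*
 * Illustrative sketch (standalone, not part of nbtinsert.c): the shape of
 * the "minus infinity" truncation performed above, applied to a toy tuple
 * type.  ToyTuple and its fields are invented; in the real code the size
 * lives in t_info and the attribute count is set with BTreeTupleSetNAtts().
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct ToyTuple
{
    uint16_t    size;           /* total size in bytes, header included */
    uint16_t    natts;          /* number of key attributes stored */
    int32_t     key;            /* key payload, meaningful only if natts > 0 */
} ToyTuple;

/* first item on a non-leaf page: keep the header only, drop the key */
static ToyTuple
truncate_to_minus_infinity(ToyTuple src)
{
    ToyTuple    trunc = src;

    trunc.size = offsetof(ToyTuple, key);
    trunc.natts = 0;
    return trunc;
}

int
main(void)
{
    ToyTuple    t = {sizeof(ToyTuple), 1, 12345};
    ToyTuple    first = truncate_to_minus_infinity(t);

    printf("natts=%u size=%u\n", (unsigned) first.natts, (unsigned) first.size);
    return 0;
}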
2257 
2258 /*
2259  * _bt_vacuum_one_page - vacuum just one index page.
2260  *
2261  * Try to remove LP_DEAD items from the given page. The passed buffer
2262  * must be exclusive-locked, but unlike a real VACUUM, we don't need a
2263  * super-exclusive "cleanup" lock (see nbtree/README).
2264  */
2265 static void
2266 _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
2267 {
2268  OffsetNumber deletable[MaxOffsetNumber];
2269  int ndeletable = 0;
2270  OffsetNumber offnum,
2271  minoff,
2272  maxoff;
2273  Page page = BufferGetPage(buffer);
2274  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2275 
2276  Assert(P_ISLEAF(opaque));
2277 
2278  /*
2279  * Scan over all items to see which ones need to be deleted according to
2280  * LP_DEAD flags.
2281  */
2282  minoff = P_FIRSTDATAKEY(opaque);
2283  maxoff = PageGetMaxOffsetNumber(page);
2284  for (offnum = minoff;
2285  offnum <= maxoff;
2286  offnum = OffsetNumberNext(offnum))
2287  {
2288  ItemId itemId = PageGetItemId(page, offnum);
2289 
2290  if (ItemIdIsDead(itemId))
2291  deletable[ndeletable++] = offnum;
2292  }
2293 
2294  if (ndeletable > 0)
2295  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2296 
2297  /*
2298  * Note: if we didn't find any LP_DEAD items, then the page's
2299  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2300  * separate write to clear it, however. We will clear it when we split
2301  * the page.
2302  */
2303 }
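/*
 * Illustrative sketch (standalone, not part of nbtinsert.c): the
 * collect-then-batch pattern used above, with LP_DEAD flags replaced by a
 * plain array of ints.  collect_dead() and the sample data are invented for
 * the example; the point is that one pass gathers every dead offset so a
 * single batched delete can handle them all.
 */
#include <stdio.h>

#define MAX_ITEMS 32

static int
collect_dead(const int *dead, int nitems, int *deletable)
{
    int         ndeletable = 0;

    for (int off = 0; off < nitems; off++)
        if (dead[off])
            deletable[ndeletable++] = off;
    return ndeletable;
}

int
main(void)
{
    int         dead[MAX_ITEMS] = {0, 1, 0, 0, 1, 1};
    int         deletable[MAX_ITEMS];
    int         ndeletable = collect_dead(dead, 6, deletable);

    /* a single batched delete call would consume the whole array */
    printf("%d dead items, first at offset %d\n",
           ndeletable, ndeletable > 0 ? deletable[0] : -1);
    return 0;
}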