nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/heapam.h"
19 #include "access/nbtree.h"
20 #include "access/nbtxlog.h"
21 #include "access/transam.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "storage/lmgr.h"
25 #include "storage/predicate.h"
26 #include "storage/smgr.h"
27 #include "utils/tqual.h"
28 
29 /* Minimum tree height for application of fastpath optimization */
30 #define BTREE_FASTPATH_MIN_LEVEL 2
31 
32 typedef struct
33 {
34  /* context data for _bt_checksplitloc */
35  Size newitemsz; /* size of new item to be inserted */
36  int fillfactor; /* needed when splitting rightmost page */
37  bool is_leaf; /* T if splitting a leaf page */
38  bool is_rightmost; /* T if splitting a rightmost page */
39  OffsetNumber newitemoff; /* where the new item is to be inserted */
40  int leftspace; /* space available for items on left page */
41  int rightspace; /* space available for items on right page */
42  int olddataitemstotal; /* space taken by old items */
43 
44  bool have_split; /* found a valid split? */
45 
46  /* these fields valid only if have_split is true */
47  bool newitemonleft; /* new item on left or right of best split */
48  OffsetNumber firstright; /* best split point */
49  int best_delta; /* best size delta so far */
50 } FindSplitData;
51 
52 
53 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
54 
55 static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
56  Relation heapRel, Buffer buf, OffsetNumber offset,
57  ScanKey itup_scankey,
58  IndexUniqueCheck checkUnique, bool *is_unique,
59  uint32 *speculativeToken);
60 static void _bt_findinsertloc(Relation rel,
61  Buffer *bufptr,
62  OffsetNumber *offsetptr,
63  int keysz,
64  ScanKey scankey,
65  IndexTuple newtup,
66  BTStack stack,
67  Relation heapRel);
68 static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
69  BTStack stack,
70  IndexTuple itup,
71  OffsetNumber newitemoff,
72  bool split_only_page);
73 static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
74  OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
75  IndexTuple newitem, bool newitemonleft);
76 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
77  BTStack stack, bool is_root, bool is_only);
78 static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
79  OffsetNumber newitemoff,
80  Size newitemsz,
81  bool *newitemonleft);
82 static void _bt_checksplitloc(FindSplitData *state,
83  OffsetNumber firstoldonright, bool newitemonleft,
84  int dataitemstoleft, Size firstoldonrightsz);
85 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
86  OffsetNumber itup_off);
87 static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
88  int keysz, ScanKey scankey);
89 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
90 
91 /*
92  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
93  *
94  * This routine is called by the public interface routine, btinsert.
95  * By here, itup is filled in, including the TID.
96  *
97  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
98  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
99  * UNIQUE_CHECK_EXISTING) it will throw an error for a duplicate.
100  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
101  * don't actually insert.
102  *
103  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
104  * it must be true if the entry is known unique, else false.
105  * (In the current implementation we'll also return true after a
106  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
107  * that's just a coding artifact.)
108  */
109 bool
110 _bt_doinsert(Relation rel, IndexTuple itup,
111  IndexUniqueCheck checkUnique, Relation heapRel)
112 {
113  bool is_unique = false;
114  int indnkeyatts;
115  ScanKey itup_scankey;
116  BTStack stack = NULL;
117  Buffer buf;
118  OffsetNumber offset;
119  bool fastpath;
120 
121  indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
122  Assert(indnkeyatts != 0);
123 
124  /* we need an insertion scan key to do our search, so build one */
125  itup_scankey = _bt_mkscankey(rel, itup);
126 
127  /*
128  * It's very common to have an index on an auto-incremented or
129  * monotonically increasing value. In such cases, every insertion happens
130  * towards the end of the index. We try to optimize that case by caching
131  * the right-most leaf of the index. If our cached block is still the
132  * rightmost leaf, has enough free space to accommodate a new entry and
133  * the insertion key is strictly greater than the first key in this page,
134  * then we can safely conclude that the new key will be inserted in the
135  * cached block. So we simply search within the cached block and insert the
136  * key at the appropriate location. We call it a fastpath.
137  *
138  * Testing has revealed, though, that the fastpath can result in increased
139  * contention on the exclusive-lock on the rightmost leaf page. So we
140  * conditionally check if the lock is available. If it's not available then
141  * we simply abandon the fastpath and take the regular path. This makes
142  * sense because unavailability of the lock also signals that some other
143  * backend might be concurrently inserting into the page, thus reducing our
144  * chances of finding an insertion place in this page.
145  */
146 top:
147  fastpath = false;
148  offset = InvalidOffsetNumber;
149  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
150  {
151  Size itemsz;
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change, as
159  * far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170  itemsz = IndexTupleSize(itup);
171  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
172  * but we need to be consistent */
173 
174  /*
175  * Check if the page is still the rightmost leaf page, has enough
176  * free space to accommodate the new tuple, and the insertion
177  * scan key is strictly greater than the first key on the page.
178  */
179  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
180  !P_IGNORE(lpageop) &&
181  (PageGetFreeSpace(page) > itemsz) &&
182  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
183  _bt_compare(rel, indnkeyatts, itup_scankey, page,
184  P_FIRSTDATAKEY(lpageop)) > 0)
185  {
186  /*
187  * The right-most block should never have an incomplete split.
188  * But be paranoid and check for it anyway.
189  */
190  Assert(!P_INCOMPLETE_SPLIT(lpageop));
191  fastpath = true;
192  }
193  else
194  {
195  _bt_relbuf(rel, buf);
196 
197  /*
198  * Something did not work out. Just forget about the cached
199  * block and follow the normal path. It might be set again if
200  * the conditions are favourable.
201  */
202  RelationSetTargetBlock(rel, InvalidBlockNumber);
203  }
204  }
205  else
206  {
207  ReleaseBuffer(buf);
208 
209  /*
210  * If someone's holding a lock, it's likely to change anyway,
211  * so don't try again until we get an updated rightmost leaf.
212  */
213  RelationSetTargetBlock(rel, InvalidBlockNumber);
214  }
215  }
216 
217  if (!fastpath)
218  {
219  /* find the first page containing this key */
220  stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE,
221  NULL);
222 
223  /* trade in our read lock for a write lock */
224  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
225  LockBuffer(buf, BT_WRITE);
226 
227  /*
228  * If the page was split between the time that we surrendered our read
229  * lock and acquired our write lock, then this page may no longer be
230  * the right place for the key we want to insert. In this case, we
231  * need to move right in the tree. See Lehman and Yao for an
232  * excruciatingly precise description.
233  */
234  buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false,
235  true, stack, BT_WRITE, NULL);
236  }
237 
238  /*
239  * If we're not allowing duplicates, make sure the key isn't already in
240  * the index.
241  *
242  * NOTE: obviously, _bt_check_unique can only detect keys that are already
243  * in the index; so it cannot defend against concurrent insertions of the
244  * same key. We protect against that by means of holding a write lock on
245  * the target page. Any other would-be inserter of the same key must
246  * acquire a write lock on the same target page, so only one would-be
247  * inserter can be making the check at one time. Furthermore, once we are
248  * past the check we hold write locks continuously until we have performed
249  * our insertion, so no later inserter can fail to see our insertion.
250  * (This requires some care in _bt_insertonpg.)
251  *
252  * If we must wait for another xact, we release the lock while waiting,
253  * and then must start over completely.
254  *
255  * For a partial uniqueness check, we don't wait for the other xact. Just
256  * let the tuple in and return false for possibly non-unique, or true for
257  * definitely unique.
258  */
259  if (checkUnique != UNIQUE_CHECK_NO)
260  {
261  TransactionId xwait;
262  uint32 speculativeToken;
263 
264  offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false);
265  xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
266  checkUnique, &is_unique, &speculativeToken);
267 
268  if (TransactionIdIsValid(xwait))
269  {
270  /* Have to wait for the other guy ... */
271  _bt_relbuf(rel, buf);
272 
273  /*
274  * If it's a speculative insertion, wait for it to finish (ie. to
275  * go ahead with the insertion, or kill the tuple). Otherwise
276  * wait for the transaction to finish as usual.
277  */
278  if (speculativeToken)
279  SpeculativeInsertionWait(xwait, speculativeToken);
280  else
281  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
282 
283  /* start over... */
284  if (stack)
285  _bt_freestack(stack);
286  goto top;
287  }
288  }
289 
290  if (checkUnique != UNIQUE_CHECK_EXISTING)
291  {
292  /*
293  * The only conflict predicate locking cares about for indexes is when
294  * an index tuple insert conflicts with an existing lock. Since the
295  * actual location of the insert is hard to predict because of the
296  * random search used to prevent O(N^2) performance when there are
297  * many duplicate entries, we can just use the "first valid" page.
298  * This reasoning also applies to INCLUDE indexes, whose extra
299  * attributes are not considered part of the key space.
300  */
301  CheckForSerializableConflictIn(rel, NULL, buf);
302  /* do the insertion */
303  _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup,
304  stack, heapRel);
305  _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
306  }
307  else
308  {
309  /* just release the buffer */
310  _bt_relbuf(rel, buf);
311  }
312 
313  /* be tidy */
314  if (stack)
315  _bt_freestack(stack);
316  _bt_freeskey(itup_scankey);
317 
318  return is_unique;
319 }
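
/*
 * Illustrative sketch, not part of nbtinsert.c: _bt_doinsert() is reached via
 * the public interface routine btinsert() in src/backend/access/nbtree/nbtree.c,
 * which forms the index tuple from the caller's datums, stamps the heap TID,
 * and hands it off.  The function below is a simplified approximation of that
 * caller for illustration only; the real function's signature and surrounding
 * error handling may differ.
 */
static bool
btinsert_sketch(Relation rel, Datum *values, bool *isnull,
				ItemPointer ht_ctid, Relation heapRel,
				IndexUniqueCheck checkUnique)
{
	IndexTuple	itup;
	bool		result;

	/* generate an index tuple from the column values */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	/* point the index tuple at the heap row it describes */
	itup->t_tid = *ht_ctid;

	/* insert it, enforcing uniqueness as requested (see _bt_doinsert above) */
	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);

	return result;
}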
320 
321 /*
322  * _bt_check_unique() -- Check for violation of unique index constraint
323  *
324  * offset points to the first possible item that could conflict. It can
325  * also point to end-of-page, which means that the first tuple to check
326  * is the first tuple on the next page.
327  *
328  * Returns InvalidTransactionId if there is no conflict, else an xact ID
329  * we must wait for to see if it commits a conflicting tuple. If an actual
330  * conflict is detected, no return --- just ereport(). If an xact ID is
331  * returned, and the conflicting tuple still has a speculative insertion in
332  * progress, *speculativeToken is set to non-zero, and the caller can wait for
333  * the verdict on the insertion using SpeculativeInsertionWait().
334  *
335  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
336  * InvalidTransactionId because we don't want to wait. In this case we
337  * set *is_unique to false if there is a potential conflict, and the
338  * core code must redo the uniqueness check later.
339  */
340 static TransactionId
341 _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
342  Buffer buf, OffsetNumber offset, ScanKey itup_scankey,
343  IndexUniqueCheck checkUnique, bool *is_unique,
344  uint32 *speculativeToken)
345 {
346  TupleDesc itupdesc = RelationGetDescr(rel);
347  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
348  SnapshotData SnapshotDirty;
349  OffsetNumber maxoff;
350  Page page;
351  BTPageOpaque opaque;
352  Buffer nbuf = InvalidBuffer;
353  bool found = false;
354 
355  /* Assume unique until we find a duplicate */
356  *is_unique = true;
357 
358  InitDirtySnapshot(SnapshotDirty);
359 
360  page = BufferGetPage(buf);
361  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362  maxoff = PageGetMaxOffsetNumber(page);
363 
364  /*
365  * Scan over all equal tuples, looking for live conflicts.
366  */
367  for (;;)
368  {
369  ItemId curitemid;
370  IndexTuple curitup;
371  BlockNumber nblkno;
372 
373  /*
374  * make sure the offset points to an actual item before trying to
375  * examine it...
376  */
377  if (offset <= maxoff)
378  {
379  curitemid = PageGetItemId(page, offset);
380 
381  /*
382  * We can skip items that are marked killed.
383  *
384  * Formerly, we applied _bt_isequal() before checking the kill
385  * flag, so as to fall out of the item loop as soon as possible.
386  * However, in the presence of heavy update activity an index may
387  * contain many killed items with the same key; running
388  * _bt_isequal() on each killed item gets expensive. Furthermore
389  * it is likely that the non-killed version of each key appears
390  * first, so that we didn't actually get to exit any sooner
391  * anyway. So now we just advance over killed items as quickly as
392  * we can. We only apply _bt_isequal() when we get to a non-killed
393  * item or the end of the page.
394  */
395  if (!ItemIdIsDead(curitemid))
396  {
397  ItemPointerData htid;
398  bool all_dead;
399 
400  /*
401  * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this is
402  * how we handle NULLs - and so we must not use _bt_compare
403  * for real comparisons, but only for ordering/finding items on
404  * pages. - vadim 03/24/97
405  */
406  if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
407  break; /* we're past all the equal tuples */
408 
409  /* okay, we gotta fetch the heap tuple ... */
410  curitup = (IndexTuple) PageGetItem(page, curitemid);
411  htid = curitup->t_tid;
412 
413  /*
414  * If we are doing a recheck, we expect to find the tuple we
415  * are rechecking. It's not a duplicate, but we have to keep
416  * scanning.
417  */
418  if (checkUnique == UNIQUE_CHECK_EXISTING &&
419  ItemPointerCompare(&htid, &itup->t_tid) == 0)
420  {
421  found = true;
422  }
423 
424  /*
425  * We check the whole HOT-chain to see if there is any tuple
426  * that satisfies SnapshotDirty. This is necessary because we
427  * have just a single index entry for the entire chain.
428  */
429  else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
430  &all_dead))
431  {
432  TransactionId xwait;
433 
434  /*
435  * It is a duplicate. If we are only doing a partial
436  * check, then don't bother checking if the tuple is being
437  * updated in another transaction. Just return the fact
438  * that it is a potential conflict and leave the full
439  * check till later.
440  */
441  if (checkUnique == UNIQUE_CHECK_PARTIAL)
442  {
443  if (nbuf != InvalidBuffer)
444  _bt_relbuf(rel, nbuf);
445  *is_unique = false;
446  return InvalidTransactionId;
447  }
448 
449  /*
450  * If this tuple is being updated by other transaction
451  * then we have to wait for its commit/abort.
452  */
453  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
454  SnapshotDirty.xmin : SnapshotDirty.xmax;
455 
456  if (TransactionIdIsValid(xwait))
457  {
458  if (nbuf != InvalidBuffer)
459  _bt_relbuf(rel, nbuf);
460  /* Tell _bt_doinsert to wait... */
461  *speculativeToken = SnapshotDirty.speculativeToken;
462  return xwait;
463  }
464 
465  /*
466  * Otherwise we have a definite conflict. But before
467  * complaining, look to see if the tuple we want to insert
468  * is itself now committed dead --- if so, don't complain.
469  * This is a waste of time in normal scenarios but we must
470  * do it to support CREATE INDEX CONCURRENTLY.
471  *
472  * We must follow HOT-chains here because during
473  * concurrent index build, we insert the root TID though
474  * the actual tuple may be somewhere in the HOT-chain.
475  * While following the chain we might not stop at the
476  * exact tuple which triggered the insert, but that's OK
477  * because if we find a live tuple anywhere in this chain,
478  * we have a unique key conflict. The other live tuple is
479  * not part of this chain because it had a different index
480  * entry.
481  */
482  htid = itup->t_tid;
483  if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
484  {
485  /* Normal case --- it's still live */
486  }
487  else
488  {
489  /*
490  * It's been deleted, so no error, and no need to
491  * continue searching
492  */
493  break;
494  }
495 
496  /*
497  * Check for a conflict-in as we would if we were going to
498  * write to this page. We aren't actually going to write,
499  * but we want a chance to report SSI conflicts that would
500  * otherwise be masked by this unique constraint
501  * violation.
502  */
503  CheckForSerializableConflictIn(rel, NULL, buf);
504 
505  /*
506  * This is a definite conflict. Break the tuple down into
507  * datums and report the error. But first, make sure we
508  * release the buffer locks we're holding ---
509  * BuildIndexValueDescription could make catalog accesses,
510  * which in the worst case might touch this same index and
511  * cause deadlocks.
512  */
513  if (nbuf != InvalidBuffer)
514  _bt_relbuf(rel, nbuf);
515  _bt_relbuf(rel, buf);
516 
517  {
518  Datum values[INDEX_MAX_KEYS];
519  bool isnull[INDEX_MAX_KEYS];
520  char *key_desc;
521 
522  index_deform_tuple(itup, RelationGetDescr(rel),
523  values, isnull);
524 
525  key_desc = BuildIndexValueDescription(rel, values,
526  isnull);
527 
528  ereport(ERROR,
529  (errcode(ERRCODE_UNIQUE_VIOLATION),
530  errmsg("duplicate key value violates unique constraint \"%s\"",
531  RelationGetRelationName(rel)),
532  key_desc ? errdetail("Key %s already exists.",
533  key_desc) : 0,
534  errtableconstraint(heapRel,
535  RelationGetRelationName(rel))));
536  }
537  }
538  else if (all_dead)
539  {
540  /*
541  * The conflicting tuple (or whole HOT chain) is dead to
542  * everyone, so we may as well mark the index entry
543  * killed.
544  */
545  ItemIdMarkDead(curitemid);
546  opaque->btpo_flags |= BTP_HAS_GARBAGE;
547 
548  /*
549  * Mark buffer with a dirty hint, since state is not
550  * crucial. Be sure to mark the proper buffer dirty.
551  */
552  if (nbuf != InvalidBuffer)
553  MarkBufferDirtyHint(nbuf, true);
554  else
555  MarkBufferDirtyHint(buf, true);
556  }
557  }
558  }
559 
560  /*
561  * Advance to next tuple to continue checking.
562  */
563  if (offset < maxoff)
564  offset = OffsetNumberNext(offset);
565  else
566  {
567  /* If scankey == hikey we gotta check the next page too */
568  if (P_RIGHTMOST(opaque))
569  break;
570  if (!_bt_isequal(itupdesc, page, P_HIKEY,
571  indnkeyatts, itup_scankey))
572  break;
573  /* Advance to next non-dead page --- there must be one */
574  for (;;)
575  {
576  nblkno = opaque->btpo_next;
577  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
578  page = BufferGetPage(nbuf);
579  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
580  if (!P_IGNORE(opaque))
581  break;
582  if (P_RIGHTMOST(opaque))
583  elog(ERROR, "fell off the end of index \"%s\"",
584  RelationGetRelationName(rel));
585  }
586  maxoff = PageGetMaxOffsetNumber(page);
587  offset = P_FIRSTDATAKEY(opaque);
588  }
589  }
590 
591  /*
592  * If we are doing a recheck then we should have found the tuple we are
593  * checking. Otherwise there's something very wrong --- probably, the
594  * index is on a non-immutable expression.
595  */
596  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
597  ereport(ERROR,
598  (errcode(ERRCODE_INTERNAL_ERROR),
599  errmsg("failed to re-find tuple within index \"%s\"",
600  RelationGetRelationName(rel)),
601  errhint("This may be because of a non-immutable index expression."),
602  errtableconstraint(heapRel,
603  RelationGetRelationName(rel))));
604 
605  if (nbuf != InvalidBuffer)
606  _bt_relbuf(rel, nbuf);
607 
608  return InvalidTransactionId;
609 }
610 
611 
612 /*
613  * _bt_findinsertloc() -- Finds an insert location for a tuple
614  *
615  * If the new key is equal to one or more existing keys, we can
616  * legitimately place it anywhere in the series of equal keys --- in fact,
617  * if the new key is equal to the page's "high key" we can place it on
618  * the next page. If it is equal to the high key, and there's not room
619  * to insert the new tuple on the current page without splitting, then
620  * we can move right hoping to find more free space and avoid a split.
621  * (We should not move right indefinitely, however, since that leads to
622  * O(N^2) insertion behavior in the presence of many equal keys.)
623  * Once we have chosen the page to put the key on, we'll insert it before
624  * any existing equal keys because of the way _bt_binsrch() works.
625  *
626  * If there's not enough room on the page, we try to make room by
627  * removing any LP_DEAD tuples.
628  *
629  * On entry, *bufptr and *offsetptr point to the first legal position
630  * where the new tuple could be inserted. The caller should hold an
631  * exclusive lock on *bufptr. *offsetptr can also be set to
632  * InvalidOffsetNumber, in which case the function will search for the
633  * right location within the page if needed. On exit, they point to the
634  * chosen insert location. If _bt_findinsertloc decides to move right,
635  * the lock and pin on the original page will be released and the new
636  * page returned to the caller is exclusively locked instead.
637  *
638  * newtup is the new tuple we're inserting, and scankey is an insertion
639  * type scan key for it.
640  */
641 static void
642 _bt_findinsertloc(Relation rel,
643  Buffer *bufptr,
644  OffsetNumber *offsetptr,
645  int keysz,
646  ScanKey scankey,
647  IndexTuple newtup,
648  BTStack stack,
649  Relation heapRel)
650 {
651  Buffer buf = *bufptr;
652  Page page = BufferGetPage(buf);
653  Size itemsz;
654  BTPageOpaque lpageop;
655  bool movedright,
656  vacuumed;
657  OffsetNumber newitemoff;
658  OffsetNumber firstlegaloff = *offsetptr;
659 
660  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
661 
662  itemsz = IndexTupleSize(newtup);
663  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
664  * need to be consistent */
665 
666  /*
667  * Check whether the item can fit on a btree page at all. (Eventually, we
668  * ought to try to apply TOAST methods if not.) We actually need to be
669  * able to fit three items on every page, so restrict any one item to 1/3
670  * the per-page available space. Note that at this point, itemsz doesn't
671  * include the ItemId.
672  *
673  * NOTE: if you change this, see also the similar code in _bt_buildadd().
674  */
675  if (itemsz > BTMaxItemSize(page))
676  ereport(ERROR,
677  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
678  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
679  itemsz, BTMaxItemSize(page),
680  RelationGetRelationName(rel)),
681  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
682  "Consider a function index of an MD5 hash of the value, "
683  "or use full text indexing."),
684  errtableconstraint(heapRel,
685  RelationGetRelationName(rel))));
686 
687  /*----------
688  * If we will need to split the page to put the item on this page,
689  * check whether we can put the tuple somewhere to the right,
690  * instead. Keep scanning right until we
691  * (a) find a page with enough free space,
692  * (b) reach the last page where the tuple can legally go, or
693  * (c) get tired of searching.
694  * (c) is not flippant; it is important because if there are many
695  * pages' worth of equal keys, it's better to split one of the early
696  * pages than to scan all the way to the end of the run of equal keys
697  * on every insert. We implement "get tired" as a random choice,
698  * since stopping after scanning a fixed number of pages wouldn't work
699  * well (we'd never reach the right-hand side of previously split
700  * pages). Currently the probability of moving right is set at 0.99,
701  * which may seem too high to change the behavior much, but it does an
702  * excellent job of preventing O(N^2) behavior with many equal keys.
703  *----------
704  */
705  movedright = false;
706  vacuumed = false;
707  while (PageGetFreeSpace(page) < itemsz)
708  {
709  Buffer rbuf;
710  BlockNumber rblkno;
711 
712  /*
713  * before considering moving right, see if we can obtain enough space
714  * by erasing LP_DEAD items
715  */
716  if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
717  {
718  _bt_vacuum_one_page(rel, buf, heapRel);
719 
720  /*
721  * remember that we vacuumed this page, because that makes the
722  * hint supplied by the caller invalid
723  */
724  vacuumed = true;
725 
726  if (PageGetFreeSpace(page) >= itemsz)
727  break; /* OK, now we have enough space */
728  }
729 
730  /*
731  * nope, so check conditions (b) and (c) enumerated above
732  */
733  if (P_RIGHTMOST(lpageop) ||
734  _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
735  random() <= (MAX_RANDOM_VALUE / 100))
736  break;
737 
738  /*
739  * step right to next non-dead page
740  *
741  * must write-lock that page before releasing write lock on current
742  * page; else someone else's _bt_check_unique scan could fail to see
743  * our insertion. write locks on intermediate dead pages won't do
744  * because we don't know when they will get de-linked from the tree.
745  */
746  rbuf = InvalidBuffer;
747 
748  rblkno = lpageop->btpo_next;
749  for (;;)
750  {
751  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
752  page = BufferGetPage(rbuf);
753  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
754 
755  /*
756  * If this page was incompletely split, finish the split now. We
757  * do this while holding a lock on the left sibling, which is not
758  * good because finishing the split could be a fairly lengthy
759  * operation. But this should happen very seldom.
760  */
761  if (P_INCOMPLETE_SPLIT(lpageop))
762  {
763  _bt_finish_split(rel, rbuf, stack);
764  rbuf = InvalidBuffer;
765  continue;
766  }
767 
768  if (!P_IGNORE(lpageop))
769  break;
770  if (P_RIGHTMOST(lpageop))
771  elog(ERROR, "fell off the end of index \"%s\"",
772  RelationGetRelationName(rel));
773 
774  rblkno = lpageop->btpo_next;
775  }
776  _bt_relbuf(rel, buf);
777  buf = rbuf;
778  movedright = true;
779  vacuumed = false;
780  }
781 
782  /*
783  * Now we are on the right page, so find the insert position. If we moved
784  * right at all, we know we should insert at the start of the page. If we
785  * didn't move right, we can use the firstlegaloff hint if the caller
786  * supplied one, unless we vacuumed the page which might have moved tuples
787  * around making the hint invalid. If we didn't move right or can't use
788  * the hint, find the position by searching.
789  */
790  if (movedright)
791  newitemoff = P_FIRSTDATAKEY(lpageop);
792  else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
793  newitemoff = firstlegaloff;
794  else
795  newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
796 
797  *bufptr = buf;
798  *offsetptr = newitemoff;
799 }
800 
801 /*----------
802  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
803  *
804  * This recursive procedure does the following things:
805  *
806  * + if necessary, splits the target page (making sure that the
807  * split is equitable as far as post-insert free space goes).
808  * + inserts the tuple.
809  * + if the page was split, pops the parent stack, and finds the
810  * right place to insert the new child pointer (by walking
811  * right using information stored in the parent stack).
812  * + invokes itself with the appropriate tuple for the right
813  * child page on the parent.
814  * + updates the metapage if a true root or fast root is split.
815  *
816  * On entry, we must have the correct buffer in which to do the
817  * insertion, and the buffer must be pinned and write-locked. On return,
818  * we will have dropped both the pin and the lock on the buffer.
819  *
820  * This routine only performs retail tuple insertions. 'itup' should
821  * always be either a non-highkey leaf item, or a downlink (new high
822  * key items are created indirectly, when a page is split). When
823  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
824  * we're inserting the downlink for. This function will clear the
825  * INCOMPLETE_SPLIT flag on it, and release the buffer.
826  *
827  * The locking interactions in this code are critical. You should
828  * grok Lehman and Yao's paper before making any changes. In addition,
829  * you need to understand how we disambiguate duplicate keys in this
830  * implementation, in order to be able to find our location using
831  * L&Y "move right" operations.
832  *----------
833  */
834 static void
835 _bt_insertonpg(Relation rel,
836  Buffer buf,
837  Buffer cbuf,
838  BTStack stack,
839  IndexTuple itup,
840  OffsetNumber newitemoff,
841  bool split_only_page)
842 {
843  Page page;
844  BTPageOpaque lpageop;
845  OffsetNumber firstright = InvalidOffsetNumber;
846  Size itemsz;
847 
848  page = BufferGetPage(buf);
849  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
850 
851  /* child buffer must be given iff inserting on an internal page */
852  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
853  /* tuple must have appropriate number of attributes */
854  Assert(!P_ISLEAF(lpageop) ||
855  BTreeTupleGetNAtts(itup, rel) ==
856  IndexRelationGetNumberOfAttributes(rel));
857  Assert(P_ISLEAF(lpageop) ||
858  BTreeTupleGetNAtts(itup, rel) ==
859  IndexRelationGetNumberOfKeyAttributes(rel));
860 
861  /* The caller should've finished any incomplete splits already. */
862  if (P_INCOMPLETE_SPLIT(lpageop))
863  elog(ERROR, "cannot insert to incompletely split page %u",
864  BufferGetBlockNumber(buf));
865 
866  itemsz = IndexTupleSize(itup);
867  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
868  * need to be consistent */
869 
870  /*
871  * Do we need to split the page to fit the item on it?
872  *
873  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
874  * so this comparison is correct even though we appear to be accounting
875  * only for the item and not for its line pointer.
876  */
877  if (PageGetFreeSpace(page) < itemsz)
878  {
879  bool is_root = P_ISROOT(lpageop);
880  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
881  bool newitemonleft;
882  Buffer rbuf;
883 
884  /*
885  * If we're here then a pagesplit is needed. We should never reach here
886  * if we're using the fastpath since we should have checked for all the
887  * required conditions, including the fact that this page has enough
888  * freespace. Note that this routine can in theory deal with the
889  * situation where a NULL stack pointer is passed (that's what would
890  * happen if the fastpath is taken), like it does during crash
891  * recovery. But that path is much slower, defeating the very purpose
892  * of the optimization. The following assertion should protect us from
893  * any future code changes that invalidate those assumptions.
894  *
895  * Note that whenever we fail to take the fastpath, we clear the
896  * cached block. Checking for a valid cached block at this point is
897  * enough to decide whether we're in a fastpath or not.
898  */
899  Assert(!(P_ISLEAF(lpageop) &&
900  BlockNumberIsValid(RelationGetTargetBlock(rel))));
901 
902  /* Choose the split point */
903  firstright = _bt_findsplitloc(rel, page,
904  newitemoff, itemsz,
905  &newitemonleft);
906 
907  /* split the buffer into left and right halves */
908  rbuf = _bt_split(rel, buf, cbuf, firstright,
909  newitemoff, itemsz, itup, newitemonleft);
910  PredicateLockPageSplit(rel,
911  BufferGetBlockNumber(buf),
912  BufferGetBlockNumber(rbuf));
913 
914  /*----------
915  * By here,
916  *
917  * + our target page has been split;
918  * + the original tuple has been inserted;
919  * + we have write locks on both the old (left half)
920  * and new (right half) buffers, after the split; and
921  * + we know the key we want to insert into the parent
922  * (it's the "high key" on the left child page).
923  *
924  * We're ready to do the parent insertion. We need to hold onto the
925  * locks for the child pages until we locate the parent, but we can
926  * release them before doing the actual insertion (see Lehman and Yao
927  * for the reasoning).
928  *----------
929  */
930  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
931  }
932  else
933  {
934  Buffer metabuf = InvalidBuffer;
935  Page metapg = NULL;
936  BTMetaPageData *metad = NULL;
937  OffsetNumber itup_off;
938  BlockNumber itup_blkno;
939  BlockNumber cachedBlock = InvalidBlockNumber;
940 
941  itup_off = newitemoff;
942  itup_blkno = BufferGetBlockNumber(buf);
943 
944  /*
945  * If we are doing this insert because we split a page that was the
946  * only one on its tree level, but was not the root, it may have been
947  * the "fast root". We need to ensure that the fast root link points
948  * at or above the current page. We can safely acquire a lock on the
949  * metapage here --- see comments for _bt_newroot().
950  */
951  if (split_only_page)
952  {
953  Assert(!P_ISLEAF(lpageop));
954 
955  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
956  metapg = BufferGetPage(metabuf);
957  metad = BTPageGetMeta(metapg);
958 
959  if (metad->btm_fastlevel >= lpageop->btpo.level)
960  {
961  /* no update wanted */
962  _bt_relbuf(rel, metabuf);
963  metabuf = InvalidBuffer;
964  }
965  }
966 
967  /*
968  * Every internal page should have exactly one negative infinity item
969  * at all times. Only _bt_split() and _bt_newroot() should add items
970  * that become negative infinity items through truncation, since
971  * they're the only routines that allocate new internal pages. Do not
972  * allow a retail insertion of a new item at the negative infinity
973  * offset.
974  */
975  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
976  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
977  itup_blkno, RelationGetRelationName(rel));
978 
979  /* Do the update. No ereport(ERROR) until changes are logged */
980  START_CRIT_SECTION();
981 
982  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
983  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
984  itup_blkno, RelationGetRelationName(rel));
985 
986  MarkBufferDirty(buf);
987 
988  if (BufferIsValid(metabuf))
989  {
990  /* upgrade meta-page if needed */
991  if (metad->btm_version < BTREE_VERSION)
992  _bt_upgrademetapage(metapg);
993  metad->btm_fastroot = itup_blkno;
994  metad->btm_fastlevel = lpageop->btpo.level;
995  MarkBufferDirty(metabuf);
996  }
997 
998  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
999  if (BufferIsValid(cbuf))
1000  {
1001  Page cpage = BufferGetPage(cbuf);
1002  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1003 
1004  Assert(P_INCOMPLETE_SPLIT(cpageop));
1005  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1006  MarkBufferDirty(cbuf);
1007  }
1008 
1009  /*
1010  * Cache the block information if we just inserted into the rightmost
1011  * leaf page of the index and it's not the root page. For a very small
1012  * index where the root is also the leaf, there is no point in trying any
1013  * optimization.
1014  */
1015  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1016  cachedBlock = BufferGetBlockNumber(buf);
1017 
1018  /* XLOG stuff */
1019  if (RelationNeedsWAL(rel))
1020  {
1021  xl_btree_insert xlrec;
1022  xl_btree_metadata xlmeta;
1023  uint8 xlinfo;
1024  XLogRecPtr recptr;
1025 
1026  xlrec.offnum = itup_off;
1027 
1028  XLogBeginInsert();
1029  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1030 
1031  if (P_ISLEAF(lpageop))
1032  xlinfo = XLOG_BTREE_INSERT_LEAF;
1033  else
1034  {
1035  /*
1036  * Register the left child whose INCOMPLETE_SPLIT flag was
1037  * cleared.
1038  */
1039  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1040 
1041  xlinfo = XLOG_BTREE_INSERT_UPPER;
1042  }
1043 
1044  if (BufferIsValid(metabuf))
1045  {
1046  xlmeta.root = metad->btm_root;
1047  xlmeta.level = metad->btm_level;
1048  xlmeta.fastroot = metad->btm_fastroot;
1049  xlmeta.fastlevel = metad->btm_fastlevel;
1050  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1051  xlmeta.last_cleanup_num_heap_tuples =
1052  metad->btm_last_cleanup_num_heap_tuples;
1053 
1054  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
1055  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1056 
1057  xlinfo = XLOG_BTREE_INSERT_META;
1058  }
1059 
1060  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1061  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1062 
1063  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1064 
1065  if (BufferIsValid(metabuf))
1066  {
1067  PageSetLSN(metapg, recptr);
1068  }
1069  if (BufferIsValid(cbuf))
1070  {
1071  PageSetLSN(BufferGetPage(cbuf), recptr);
1072  }
1073 
1074  PageSetLSN(page, recptr);
1075  }
1076 
1077  END_CRIT_SECTION();
1078 
1079  /* release buffers */
1080  if (BufferIsValid(metabuf))
1081  _bt_relbuf(rel, metabuf);
1082  if (BufferIsValid(cbuf))
1083  _bt_relbuf(rel, cbuf);
1084  _bt_relbuf(rel, buf);
1085 
1086  /*
1087  * If we decided to cache the insertion target block, then set it now.
1088  * But before that, check for the height of the tree and don't go for
1089  * the optimization for small indexes. We defer that check to this
1090  * point to ensure that we don't call _bt_getrootheight while holding
1091  * lock on any other block.
1092  *
1093  * We do this after dropping locks on all buffers. So the information
1094  * about whether the insertion block is still the rightmost block or
1095  * not may have changed in between. But we will deal with that during
1096  * next insert operation. No special care is required while setting it.
1097  */
1098  if (BlockNumberIsValid(cachedBlock) &&
1099  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1100  RelationSetTargetBlock(rel, cachedBlock);
1101  }
1102 }
1103 
1104 /*
1105  * _bt_split() -- split a page in the btree.
1106  *
1107  * On entry, buf is the page to split, and is pinned and write-locked.
1108  * firstright is the item index of the first item to be moved to the
1109  * new right page. newitemoff etc. tell us about the new item that
1110  * must be inserted along with the data from the old page.
1111  *
1112  * When splitting a non-leaf page, 'cbuf' is the left-sibling of the
1113  * page we're inserting the downlink for. This function will clear the
1114  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1115  *
1116  * Returns the new right sibling of buf, pinned and write-locked.
1117  * The pin and lock on buf are maintained.
1118  */
1119 static Buffer
1120 _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1121  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1122  bool newitemonleft)
1123 {
1124  Buffer rbuf;
1125  Page origpage;
1126  Page leftpage,
1127  rightpage;
1128  BlockNumber origpagenumber,
1129  rightpagenumber;
1130  BTPageOpaque ropaque,
1131  lopaque,
1132  oopaque;
1133  Buffer sbuf = InvalidBuffer;
1134  Page spage = NULL;
1135  BTPageOpaque sopaque = NULL;
1136  Size itemsz;
1137  ItemId itemid;
1138  IndexTuple item;
1139  OffsetNumber leftoff,
1140  rightoff;
1141  OffsetNumber maxoff;
1142  OffsetNumber i;
1143  bool isleaf;
1144  IndexTuple lefthikey;
1145  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1146  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1147 
1148  /* Acquire a new page to split into */
1149  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1150 
1151  /*
1152  * origpage is the original page to be split. leftpage is a temporary
1153  * buffer that receives the left-sibling data, which will be copied back
1154  * into origpage on success. rightpage is the new page that receives the
1155  * right-sibling data. If we fail before reaching the critical section,
1156  * origpage hasn't been modified and leftpage is only workspace. In
1157  * principle we shouldn't need to worry about rightpage either, because it
1158  * hasn't been linked into the btree page structure; but to avoid leaving
1159  * possibly-confusing junk behind, we are careful to rewrite rightpage as
1160  * zeroes before throwing any error.
1161  */
1162  origpage = BufferGetPage(buf);
1163  leftpage = PageGetTempPage(origpage);
1164  rightpage = BufferGetPage(rbuf);
1165 
1166  origpagenumber = BufferGetBlockNumber(buf);
1167  rightpagenumber = BufferGetBlockNumber(rbuf);
1168 
1169  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1170  /* rightpage was already initialized by _bt_getbuf */
1171 
1172  /*
1173  * Copy the original page's LSN into leftpage, which will become the
1174  * updated version of the page. We need this because XLogInsert will
1175  * examine the LSN and possibly dump it in a page image.
1176  */
1177  PageSetLSN(leftpage, PageGetLSN(origpage));
1178 
1179  /* init btree private data */
1180  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1181  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1182  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1183 
1184  isleaf = P_ISLEAF(oopaque);
1185 
1186  /* if we're splitting this page, it won't be the root when we're done */
1187  /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1188  lopaque->btpo_flags = oopaque->btpo_flags;
1189  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1190  ropaque->btpo_flags = lopaque->btpo_flags;
1191  /* set flag in left page indicating that the right page has no downlink */
1192  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1193  lopaque->btpo_prev = oopaque->btpo_prev;
1194  lopaque->btpo_next = rightpagenumber;
1195  ropaque->btpo_prev = origpagenumber;
1196  ropaque->btpo_next = oopaque->btpo_next;
1197  lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
1198  /* Since we already have write-lock on both pages, ok to read cycleid */
1199  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1200  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1201 
1202  /*
1203  * If the page we're splitting is not the rightmost page at its level in
1204  * the tree, then the first entry on the page is the high key for the
1205  * page. We need to copy that to the right half. Otherwise (meaning the
1206  * rightmost page case), all the items on the right half will be user
1207  * data.
1208  */
1209  rightoff = P_HIKEY;
1210 
1211  if (!P_RIGHTMOST(oopaque))
1212  {
1213  itemid = PageGetItemId(origpage, P_HIKEY);
1214  itemsz = ItemIdGetLength(itemid);
1215  item = (IndexTuple) PageGetItem(origpage, itemid);
1216  Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
1217  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1218  false, false) == InvalidOffsetNumber)
1219  {
1220  memset(rightpage, 0, BufferGetPageSize(rbuf));
1221  elog(ERROR, "failed to add hikey to the right sibling"
1222  " while splitting block %u of index \"%s\"",
1223  origpagenumber, RelationGetRelationName(rel));
1224  }
1225  rightoff = OffsetNumberNext(rightoff);
1226  }
1227 
1228  /*
1229  * The "high key" for the new left page will be the first key that's going
1230  * to go into the new right page. This might be either the existing data
1231  * item at position firstright, or the incoming tuple.
1232  */
1233  leftoff = P_HIKEY;
1234  if (!newitemonleft && newitemoff == firstright)
1235  {
1236  /* incoming tuple will become first on right page */
1237  itemsz = newitemsz;
1238  item = newitem;
1239  }
1240  else
1241  {
1242  /* existing item at firstright will become first on right page */
1243  itemid = PageGetItemId(origpage, firstright);
1244  itemsz = ItemIdGetLength(itemid);
1245  item = (IndexTuple) PageGetItem(origpage, itemid);
1246  }
1247 
1248  /*
1249  * Truncate non-key (INCLUDE) attributes of the high key item before
1250  * inserting it on the left page. This only needs to happen at the leaf
1251  * level, since in general all pivot tuple values originate from leaf
1252  * level high keys. This isn't just about avoiding unnecessary work,
1253  * though; truncating unneeded key attributes (more aggressive suffix
1254  * truncation) can only be performed at the leaf level anyway. This is
1255  * because a pivot tuple in a grandparent page must guide a search not
1256  * only to the correct parent page, but also to the correct leaf page.
1257  */
1258  if (indnatts != indnkeyatts && isleaf)
1259  {
1260  lefthikey = _bt_nonkey_truncate(rel, item);
1261  itemsz = IndexTupleSize(lefthikey);
1262  itemsz = MAXALIGN(itemsz);
1263  }
1264  else
1265  lefthikey = item;
1266 
1267  Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
1268  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1269  false, false) == InvalidOffsetNumber)
1270  {
1271  memset(rightpage, 0, BufferGetPageSize(rbuf));
1272  elog(ERROR, "failed to add hikey to the left sibling"
1273  " while splitting block %u of index \"%s\"",
1274  origpagenumber, RelationGetRelationName(rel));
1275  }
1276  leftoff = OffsetNumberNext(leftoff);
1277  /* be tidy */
1278  if (lefthikey != item)
1279  pfree(lefthikey);
1280 
1281  /*
1282  * Now transfer all the data items to the appropriate page.
1283  *
1284  * Note: we *must* insert at least the right page's items in item-number
1285  * order, for the benefit of _bt_restore_page().
1286  */
1287  maxoff = PageGetMaxOffsetNumber(origpage);
1288 
1289  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1290  {
1291  itemid = PageGetItemId(origpage, i);
1292  itemsz = ItemIdGetLength(itemid);
1293  item = (IndexTuple) PageGetItem(origpage, itemid);
1294 
1295  /* does new item belong before this one? */
1296  if (i == newitemoff)
1297  {
1298  if (newitemonleft)
1299  {
1300  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1301  {
1302  memset(rightpage, 0, BufferGetPageSize(rbuf));
1303  elog(ERROR, "failed to add new item to the left sibling"
1304  " while splitting block %u of index \"%s\"",
1305  origpagenumber, RelationGetRelationName(rel));
1306  }
1307  leftoff = OffsetNumberNext(leftoff);
1308  }
1309  else
1310  {
1311  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1312  {
1313  memset(rightpage, 0, BufferGetPageSize(rbuf));
1314  elog(ERROR, "failed to add new item to the right sibling"
1315  " while splitting block %u of index \"%s\"",
1316  origpagenumber, RelationGetRelationName(rel));
1317  }
1318  rightoff = OffsetNumberNext(rightoff);
1319  }
1320  }
1321 
1322  /* decide which page to put it on */
1323  if (i < firstright)
1324  {
1325  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1326  {
1327  memset(rightpage, 0, BufferGetPageSize(rbuf));
1328  elog(ERROR, "failed to add old item to the left sibling"
1329  " while splitting block %u of index \"%s\"",
1330  origpagenumber, RelationGetRelationName(rel));
1331  }
1332  leftoff = OffsetNumberNext(leftoff);
1333  }
1334  else
1335  {
1336  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1337  {
1338  memset(rightpage, 0, BufferGetPageSize(rbuf));
1339  elog(ERROR, "failed to add old item to the right sibling"
1340  " while splitting block %u of index \"%s\"",
1341  origpagenumber, RelationGetRelationName(rel));
1342  }
1343  rightoff = OffsetNumberNext(rightoff);
1344  }
1345  }
1346 
1347  /* cope with possibility that newitem goes at the end */
1348  if (i <= newitemoff)
1349  {
1350  /*
1351  * Can't have newitemonleft here; that would imply we were told to put
1352  * *everything* on the left page, which cannot fit (if it could, we'd
1353  * not be splitting the page).
1354  */
1355  Assert(!newitemonleft);
1356  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1357  {
1358  memset(rightpage, 0, BufferGetPageSize(rbuf));
1359  elog(ERROR, "failed to add new item to the right sibling"
1360  " while splitting block %u of index \"%s\"",
1361  origpagenumber, RelationGetRelationName(rel));
1362  }
1363  rightoff = OffsetNumberNext(rightoff);
1364  }
1365 
1366  /*
1367  * We have to grab the right sibling (if any) and fix the prev pointer
1368  * there. We are guaranteed that this is deadlock-free since no other
1369  * writer will be holding a lock on that page and trying to move left, and
1370  * all readers release locks on a page before trying to fetch its
1371  * neighbors.
1372  */
1373 
1374  if (!P_RIGHTMOST(oopaque))
1375  {
1376  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1377  spage = BufferGetPage(sbuf);
1378  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1379  if (sopaque->btpo_prev != origpagenumber)
1380  {
1381  memset(rightpage, 0, BufferGetPageSize(rbuf));
1382  elog(ERROR, "right sibling's left-link doesn't match: "
1383  "block %u links to %u instead of expected %u in index \"%s\"",
1384  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1385  RelationGetRelationName(rel));
1386  }
1387 
1388  /*
1389  * Check to see if we can set the SPLIT_END flag in the right-hand
1390  * split page; this can save some I/O for vacuum since it need not
1391  * proceed to the right sibling. We can set the flag if the right
1392  * sibling has a different cycleid: that means it could not be part of
1393  * a group of pages that were all split off from the same ancestor
1394  * page. If you're confused, imagine that page A splits to A B and
1395  * then again, yielding A C B, while vacuum is in progress. Tuples
1396  * originally in A could now be in either B or C, hence vacuum must
1397  * examine both pages. But if D, our right sibling, has a different
1398  * cycleid then it could not contain any tuples that were in A when
1399  * the vacuum started.
1400  */
1401  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1402  ropaque->btpo_flags |= BTP_SPLIT_END;
1403  }
1404 
1405  /*
1406  * Right sibling is locked, new siblings are prepared, but original page
1407  * is not updated yet.
1408  *
1409  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1410  * not starting the critical section till here because we haven't been
1411  * scribbling on the original page yet; see comments above.
1412  */
1413  START_CRIT_SECTION();
1414 
1415  /*
1416  * By here, the original data page has been split into two new halves, and
1417  * these are correct. The algorithm requires that the left page never
1418  * move during a split, so we copy the new left page back on top of the
1419  * original. Note that this is not a waste of time, since we also require
1420  * (in the page management code) that the center of a page always be
1421  * clean, and the most efficient way to guarantee this is just to compact
1422  * the data by reinserting it into a new left page. (XXX the latter
1423  * comment is probably obsolete; but in any case it's good to not scribble
1424  * on the original page until we enter the critical section.)
1425  *
1426  * We need to do this before writing the WAL record, so that XLogInsert
1427  * can WAL log an image of the page if necessary.
1428  */
1429  PageRestoreTempPage(leftpage, origpage);
1430  /* leftpage, lopaque must not be used below here */
1431 
1432  MarkBufferDirty(buf);
1433  MarkBufferDirty(rbuf);
1434 
1435  if (!P_RIGHTMOST(ropaque))
1436  {
1437  sopaque->btpo_prev = rightpagenumber;
1438  MarkBufferDirty(sbuf);
1439  }
1440 
1441  /*
1442  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1443  * a split.
1444  */
1445  if (!isleaf)
1446  {
1447  Page cpage = BufferGetPage(cbuf);
1448  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1449 
1450  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1451  MarkBufferDirty(cbuf);
1452  }
1453 
1454  /* XLOG stuff */
1455  if (RelationNeedsWAL(rel))
1456  {
1457  xl_btree_split xlrec;
1458  uint8 xlinfo;
1459  XLogRecPtr recptr;
1460  bool loglhikey = false;
1461 
1462  xlrec.level = ropaque->btpo.level;
1463  xlrec.firstright = firstright;
1464  xlrec.newitemoff = newitemoff;
1465 
1466  XLogBeginInsert();
1467  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1468 
1469  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1470  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1471  /* Log the right sibling, because we've changed its prev-pointer. */
1472  if (!P_RIGHTMOST(ropaque))
1473  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1474  if (BufferIsValid(cbuf))
1475  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1476 
1477  /*
1478  * Log the new item, if it was inserted on the left page. (If it was
1479  * put on the right page, we don't need to explicitly WAL log it
1480  * because it's included with all the other items on the right page.)
1481  * Show the new item as belonging to the left page buffer, so that it
1482  * is not stored if XLogInsert decides it needs a full-page image of
1483  * the left page. We store the offset anyway, though, to support
1484  * archive compression of these records.
1485  */
1486  if (newitemonleft)
1487  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1488 
1489  /* Log left page */
1490  if (!isleaf || indnatts != indnkeyatts)
1491  {
1492  /*
1493  * We must also log the left page's high key. There are two
1494  * reasons for that: right page's leftmost key is suppressed on
1495  * non-leaf levels and in covering indexes included columns are
1496  * truncated from high keys. Show it as belonging to the left
1497  * page buffer, so that it is not stored if XLogInsert decides it
1498  * needs a full-page image of the left page.
1499  */
1500  itemid = PageGetItemId(origpage, P_HIKEY);
1501  item = (IndexTuple) PageGetItem(origpage, itemid);
1502  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1503  loglhikey = true;
1504  }
1505 
1506  /*
1507  * Log the contents of the right page in the format understood by
1508  * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1509  * because we're going to recreate the whole page anyway, so it should
1510  * never be stored by XLogInsert.
1511  *
1512  * Direct access to page is not good but faster - we should implement
1513  * some new func in page API. Note we only store the tuples
1514  * themselves, knowing that they were inserted in item-number order
1515  * and so the item pointers can be reconstructed. See comments for
1516  * _bt_restore_page().
1517  */
1518  XLogRegisterBufData(1,
1519  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1520  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1521 
1522  xlinfo = newitemonleft ?
1523  (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) :
1524  (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R);
1525  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1526 
1527  PageSetLSN(origpage, recptr);
1528  PageSetLSN(rightpage, recptr);
1529  if (!P_RIGHTMOST(ropaque))
1530  {
1531  PageSetLSN(spage, recptr);
1532  }
1533  if (!isleaf)
1534  {
1535  PageSetLSN(BufferGetPage(cbuf), recptr);
1536  }
1537  }
1538 
1539  END_CRIT_SECTION();
1540 
1541  /* release the old right sibling */
1542  if (!P_RIGHTMOST(ropaque))
1543  _bt_relbuf(rel, sbuf);
1544 
1545  /* release the child */
1546  if (!isleaf)
1547  _bt_relbuf(rel, cbuf);
1548 
1549  /* split's done */
1550  return rbuf;
1551 }
1552 
1553 /*
1554  * _bt_findsplitloc() -- find an appropriate place to split a page.
1555  *
1556  * The idea here is to equalize the free space that will be on each split
1557  * page, *after accounting for the inserted tuple*. (If we fail to account
1558  * for it, we might find ourselves with too little room on the page that
1559  * it needs to go into!)
1560  *
1561  * If the page is the rightmost page on its level, we instead try to arrange
1562  * to leave the left split page fillfactor% full. In this way, when we are
1563  * inserting successively increasing keys (consider sequences, timestamps,
1564  * etc) we will end up with a tree whose pages are about fillfactor% full,
1565  * instead of the 50% full result that we'd get without this special case.
1566  * This is the same as nbtsort.c produces for a newly-created tree. Note
1567  * that leaf and nonleaf pages use different fillfactors.
1568  *
1569  * We are passed the intended insert position of the new tuple, expressed as
1570  * the offsetnumber of the tuple it must go in front of. (This could be
1571  * maxoff+1 if the tuple is to go at the end.)
1572  *
1573  * We return the index of the first existing tuple that should go on the
1574  * righthand page, plus a boolean indicating whether the new tuple goes on
1575  * the left or right page. The bool is necessary to disambiguate the case
1576  * where firstright == newitemoff.
1577  */
1578 static OffsetNumber
1579 _bt_findsplitloc(Relation rel,
1580  Page page,
1581  OffsetNumber newitemoff,
1582  Size newitemsz,
1583  bool *newitemonleft)
1584 {
1585  BTPageOpaque opaque;
1586  OffsetNumber offnum;
1587  OffsetNumber maxoff;
1588  ItemId itemid;
1589  FindSplitData state;
1590  int leftspace,
1591  rightspace,
1592  goodenough,
1593  olddataitemstotal,
1594  olddataitemstoleft;
1595  bool goodenoughfound;
1596 
1597  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1598 
1599  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1600  newitemsz += sizeof(ItemIdData);
1601 
1602  /* Total free space available on a btree page, after fixed overhead */
1603  leftspace = rightspace =
1604  PageGetPageSize(page) - SizeOfPageHeaderData -
1605  MAXALIGN(sizeof(BTPageOpaqueData));
1606 
1607  /* The right page will have the same high key as the old page */
1608  if (!P_RIGHTMOST(opaque))
1609  {
1610  itemid = PageGetItemId(page, P_HIKEY);
1611  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1612  sizeof(ItemIdData));
1613  }
1614 
1615  /* Count up total space in data items without actually scanning 'em */
1616  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
1617 
1618  state.newitemsz = newitemsz;
1619  state.is_leaf = P_ISLEAF(opaque);
1620  state.is_rightmost = P_RIGHTMOST(opaque);
1621  state.have_split = false;
1622  if (state.is_leaf)
1623  state.fillfactor = RelationGetFillFactor(rel,
1624  BTREE_DEFAULT_FILLFACTOR);
1625  else
1626  state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
1627  state.newitemonleft = false; /* these just to keep compiler quiet */
1628  state.firstright = 0;
1629  state.best_delta = 0;
1630  state.leftspace = leftspace;
1631  state.rightspace = rightspace;
1632  state.olddataitemstotal = olddataitemstotal;
1633  state.newitemoff = newitemoff;
1634 
1635  /*
1636  * Finding the best possible split would require checking all the possible
1637  * split points, because of the high-key and left-key special cases.
1638  * That's probably more work than it's worth; instead, stop as soon as we
1639  * find a "good-enough" split, where good-enough is defined as an
1640  * imbalance in free space of no more than pagesize/16 (arbitrary...). This
1641  * should let us stop near the middle on most pages, instead of plowing to
1642  * the end.
1643  */
1644  goodenough = leftspace / 16;
1645 
1646  /*
1647  * Scan through the data items and calculate space usage for a split at
1648  * each possible position.
1649  */
1650  olddataitemstoleft = 0;
1651  goodenoughfound = false;
1652  maxoff = PageGetMaxOffsetNumber(page);
1653 
1654  for (offnum = P_FIRSTDATAKEY(opaque);
1655  offnum <= maxoff;
1656  offnum = OffsetNumberNext(offnum))
1657  {
1658  Size itemsz;
1659 
1660  itemid = PageGetItemId(page, offnum);
1661  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1662 
1663  /*
1664  * Will the new item go to left or right of split?
1665  */
1666  if (offnum > newitemoff)
1667  _bt_checksplitloc(&state, offnum, true,
1668  olddataitemstoleft, itemsz);
1669 
1670  else if (offnum < newitemoff)
1671  _bt_checksplitloc(&state, offnum, false,
1672  olddataitemstoleft, itemsz);
1673  else
1674  {
1675  /* need to try it both ways! */
1676  _bt_checksplitloc(&state, offnum, true,
1677  olddataitemstoleft, itemsz);
1678 
1679  _bt_checksplitloc(&state, offnum, false,
1680  olddataitemstoleft, itemsz);
1681  }
1682 
1683  /* Abort scan once we find a good-enough choice */
1684  if (state.have_split && state.best_delta <= goodenough)
1685  {
1686  goodenoughfound = true;
1687  break;
1688  }
1689 
1690  olddataitemstoleft += itemsz;
1691  }
1692 
1693  /*
1694  * If the new item goes as the last item, check for splitting so that all
1695  * the old items go to the left page and the new item goes to the right
1696  * page.
1697  */
1698  if (newitemoff > maxoff && !goodenoughfound)
1699  _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
1700 
1701  /*
1702  * I believe it is not possible to fail to find a feasible split, but just
1703  * in case ...
1704  */
1705  if (!state.have_split)
1706  elog(ERROR, "could not find a feasible split point for index \"%s\"",
1707  RelationGetRelationName(rel));
1708 
1709  *newitemonleft = state.newitemonleft;
1710  return state.firstright;
1711 }
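
The good-enough cutoff can be made concrete with a small standalone sketch. It is not PostgreSQL code: the 8000-byte usable space and the uniform 40-byte items are invented, but with those numbers the loop stops after 84 of 180 items, close to the midpoint, as the comment above predicts.

/* Standalone illustration (not nbtinsert.c code): with a made-up usable
 * space of 8000 bytes per half and uniform 40-byte items, the scan stops
 * once the free-space imbalance falls below usable/16. */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int		usable = 8000;				/* invented free space per half */
	int		goodenough = usable / 16;	/* the pagesize/16 rule */
	int		itemsz = 40;				/* pretend every item is 40 bytes */
	int		nitems = 180;
	int		total = nitems * itemsz;

	for (int tosplit = 1; tosplit < nitems; tosplit++)
	{
		int		leftfree = usable - tosplit * itemsz;
		int		rightfree = usable - (total - tosplit * itemsz);
		int		delta = abs(leftfree - rightfree);

		if (leftfree >= 0 && rightfree >= 0 && delta <= goodenough)
		{
			printf("stop after %d of %d items: delta %d <= %d\n",
				   tosplit, nitems, delta, goodenough);
			break;
		}
	}
	return 0;
}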
1712 
1713 /*
1714  * Subroutine to analyze a particular possible split choice (ie, firstright
1715  * and newitemonleft settings), and record the best split so far in *state.
1716  *
1717  * firstoldonright is the offset of the first item on the original page
1718  * that goes to the right page, and firstoldonrightsz is the size of that
1719  * tuple. firstoldonright can be > max offset, which means that all the old
1720  * items go to the left page and only the new item goes to the right page.
1721  * In that case, firstoldonrightsz is not used.
1722  *
1723  * olddataitemstoleft is the total size of all old items to the left of
1724  * firstoldonright.
1725  */
1726 static void
1727 _bt_checksplitloc(FindSplitData *state,
1728  OffsetNumber firstoldonright,
1729  bool newitemonleft,
1730  int olddataitemstoleft,
1731  Size firstoldonrightsz)
1732 {
1733  int leftfree,
1734  rightfree;
1735  Size firstrightitemsz;
1736  bool newitemisfirstonright;
1737 
1738  /* Is the new item going to be the first item on the right page? */
1739  newitemisfirstonright = (firstoldonright == state->newitemoff
1740  && !newitemonleft);
1741 
1742  if (newitemisfirstonright)
1743  firstrightitemsz = state->newitemsz;
1744  else
1745  firstrightitemsz = firstoldonrightsz;
1746 
1747  /* Account for all the old tuples */
1748  leftfree = state->leftspace - olddataitemstoleft;
1749  rightfree = state->rightspace -
1750  (state->olddataitemstotal - olddataitemstoleft);
1751 
1752  /*
1753  * The first item on the right page becomes the high key of the left page;
1754  * therefore it counts against left space as well as right space. When
1755  * the index has included attributes, those attributes are truncated from
1756  * the left page's high key, leaving that page with slightly more free
1757  * space. However, that shouldn't affect our ability to find a valid
1758  * split location, because a valid split location must exist even without
1759  * high key truncation.
1760  */
1761  leftfree -= firstrightitemsz;
1762 
1763  /* account for the new item */
1764  if (newitemonleft)
1765  leftfree -= (int) state->newitemsz;
1766  else
1767  rightfree -= (int) state->newitemsz;
1768 
1769  /*
1770  * If we are not on the leaf level, we will be able to discard the key
1771  * data from the first item that winds up on the right page.
1772  */
1773  if (!state->is_leaf)
1774  rightfree += (int) firstrightitemsz -
1775  (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
1776 
1777  /*
1778  * If feasible split point, remember best delta.
1779  */
1780  if (leftfree >= 0 && rightfree >= 0)
1781  {
1782  int delta;
1783 
1784  if (state->is_rightmost)
1785  {
1786  /*
1787  * If splitting a rightmost page, try to put (100-fillfactor)% of
1788  * free space on left page. See comments for _bt_findsplitloc.
1789  */
1790  delta = (state->fillfactor * leftfree)
1791  - ((100 - state->fillfactor) * rightfree);
1792  }
1793  else
1794  {
1795  /* Otherwise, aim for equal free space on both sides */
1796  delta = leftfree - rightfree;
1797  }
1798 
1799  if (delta < 0)
1800  delta = -delta;
1801  if (!state->have_split || delta < state->best_delta)
1802  {
1803  state->have_split = true;
1804  state->newitemonleft = newitemonleft;
1805  state->firstright = firstoldonright;
1806  state->best_delta = delta;
1807  }
1808  }
1809 }
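
A worked example of the rightmost-page rule may help. The sketch below is standalone and not PostgreSQL code; the free-space pairs are invented, and 90 is used as the leaf fillfactor (the default). The weighted delta is smallest when the left page keeps only (100 - fillfactor)% of the combined free space, so the third pair (800 vs. 7200) scores a perfect zero.

/* Standalone illustration (not nbtinsert.c code): the rightmost-page
 * weighting with a leaf fillfactor of 90.  delta reaches zero when only
 * 10% of the combined free space stays on the left page. */
#include <stdio.h>
#include <stdlib.h>

static int
rightmost_delta(int fillfactor, int leftfree, int rightfree)
{
	return abs(fillfactor * leftfree - (100 - fillfactor) * rightfree);
}

int
main(void)
{
	int		pairs[][2] = {{4000, 4000}, {2000, 6000}, {800, 7200}};

	for (int i = 0; i < 3; i++)
		printf("leftfree=%d rightfree=%d delta=%d\n",
			   pairs[i][0], pairs[i][1],
			   rightmost_delta(90, pairs[i][0], pairs[i][1]));
	return 0;
}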
1810 
1811 /*
1812  * _bt_insert_parent() -- Insert downlink into parent after a page split.
1813  *
1814  * On entry, buf and rbuf are the left and right split pages, which we
1815  * still hold write locks on per the L&Y algorithm. We release the
1816  * write locks once we have write lock on the parent page. (Any sooner,
1817  * and it'd be possible for some other process to try to split or delete
1818  * one of these pages, and get confused because it cannot find the downlink.)
1819  *
1820  * stack - stack showing how we got here. May be NULL in cases that don't
1821  * have to be efficient (concurrent ROOT split, WAL recovery)
1822  * is_root - we split the true root
1823  * is_only - we split a page alone on its level (might have been fast root)
1824  */
1825 static void
1826 _bt_insert_parent(Relation rel,
1827  Buffer buf,
1828  Buffer rbuf,
1829  BTStack stack,
1830  bool is_root,
1831  bool is_only)
1832 {
1833  /*
1834  * Here we have to do something Lehman and Yao don't talk about: deal with
1835  * a root split and construction of a new root. If our stack is empty
1836  * then we have just split a node on what had been the root level when we
1837  * descended the tree. If it was still the root then we perform a
1838  * new-root construction. If it *wasn't* the root anymore, search to find
1839  * the next higher level that someone constructed meanwhile, and find the
1840  * right place to insert as for the normal case.
1841  *
1842  * If we have to search for the parent level, we do so by re-descending
1843  * from the root. This is not super-efficient, but it's rare enough not
1844  * to matter.
1845  */
1846  if (is_root)
1847  {
1848  Buffer rootbuf;
1849 
1850  Assert(stack == NULL);
1851  Assert(is_only);
1852  /* create a new root node and update the metapage */
1853  rootbuf = _bt_newroot(rel, buf, rbuf);
1854  /* release the split buffers */
1855  _bt_relbuf(rel, rootbuf);
1856  _bt_relbuf(rel, rbuf);
1857  _bt_relbuf(rel, buf);
1858  }
1859  else
1860  {
1861  BlockNumber bknum = BufferGetBlockNumber(buf);
1862  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1863  Page page = BufferGetPage(buf);
1864  IndexTuple new_item;
1865  BTStackData fakestack;
1866  IndexTuple ritem;
1867  Buffer pbuf;
1868 
1869  if (stack == NULL)
1870  {
1871  BTPageOpaque lpageop;
1872 
1873  elog(DEBUG2, "concurrent ROOT page split");
1874  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1875  /* Find the leftmost page at the next level up */
1876  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1877  NULL);
1878  /* Set up a phony stack entry pointing there */
1879  stack = &fakestack;
1880  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1881  stack->bts_offset = InvalidOffsetNumber;
1882  /* bts_btentry will be initialized below */
1883  stack->bts_parent = NULL;
1884  _bt_relbuf(rel, pbuf);
1885  }
1886 
1887  /* get high key from left page == lower bound for new right page */
1888  ritem = (IndexTuple) PageGetItem(page,
1889  PageGetItemId(page, P_HIKEY));
1890 
1891  /* form an index tuple that points at the new right page */
1892  new_item = CopyIndexTuple(ritem);
1893  BTreeInnerTupleSetDownLink(new_item, rbknum);
1894 
1895  /*
1896  * Find the parent buffer and get the parent page.
1897  *
1898  * Oops - if we were moved right then we need to change stack item! We
1899  * want to find parent pointing to where we are, right ? - vadim
1900  * 05/27/97
1901  */
1902  stack->bts_btentry = bknum;
1903  pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1904 
1905  /*
1906  * Now we can unlock the right child. The left child will be unlocked
1907  * by _bt_insertonpg().
1908  */
1909  _bt_relbuf(rel, rbuf);
1910 
1911  /* Check for error only after writing children */
1912  if (pbuf == InvalidBuffer)
1913  elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1914  RelationGetRelationName(rel), bknum, rbknum);
1915 
1916  /* Recursively update the parent */
1917  _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
1918  new_item, stack->bts_offset + 1,
1919  is_only);
1920 
1921  /* be tidy */
1922  pfree(new_item);
1923  }
1924 }
1925 
1926 /*
1927  * _bt_finish_split() -- Finish an incomplete split
1928  *
1929  * A crash or other failure can leave a split incomplete. The insertion
1930  * routines won't allow insertion on a page that is incompletely split.
1931  * Before inserting on such a page, call _bt_finish_split().
1932  *
1933  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
1934  * and unpinned.
1935  */
1936 void
1937 _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
1938 {
1939  Page lpage = BufferGetPage(lbuf);
1940  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1941  Buffer rbuf;
1942  Page rpage;
1943  BTPageOpaque rpageop;
1944  bool was_root;
1945  bool was_only;
1946 
1947  Assert(P_INCOMPLETE_SPLIT(lpageop));
1948 
1949  /* Lock right sibling, the one missing the downlink */
1950  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1951  rpage = BufferGetPage(rbuf);
1952  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1953 
1954  /* Could this be a root split? */
1955  if (!stack)
1956  {
1957  Buffer metabuf;
1958  Page metapg;
1959  BTMetaPageData *metad;
1960 
1961  /* acquire lock on the metapage */
1962  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1963  metapg = BufferGetPage(metabuf);
1964  metad = BTPageGetMeta(metapg);
1965 
1966  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1967 
1968  _bt_relbuf(rel, metabuf);
1969  }
1970  else
1971  was_root = false;
1972 
1973  /* Was this the only page on the level before split? */
1974  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1975 
1976  elog(DEBUG1, "finishing incomplete split of %u/%u",
1977  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1978 
1979  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1980 }
1981 
1982 /*
1983  * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
1984  * we last looked at in the parent.
1985  *
1986  * This is possible because we save the downlink from the parent item,
1987  * which is enough to uniquely identify it. Insertions into the parent
1988  * level could cause the item to move right; deletions could cause it
1989  * to move left, but not left of the page we previously found it in.
1990  *
1991  * Adjusts bts_blkno & bts_offset if changed.
1992  *
1993  * Returns InvalidBuffer if item not found (should not happen).
1994  */
1995 Buffer
1996 _bt_getstackbuf(Relation rel, BTStack stack, int access)
1997 {
1998  BlockNumber blkno;
1999  OffsetNumber start;
2000 
2001  blkno = stack->bts_blkno;
2002  start = stack->bts_offset;
2003 
2004  for (;;)
2005  {
2006  Buffer buf;
2007  Page page;
2008  BTPageOpaque opaque;
2009 
2010  buf = _bt_getbuf(rel, blkno, access);
2011  page = BufferGetPage(buf);
2012  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2013 
2014  if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
2015  {
2016  _bt_finish_split(rel, buf, stack->bts_parent);
2017  continue;
2018  }
2019 
2020  if (!P_IGNORE(opaque))
2021  {
2022  OffsetNumber offnum,
2023  minoff,
2024  maxoff;
2025  ItemId itemid;
2026  IndexTuple item;
2027 
2028  minoff = P_FIRSTDATAKEY(opaque);
2029  maxoff = PageGetMaxOffsetNumber(page);
2030 
2031  /*
2032  * start = InvalidOffsetNumber means "search the whole page". We
2033  * need this test anyway due to the possibility that the page has a
2034  * high key now when it didn't before.
2035  */
2036  if (start < minoff)
2037  start = minoff;
2038 
2039  /*
2040  * Need this check too, to guard against the possibility that the
2041  * page has split since we visited it originally.
2042  */
2043  if (start > maxoff)
2044  start = OffsetNumberNext(maxoff);
2045 
2046  /*
2047  * These loops will check every item on the page --- but in an
2048  * order that's attuned to the probability of where it actually
2049  * is. Scan to the right first, then to the left.
2050  */
2051  for (offnum = start;
2052  offnum <= maxoff;
2053  offnum = OffsetNumberNext(offnum))
2054  {
2055  itemid = PageGetItemId(page, offnum);
2056  item = (IndexTuple) PageGetItem(page, itemid);
2057 
2058  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2059  {
2060  /* Return accurate pointer to where link is now */
2061  stack->bts_blkno = blkno;
2062  stack->bts_offset = offnum;
2063  return buf;
2064  }
2065  }
2066 
2067  for (offnum = OffsetNumberPrev(start);
2068  offnum >= minoff;
2069  offnum = OffsetNumberPrev(offnum))
2070  {
2071  itemid = PageGetItemId(page, offnum);
2072  item = (IndexTuple) PageGetItem(page, itemid);
2073 
2074  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2075  {
2076  /* Return accurate pointer to where link is now */
2077  stack->bts_blkno = blkno;
2078  stack->bts_offset = offnum;
2079  return buf;
2080  }
2081  }
2082  }
2083 
2084  /*
2085  * The item we're looking for moved right at least one page.
2086  */
2087  if (P_RIGHTMOST(opaque))
2088  {
2089  _bt_relbuf(rel, buf);
2090  return InvalidBuffer;
2091  }
2092  blkno = opaque->btpo_next;
2093  start = InvalidOffsetNumber;
2094  _bt_relbuf(rel, buf);
2095  }
2096 }
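
The two loops above amount to a search that starts from a remembered position and tries the likelier direction first. A minimal standalone sketch of the same pattern over a plain array (the data and the find_from helper are hypothetical, not PostgreSQL code):

/* Standalone illustration (not nbtinsert.c code): search an array from a
 * remembered position, right first and then left, mirroring the two loops
 * above.  -1 plays the role of "not found, move right to the next page". */
#include <stdio.h>

static int
find_from(const int *items, int nitems, int start, int target)
{
	for (int i = start; i < nitems; i++)
		if (items[i] == target)
			return i;
	for (int i = start - 1; i >= 0; i--)
		if (items[i] == target)
			return i;
	return -1;
}

int
main(void)
{
	int		items[] = {10, 20, 30, 40, 50, 60};

	printf("found at %d\n", find_from(items, 6, 2, 50));	/* prints 4 */
	return 0;
}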
2097 
2098 /*
2099  * _bt_newroot() -- Create a new root page for the index.
2100  *
2101  * We've just split the old root page and need to create a new one.
2102  * In order to do this, we add a new root page to the file, then lock
2103  * the metadata page and update it. This is guaranteed to be deadlock-
2104  * free, because all readers release their locks on the metadata page
2105  * before trying to lock the root, and all writers lock the root before
2106  * trying to lock the metadata page. We have a write lock on the old
2107  * root page, so we have not introduced any cycles into the waits-for
2108  * graph.
2109  *
2110  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2111  * locked. On exit, a new root page exists with entries for the
2112  * two new children, and the metapage is updated and unlocked/unpinned.
2113  * The new root buffer is returned to the caller, which must unlock/unpin
2114  * lbuf, rbuf and rootbuf.
2115  */
2116 static Buffer
2117 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
2118 {
2119  Buffer rootbuf;
2120  Page lpage,
2121  rootpage;
2122  BlockNumber lbkno,
2123  rbkno;
2124  BlockNumber rootblknum;
2125  BTPageOpaque rootopaque;
2126  BTPageOpaque lopaque;
2127  ItemId itemid;
2128  IndexTuple item;
2129  IndexTuple left_item;
2130  Size left_item_sz;
2131  IndexTuple right_item;
2132  Size right_item_sz;
2133  Buffer metabuf;
2134  Page metapg;
2135  BTMetaPageData *metad;
2136 
2137  lbkno = BufferGetBlockNumber(lbuf);
2138  rbkno = BufferGetBlockNumber(rbuf);
2139  lpage = BufferGetPage(lbuf);
2140  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2141 
2142  /* get a new root page */
2143  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2144  rootpage = BufferGetPage(rootbuf);
2145  rootblknum = BufferGetBlockNumber(rootbuf);
2146 
2147  /* acquire lock on the metapage */
2148  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2149  metapg = BufferGetPage(metabuf);
2150  metad = BTPageGetMeta(metapg);
2151 
2152  /* upgrade metapage if needed */
2153  if (metad->btm_version < BTREE_VERSION)
2154  _bt_upgrademetapage(metapg);
2155 
2156  /*
2157  * Create downlink item for left page (old root). Since this will be the
2158  * first item in a non-leaf page, it implicitly has minus-infinity key
2159  * value, so we need not store any actual key in it.
2160  */
2161  left_item_sz = sizeof(IndexTupleData);
2162  left_item = (IndexTuple) palloc(left_item_sz);
2163  left_item->t_info = left_item_sz;
2164  BTreeInnerTupleSetDownLink(left_item, lbkno);
2165  BTreeTupleSetNAtts(left_item, 0);
2166 
2167  /*
2168  * Create downlink item for right page. The key for it is obtained from
2169  * the "high key" position in the left page.
2170  */
2171  itemid = PageGetItemId(lpage, P_HIKEY);
2172  right_item_sz = ItemIdGetLength(itemid);
2173  item = (IndexTuple) PageGetItem(lpage, itemid);
2174  right_item = CopyIndexTuple(item);
2175  BTreeInnerTupleSetDownLink(right_item, rbkno);
2176 
2177  /* NO EREPORT(ERROR) from here till newroot op is logged */
2178  START_CRIT_SECTION();
2179 
2180  /* set btree special data */
2181  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2182  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2183  rootopaque->btpo_flags = BTP_ROOT;
2184  rootopaque->btpo.level =
2185  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2186  rootopaque->btpo_cycleid = 0;
2187 
2188  /* update metapage data */
2189  metad->btm_root = rootblknum;
2190  metad->btm_level = rootopaque->btpo.level;
2191  metad->btm_fastroot = rootblknum;
2192  metad->btm_fastlevel = rootopaque->btpo.level;
2193 
2194  /*
2195  * Insert the left page pointer into the new root page. The root page is
2196  * the rightmost page on its level so there is no "high key" in it; the
2197  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2198  *
2199  * Note: we *must* insert the two items in item-number order, for the
2200  * benefit of _bt_restore_page().
2201  */
2202  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2203  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2204  false, false) == InvalidOffsetNumber)
2205  elog(PANIC, "failed to add leftkey to new root page"
2206  " while splitting block %u of index \"%s\"",
2208 
2209  /*
2210  * insert the right page pointer into the new root page.
2211  */
2212  Assert(BTreeTupleGetNAtts(right_item, rel) ==
2213  IndexRelationGetNumberOfKeyAttributes(rel));
2214  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2215  false, false) == InvalidOffsetNumber)
2216  elog(PANIC, "failed to add rightkey to new root page"
2217  " while splitting block %u of index \"%s\"",
2219 
2220  /* Clear the incomplete-split flag in the left child */
2221  Assert(P_INCOMPLETE_SPLIT(lopaque));
2222  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2223  MarkBufferDirty(lbuf);
2224 
2225  MarkBufferDirty(rootbuf);
2226  MarkBufferDirty(metabuf);
2227 
2228  /* XLOG stuff */
2229  if (RelationNeedsWAL(rel))
2230  {
2231  xl_btree_newroot xlrec;
2232  XLogRecPtr recptr;
2233  xl_btree_metadata md;
2234 
2235  xlrec.rootblk = rootblknum;
2236  xlrec.level = metad->btm_level;
2237 
2238  XLogBeginInsert();
2239  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2240 
2241  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2242  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2243  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT);
2244 
2245  md.root = rootblknum;
2246  md.level = metad->btm_level;
2247  md.fastroot = rootblknum;
2248  md.fastlevel = metad->btm_level;
2249  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2250  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2251 
2252  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2253 
2254  /*
2255  * Direct access to the page is not ideal, but it is faster; we should
2256  * implement a new function in the page API.
2257  */
2258  XLogRegisterBufData(0,
2259  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2260  ((PageHeader) rootpage)->pd_special -
2261  ((PageHeader) rootpage)->pd_upper);
2262 
2263  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2264 
2265  PageSetLSN(lpage, recptr);
2266  PageSetLSN(rootpage, recptr);
2267  PageSetLSN(metapg, recptr);
2268  }
2269 
2270  END_CRIT_SECTION();
2271 
2272  /* done with metapage */
2273  _bt_relbuf(rel, metabuf);
2274 
2275  pfree(left_item);
2276  pfree(right_item);
2277 
2278  return rootbuf;
2279 }
2280 
2281 /*
2282  * _bt_pgaddtup() -- add a tuple to a particular page in the index.
2283  *
2284  * This routine adds the tuple to the page as requested. It does
2285  * not affect pin/lock status, but you'd better have a write lock
2286  * and pin on the target buffer! Don't forget to write and release
2287  * the buffer afterwards, either.
2288  *
2289  * The main difference between this routine and a bare PageAddItem call
2290  * is that this code knows that the leftmost index tuple on a non-leaf
2291  * btree page doesn't need to have a key. Therefore, it strips such
2292  * tuples down to just the tuple header. CAUTION: this works ONLY if
2293  * we insert the tuples in order, so that the given itup_off does
2294  * represent the final position of the tuple!
2295  */
2296 static bool
2298  Size itemsize,
2299  IndexTuple itup,
2300  OffsetNumber itup_off)
2301 {
2302  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2303  IndexTupleData trunctuple;
2304 
2305  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2306  {
2307  trunctuple = *itup;
2308  trunctuple.t_info = sizeof(IndexTupleData);
2309  BTreeTupleSetNAtts(&trunctuple, 0);
2310  itup = &trunctuple;
2311  itemsize = sizeof(IndexTupleData);
2312  }
2313 
2314  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2315  false, false) == InvalidOffsetNumber)
2316  return false;
2317 
2318  return true;
2319 }
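
The truncation that _bt_pgaddtup applies to the leftmost tuple of a non-leaf page can be pictured with toy types. The sketch below is standalone; ToyHeader and ToyTuple are invented stand-ins, not the real IndexTupleData layout:

/* Standalone illustration (not nbtinsert.c code): the stand-in types below
 * are invented, but they show the idea of storing only the fixed-size
 * header for a tuple whose key is implicitly minus infinity. */
#include <stdio.h>

typedef struct
{
	unsigned short info;		/* stand-in for the real tuple header */
} ToyHeader;

typedef struct
{
	ToyHeader	hdr;
	char		key[16];		/* key bytes the downlink can drop */
} ToyTuple;

int
main(void)
{
	ToyTuple	full = {{0}, "some key"};
	ToyHeader	trunc = full.hdr;	/* keep only the header */

	printf("info=%u, stored %zu of %zu bytes\n",
		   trunc.info, sizeof(trunc), sizeof(full));
	return 0;
}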
2320 
2321 /*
2322  * _bt_isequal - used in _bt_doinsert in check for duplicates.
2323  *
2324  * This is very similar to _bt_compare, except for NULL handling.
2325  * The rule is simple: NOT_NULL is not equal to NULL, and NULL is not equal to NULL either.
2326  */
2327 static bool
2328 _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
2329  int keysz, ScanKey scankey)
2330 {
2331  IndexTuple itup;
2332  int i;
2333 
2334  /* Better be comparing to a leaf item */
2335  Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
2336 
2337  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2338 
2339  /*
2340  * It's okay that we might perform a comparison against a truncated page
2341  * high key when the caller needs to determine whether the _bt_check_unique
2342  * scan must continue on to the next page. The caller never asks us to
2343  * compare non-key attributes within an INCLUDE index.
2344  */
2345  for (i = 1; i <= keysz; i++)
2346  {
2347  AttrNumber attno;
2348  Datum datum;
2349  bool isNull;
2350  int32 result;
2351 
2352  attno = scankey->sk_attno;
2353  Assert(attno == i);
2354  datum = index_getattr(itup, attno, itupdesc, &isNull);
2355 
2356  /* NULLs are never equal to anything */
2357  if (isNull || (scankey->sk_flags & SK_ISNULL))
2358  return false;
2359 
2360  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
2361  scankey->sk_collation,
2362  datum,
2363  scankey->sk_argument));
2364 
2365  if (result != 0)
2366  return false;
2367 
2368  scankey++;
2369  }
2370 
2371  /* if we get here, the keys are equal */
2372  return true;
2373 }
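
The NULL rule stated above can be demonstrated with a short standalone sketch; the keys_equal helper and its integer keys are made up, and only the comparison rule mirrors the function:

/* Standalone illustration (not nbtinsert.c code): the same NULL rule,
 * with made-up integer keys plus an is-null flag per side. */
#include <stdbool.h>
#include <stdio.h>

static bool
keys_equal(bool a_isnull, int a, bool b_isnull, int b)
{
	if (a_isnull || b_isnull)
		return false;			/* NULL never equals anything, even NULL */
	return a == b;
}

int
main(void)
{
	printf("%d %d %d\n",
		   keys_equal(false, 7, false, 7),	/* 1 */
		   keys_equal(true, 0, true, 0),	/* 0: NULL vs NULL */
		   keys_equal(false, 7, true, 0));	/* 0 */
	return 0;
}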
2374 
2375 /*
2376  * _bt_vacuum_one_page - vacuum just one index page.
2377  *
2378  * Try to remove LP_DEAD items from the given page. The passed buffer
2379  * must be exclusive-locked, but unlike a real VACUUM, we don't need a
2380  * super-exclusive "cleanup" lock (see nbtree/README).
2381  */
2382 static void
2383 _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
2384 {
2385  OffsetNumber deletable[MaxOffsetNumber];
2386  int ndeletable = 0;
2387  OffsetNumber offnum,
2388  minoff,
2389  maxoff;
2390  Page page = BufferGetPage(buffer);
2391  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2392 
2393  /*
2394  * Scan over all items to see which ones need to be deleted according to
2395  * LP_DEAD flags.
2396  */
2397  minoff = P_FIRSTDATAKEY(opaque);
2398  maxoff = PageGetMaxOffsetNumber(page);
2399  for (offnum = minoff;
2400  offnum <= maxoff;
2401  offnum = OffsetNumberNext(offnum))
2402  {
2403  ItemId itemId = PageGetItemId(page, offnum);
2404 
2405  if (ItemIdIsDead(itemId))
2406  deletable[ndeletable++] = offnum;
2407  }
2408 
2409  if (ndeletable > 0)
2410  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2411 
2412  /*
2413  * Note: if we didn't find any LP_DEAD items, then the page's
2414  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2415  * separate write to clear it, however. We will clear it when we split
2416  * the page.
2417  */
2418 }
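
The collect-then-delete pattern used here, first gathering the offsets of LP_DEAD items and then deleting them in one batch, reads like the following standalone sketch, where a plain boolean array stands in for the page's line pointers:

/* Standalone illustration (not nbtinsert.c code): gather the positions of
 * "dead" entries first, then act on them in one batch, as the function
 * above does with LP_DEAD line pointers and _bt_delitems_delete(). */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	bool	dead[] = {false, true, false, false, true, true};
	int		deletable[6];
	int		ndeletable = 0;

	for (int off = 0; off < 6; off++)
		if (dead[off])
			deletable[ndeletable++] = off;

	if (ndeletable > 0)
		printf("would delete %d items, first at position %d\n",
			   ndeletable, deletable[0]);
	return 0;
}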