nbtinsert.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtinsert.c
4  * Item insertion in Lehman and Yao btrees for Postgres.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/heapam.h"
19 #include "access/nbtree.h"
20 #include "access/nbtxlog.h"
21 #include "access/transam.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "storage/lmgr.h"
25 #include "storage/predicate.h"
26 #include "storage/smgr.h"
27 #include "utils/tqual.h"
28 
29 /* Minimum tree height for application of fastpath optimization */
30 #define BTREE_FASTPATH_MIN_LEVEL 2
31 
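BTREE_FASTPATH_MIN_LEVEL is consulted later in this file, when _bt_insertonpg() decides whether to remember the rightmost leaf block for the next insertion. A condensed paraphrase of that guard (the real check appears further down; levels are counted from the leaves, so leaf level is 0 and the cache is only kept once the tree is at least three levels tall):

    /* cache the rightmost leaf only once the tree is tall enough */
    if (BlockNumberIsValid(cachedBlock) &&
        _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
        RelationSetTargetBlock(rel, cachedBlock);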
32 typedef struct
33 {
34  /* context data for _bt_checksplitloc */
35  Size newitemsz; /* size of new item to be inserted */
36  int fillfactor; /* needed when splitting rightmost page */
37  bool is_leaf; /* T if splitting a leaf page */
38  bool is_rightmost; /* T if splitting a rightmost page */
39  OffsetNumber newitemoff; /* where the new item is to be inserted */
40  int leftspace; /* space available for items on left page */
41  int rightspace; /* space available for items on right page */
42  int olddataitemstotal; /* space taken by old items */
43 
44  bool have_split; /* found a valid split? */
45 
46  /* these fields valid only if have_split is true */
47  bool newitemonleft; /* new item on left or right of best split */
48  OffsetNumber firstright; /* best split point */
49  int best_delta; /* best size delta so far */
50 } FindSplitData;
51 
52 
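The fields above are filled in once by _bt_findsplitloc() and then consulted by _bt_checksplitloc() for every candidate split point. The actual _bt_checksplitloc() appears later in the file; the following is only a simplified sketch of how such a scoring step can use this state. It ignores the rightmost-page fillfactor case and the case where the new item itself becomes the first item on the right page, and sketch_checksplitloc is a hypothetical name:

    static void
    sketch_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright,
                         bool newitemonleft, int dataitemstoleft,
                         Size firstoldonrightsz)
    {
        /* free space each half would retain after the split */
        int     leftfree = state->leftspace - dataitemstoleft;
        int     rightfree = state->rightspace -
            (state->olddataitemstotal - dataitemstoleft);

        /* the first right-hand item also becomes the left page's high key */
        leftfree -= (int) firstoldonrightsz;

        /* charge the incoming tuple to whichever side it would land on */
        if (newitemonleft)
            leftfree -= (int) state->newitemsz;
        else
            rightfree -= (int) state->newitemsz;

        if (leftfree >= 0 && rightfree >= 0)
        {
            int     delta = leftfree - rightfree;

            if (delta < 0)
                delta = -delta;
            if (!state->have_split || delta < state->best_delta)
            {
                state->have_split = true;
                state->newitemonleft = newitemonleft;
                state->firstright = firstoldonright;
                state->best_delta = delta;
            }
        }
    }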
53 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
54 
55 static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
56  Relation heapRel, Buffer buf, OffsetNumber offset,
57  ScanKey itup_scankey,
58  IndexUniqueCheck checkUnique, bool *is_unique,
59  uint32 *speculativeToken);
60 static void _bt_findinsertloc(Relation rel,
61  Buffer *bufptr,
62  OffsetNumber *offsetptr,
63  int keysz,
64  ScanKey scankey,
65  IndexTuple newtup,
66  BTStack stack,
67  Relation heapRel);
68 static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
69  BTStack stack,
70  IndexTuple itup,
71  OffsetNumber newitemoff,
72  bool split_only_page);
73 static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
74  OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
75  IndexTuple newitem, bool newitemonleft);
76 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
77  BTStack stack, bool is_root, bool is_only);
78 static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
79  OffsetNumber newitemoff,
80  Size newitemsz,
81  bool *newitemonleft);
82 static void _bt_checksplitloc(FindSplitData *state,
83  OffsetNumber firstoldonright, bool newitemonleft,
84  int dataitemstoleft, Size firstoldonrightsz);
85 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
86  OffsetNumber itup_off);
87 static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
88  int keysz, ScanKey scankey);
89 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
90 
91 /*
92  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
93  *
94  * This routine is called by the public interface routine, btinsert.
95  * By here, itup is filled in, including the TID.
96  *
97  * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
98  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
99  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
100  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
101  * don't actually insert.
102  *
103  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
104  * it must be true if the entry is known unique, else false.
105  * (In the current implementation we'll also return true after a
106  * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
107  * that's just a coding artifact.)
108  */
109 bool
110 _bt_doinsert(Relation rel, IndexTuple itup,
111  IndexUniqueCheck checkUnique, Relation heapRel)
112 {
113  bool is_unique = false;
114  int indnkeyatts;
115  ScanKey itup_scankey;
116  BTStack stack = NULL;
117  Buffer buf;
118  OffsetNumber offset;
119  bool fastpath;
120 
121  indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
122  Assert(indnkeyatts != 0);
123 
124  /* we need an insertion scan key to do our search, so build one */
125  itup_scankey = _bt_mkscankey(rel, itup);
126 
127  /*
128  * It's very common to have an index on an auto-incremented or
129  * monotonically increasing value. In such cases, every insertion happens
130  * towards the end of the index. We try to optimize that case by caching
131  * the right-most leaf of the index. If our cached block is still the
132  * rightmost leaf, has enough free space to accommodate a new entry and
133  * the insertion key is strictly greater than the first key in this page,
134  * then we can safely conclude that the new key will be inserted in the
135  * cached block. So we simply search within the cached block and insert
136  * the key at the appropriate location. We call it a fastpath.
137  *
138  * Testing has revealed, though, that the fastpath can result in increased
139  * contention on the exclusive-lock on the rightmost leaf page. So we
140  * conditionally check if the lock is available. If it's not available
141  * then we simply abandon the fastpath and take the regular path. This
142  * makes sense because unavailability of the lock also signals that some
143  * other backend might be concurrently inserting into the page, thus
144  * reducing our chances of finding an insertion place on this page.
145  */
146 top:
147  fastpath = false;
148  offset = InvalidOffsetNumber;
149  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
150  {
151  Size itemsz;
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change,
159  * as far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170  itemsz = IndexTupleSize(itup);
171  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
172  * but we need to be consistent */
173 
174  /*
175  * Check if the page is still the rightmost leaf page, has enough
176  * free space to accommodate the new tuple, and the insertion scan
177  * key is strictly greater than the first key on the page.
178  */
179  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
180  !P_IGNORE(lpageop) &&
181  (PageGetFreeSpace(page) > itemsz) &&
182  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
183  _bt_compare(rel, indnkeyatts, itup_scankey, page,
184  P_FIRSTDATAKEY(lpageop)) > 0)
185  {
186  /*
187  * The right-most block should never have an incomplete split.
188  * But be paranoid and check for it anyway.
189  */
190  Assert(!P_INCOMPLETE_SPLIT(lpageop));
191  fastpath = true;
192  }
193  else
194  {
195  _bt_relbuf(rel, buf);
196 
197  /*
198  * Something did not work out. Just forget about the cached
199  * block and follow the normal path. It might be set again if
200  * the conditions are favourable.
201  */
202  RelationSetTargetBlock(rel, InvalidBlockNumber);
203  }
204  }
205  else
206  {
207  ReleaseBuffer(buf);
208 
209  /*
210  * If someone's holding a lock, it's likely to change anyway, so
211  * don't try again until we get an updated rightmost leaf.
212  */
213  RelationSetTargetBlock(rel, InvalidBlockNumber);
214  }
215  }
216 
217  if (!fastpath)
218  {
219  /* find the first page containing this key */
220  stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE,
221  NULL);
222 
223  /* trade in our read lock for a write lock */
224  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
225  LockBuffer(buf, BT_WRITE);
226 
227  /*
228  * If the page was split between the time that we surrendered our read
229  * lock and acquired our write lock, then this page may no longer be
230  * the right place for the key we want to insert. In this case, we
231  * need to move right in the tree. See Lehman and Yao for an
232  * excruciatingly precise description.
233  */
234  buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false,
235  true, stack, BT_WRITE, NULL);
236  }
237 
238  /*
239  * If we're not allowing duplicates, make sure the key isn't already in
240  * the index.
241  *
242  * NOTE: obviously, _bt_check_unique can only detect keys that are already
243  * in the index; so it cannot defend against concurrent insertions of the
244  * same key. We protect against that by means of holding a write lock on
245  * the target page. Any other would-be inserter of the same key must
246  * acquire a write lock on the same target page, so only one would-be
247  * inserter can be making the check at one time. Furthermore, once we are
248  * past the check we hold write locks continuously until we have performed
249  * our insertion, so no later inserter can fail to see our insertion.
250  * (This requires some care in _bt_insertonpg.)
251  *
252  * If we must wait for another xact, we release the lock while waiting,
253  * and then must start over completely.
254  *
255  * For a partial uniqueness check, we don't wait for the other xact. Just
256  * let the tuple in and return false for possibly non-unique, or true for
257  * definitely unique.
258  */
259  if (checkUnique != UNIQUE_CHECK_NO)
260  {
261  TransactionId xwait;
262  uint32 speculativeToken;
263 
264  offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false);
265  xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
266  checkUnique, &is_unique, &speculativeToken);
267 
268  if (TransactionIdIsValid(xwait))
269  {
270  /* Have to wait for the other guy ... */
271  _bt_relbuf(rel, buf);
272 
273  /*
274  * If it's a speculative insertion, wait for it to finish (ie. to
275  * go ahead with the insertion, or kill the tuple). Otherwise
276  * wait for the transaction to finish as usual.
277  */
278  if (speculativeToken)
279  SpeculativeInsertionWait(xwait, speculativeToken);
280  else
281  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
282 
283  /* start over... */
284  if (stack)
285  _bt_freestack(stack);
286  goto top;
287  }
288  }
289 
290  if (checkUnique != UNIQUE_CHECK_EXISTING)
291  {
292  /*
293  * The only conflict predicate locking cares about for indexes is when
294  * an index tuple insert conflicts with an existing lock. Since the
295  * actual location of the insert is hard to predict because of the
296  * random search used to prevent O(N^2) performance when there are
297  * many duplicate entries, we can just use the "first valid" page.
298  * This reasoning also applies to INCLUDE indexes, whose extra
299  * attributes are not considered part of the key space.
300  */
301  CheckForSerializableConflictIn(rel, NULL, buf);
302  /* do the insertion */
303  _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup,
304  stack, heapRel);
305  _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
306  }
307  else
308  {
309  /* just release the buffer */
310  _bt_relbuf(rel, buf);
311  }
312 
313  /* be tidy */
314  if (stack)
315  _bt_freestack(stack);
316  _bt_freeskey(itup_scankey);
317 
318  return is_unique;
319 }
320 
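For context, the access-method entry point that reaches this function is btinsert() in nbtree.c: it forms an IndexTuple from the column values, stamps the heap TID into it, and hands it to _bt_doinsert(). The following is a paraphrased sketch of that caller, not its exact text (sketch_btinsert is a stand-in name and incidental details are omitted):

    static bool
    sketch_btinsert(Relation rel, Datum *values, bool *isnull,
                    ItemPointer ht_ctid, Relation heapRel,
                    IndexUniqueCheck checkUnique)
    {
        IndexTuple  itup;
        bool        result;

        /* build the index tuple and point it at the heap row */
        itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
        itup->t_tid = *ht_ctid;

        result = _bt_doinsert(rel, itup, checkUnique, heapRel);

        pfree(itup);
        return result;
    }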
321 /*
322  * _bt_check_unique() -- Check for violation of unique index constraint
323  *
324  * offset points to the first possible item that could conflict. It can
325  * also point to end-of-page, which means that the first tuple to check
326  * is the first tuple on the next page.
327  *
328  * Returns InvalidTransactionId if there is no conflict, else an xact ID
329  * we must wait for to see if it commits a conflicting tuple. If an actual
330  * conflict is detected, no return --- just ereport(). If an xact ID is
331  * returned, and the conflicting tuple still has a speculative insertion in
332  * progress, *speculativeToken is set to non-zero, and the caller can wait for
333  * the verdict on the insertion using SpeculativeInsertionWait().
334  *
335  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
336  * InvalidTransactionId because we don't want to wait. In this case we
337  * set *is_unique to false if there is a potential conflict, and the
338  * core code must redo the uniqueness check later.
339  */
340 static TransactionId
341 _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
342  Buffer buf, OffsetNumber offset, ScanKey itup_scankey,
343  IndexUniqueCheck checkUnique, bool *is_unique,
344  uint32 *speculativeToken)
345 {
346  TupleDesc itupdesc = RelationGetDescr(rel);
347  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
348  SnapshotData SnapshotDirty;
349  OffsetNumber maxoff;
350  Page page;
351  BTPageOpaque opaque;
352  Buffer nbuf = InvalidBuffer;
353  bool found = false;
354 
355  /* Assume unique until we find a duplicate */
356  *is_unique = true;
357 
358  InitDirtySnapshot(SnapshotDirty);
359 
360  page = BufferGetPage(buf);
361  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362  maxoff = PageGetMaxOffsetNumber(page);
363 
364  /*
365  * Scan over all equal tuples, looking for live conflicts.
366  */
367  for (;;)
368  {
369  ItemId curitemid;
370  IndexTuple curitup;
371  BlockNumber nblkno;
372 
373  /*
374  * make sure the offset points to an actual item before trying to
375  * examine it...
376  */
377  if (offset <= maxoff)
378  {
379  curitemid = PageGetItemId(page, offset);
380 
381  /*
382  * We can skip items that are marked killed.
383  *
384  * Formerly, we applied _bt_isequal() before checking the kill
385  * flag, so as to fall out of the item loop as soon as possible.
386  * However, in the presence of heavy update activity an index may
387  * contain many killed items with the same key; running
388  * _bt_isequal() on each killed item gets expensive. Furthermore
389  * it is likely that the non-killed version of each key appears
390  * first, so that we didn't actually get to exit any sooner
391  * anyway. So now we just advance over killed items as quickly as
392  * we can. We only apply _bt_isequal() when we get to a non-killed
393  * item or the end of the page.
394  */
395  if (!ItemIdIsDead(curitemid))
396  {
397  ItemPointerData htid;
398  bool all_dead;
399 
400  /*
401  * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this is
402  * how we handle NULLs - and so we must not use _bt_compare
403  * for real comparisons, but only for ordering/finding items on
404  * pages. - vadim 03/24/97
405  */
406  if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
407  break; /* we're past all the equal tuples */
408 
409  /* okay, we gotta fetch the heap tuple ... */
410  curitup = (IndexTuple) PageGetItem(page, curitemid);
411  htid = curitup->t_tid;
412 
413  /*
414  * If we are doing a recheck, we expect to find the tuple we
415  * are rechecking. It's not a duplicate, but we have to keep
416  * scanning.
417  */
418  if (checkUnique == UNIQUE_CHECK_EXISTING &&
419  ItemPointerCompare(&htid, &itup->t_tid) == 0)
420  {
421  found = true;
422  }
423 
424  /*
425  * We check the whole HOT-chain to see if there is any tuple
426  * that satisfies SnapshotDirty. This is necessary because we
427  * have just a single index entry for the entire chain.
428  */
429  else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
430  &all_dead))
431  {
432  TransactionId xwait;
433 
434  /*
435  * It is a duplicate. If we are only doing a partial
436  * check, then don't bother checking if the tuple is being
437  * updated in another transaction. Just return the fact
438  * that it is a potential conflict and leave the full
439  * check till later.
440  */
441  if (checkUnique == UNIQUE_CHECK_PARTIAL)
442  {
443  if (nbuf != InvalidBuffer)
444  _bt_relbuf(rel, nbuf);
445  *is_unique = false;
446  return InvalidTransactionId;
447  }
448 
449  /*
450  * If this tuple is being updated by other transaction
451  * then we have to wait for its commit/abort.
452  */
453  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
454  SnapshotDirty.xmin : SnapshotDirty.xmax;
455 
456  if (TransactionIdIsValid(xwait))
457  {
458  if (nbuf != InvalidBuffer)
459  _bt_relbuf(rel, nbuf);
460  /* Tell _bt_doinsert to wait... */
461  *speculativeToken = SnapshotDirty.speculativeToken;
462  return xwait;
463  }
464 
465  /*
466  * Otherwise we have a definite conflict. But before
467  * complaining, look to see if the tuple we want to insert
468  * is itself now committed dead --- if so, don't complain.
469  * This is a waste of time in normal scenarios but we must
470  * do it to support CREATE INDEX CONCURRENTLY.
471  *
472  * We must follow HOT-chains here because during
473  * concurrent index build, we insert the root TID though
474  * the actual tuple may be somewhere in the HOT-chain.
475  * While following the chain we might not stop at the
476  * exact tuple which triggered the insert, but that's OK
477  * because if we find a live tuple anywhere in this chain,
478  * we have a unique key conflict. The other live tuple is
479  * not part of this chain because it had a different index
480  * entry.
481  */
482  htid = itup->t_tid;
483  if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
484  {
485  /* Normal case --- it's still live */
486  }
487  else
488  {
489  /*
490  * It's been deleted, so no error, and no need to
491  * continue searching
492  */
493  break;
494  }
495 
496  /*
497  * Check for a conflict-in as we would if we were going to
498  * write to this page. We aren't actually going to write,
499  * but we want a chance to report SSI conflicts that would
500  * otherwise be masked by this unique constraint
501  * violation.
502  */
503  CheckForSerializableConflictIn(rel, NULL, buf);
504 
505  /*
506  * This is a definite conflict. Break the tuple down into
507  * datums and report the error. But first, make sure we
508  * release the buffer locks we're holding ---
509  * BuildIndexValueDescription could make catalog accesses,
510  * which in the worst case might touch this same index and
511  * cause deadlocks.
512  */
513  if (nbuf != InvalidBuffer)
514  _bt_relbuf(rel, nbuf);
515  _bt_relbuf(rel, buf);
516 
517  {
518  Datum values[INDEX_MAX_KEYS];
519  bool isnull[INDEX_MAX_KEYS];
520  char *key_desc;
521 
522  index_deform_tuple(itup, RelationGetDescr(rel),
523  values, isnull);
524 
525  key_desc = BuildIndexValueDescription(rel, values,
526  isnull);
527 
528  ereport(ERROR,
529  (errcode(ERRCODE_UNIQUE_VIOLATION),
530  errmsg("duplicate key value violates unique constraint \"%s\"",
531  RelationGetRelationName(rel)),
532  key_desc ? errdetail("Key %s already exists.",
533  key_desc) : 0,
534  errtableconstraint(heapRel,
535  RelationGetRelationName(rel))));
536  }
537  }
538  else if (all_dead)
539  {
540  /*
541  * The conflicting tuple (or whole HOT chain) is dead to
542  * everyone, so we may as well mark the index entry
543  * killed.
544  */
545  ItemIdMarkDead(curitemid);
546  opaque->btpo_flags |= BTP_HAS_GARBAGE;
547 
548  /*
549  * Mark buffer with a dirty hint, since state is not
550  * crucial. Be sure to mark the proper buffer dirty.
551  */
552  if (nbuf != InvalidBuffer)
553  MarkBufferDirtyHint(nbuf, true);
554  else
555  MarkBufferDirtyHint(buf, true);
556  }
557  }
558  }
559 
560  /*
561  * Advance to next tuple to continue checking.
562  */
563  if (offset < maxoff)
564  offset = OffsetNumberNext(offset);
565  else
566  {
567  /* If scankey == hikey we gotta check the next page too */
568  if (P_RIGHTMOST(opaque))
569  break;
570  if (!_bt_isequal(itupdesc, page, P_HIKEY,
571  indnkeyatts, itup_scankey))
572  break;
573  /* Advance to next non-dead page --- there must be one */
574  for (;;)
575  {
576  nblkno = opaque->btpo_next;
577  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
578  page = BufferGetPage(nbuf);
579  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
580  if (!P_IGNORE(opaque))
581  break;
582  if (P_RIGHTMOST(opaque))
583  elog(ERROR, "fell off the end of index \"%s\"",
584  RelationGetRelationName(rel));
585  }
586  maxoff = PageGetMaxOffsetNumber(page);
587  offset = P_FIRSTDATAKEY(opaque);
588  }
589  }
590 
591  /*
592  * If we are doing a recheck then we should have found the tuple we are
593  * checking. Otherwise there's something very wrong --- probably, the
594  * index is on a non-immutable expression.
595  */
596  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
597  ereport(ERROR,
598  (errcode(ERRCODE_INTERNAL_ERROR),
599  errmsg("failed to re-find tuple within index \"%s\"",
600  RelationGetRelationName(rel)),
601  errhint("This may be because of a non-immutable index expression."),
602  errtableconstraint(heapRel,
603  RelationGetRelationName(rel))));
604 
605  if (nbuf != InvalidBuffer)
606  _bt_relbuf(rel, nbuf);
607 
608  return InvalidTransactionId;
609 }
610 
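When the definite-conflict path above is taken, the ereport() built from those errmsg/errdetail format strings surfaces to the client roughly as follows (constraint name and key values here are only example data):

    ERROR:  duplicate key value violates unique constraint "orders_pkey"
    DETAIL:  Key (id)=(42) already exists.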
611 
612 /*
613  * _bt_findinsertloc() -- Finds an insert location for a tuple
614  *
615  * If the new key is equal to one or more existing keys, we can
616  * legitimately place it anywhere in the series of equal keys --- in fact,
617  * if the new key is equal to the page's "high key" we can place it on
618  * the next page. If it is equal to the high key, and there's not room
619  * to insert the new tuple on the current page without splitting, then
620  * we can move right hoping to find more free space and avoid a split.
621  * (We should not move right indefinitely, however, since that leads to
622  * O(N^2) insertion behavior in the presence of many equal keys.)
623  * Once we have chosen the page to put the key on, we'll insert it before
624  * any existing equal keys because of the way _bt_binsrch() works.
625  *
626  * If there's not enough room in the space, we try to make room by
627  * removing any LP_DEAD tuples.
628  *
629  * On entry, *bufptr and *offsetptr point to the first legal position
630  * where the new tuple could be inserted. The caller should hold an
631  * exclusive lock on *bufptr. *offsetptr can also be set to
632  * InvalidOffsetNumber, in which case the function will search for the
633  * right location within the page if needed. On exit, they point to the
634  * chosen insert location. If _bt_findinsertloc decides to move right,
635  * the lock and pin on the original page will be released and the new
636  * page returned to the caller is exclusively locked instead.
637  *
638  * newtup is the new tuple we're inserting, and scankey is an insertion
639  * type scan key for it.
640  */
641 static void
642 _bt_findinsertloc(Relation rel,
643  Buffer *bufptr,
644  OffsetNumber *offsetptr,
645  int keysz,
646  ScanKey scankey,
647  IndexTuple newtup,
648  BTStack stack,
649  Relation heapRel)
650 {
651  Buffer buf = *bufptr;
652  Page page = BufferGetPage(buf);
653  Size itemsz;
654  BTPageOpaque lpageop;
655  bool movedright,
656  vacuumed;
657  OffsetNumber newitemoff;
658  OffsetNumber firstlegaloff = *offsetptr;
659 
660  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
661 
662  itemsz = IndexTupleSize(newtup);
663  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
664  * need to be consistent */
665 
666  /*
667  * Check whether the item can fit on a btree page at all. (Eventually, we
668  * ought to try to apply TOAST methods if not.) We actually need to be
669  * able to fit three items on every page, so restrict any one item to 1/3
670  * the per-page available space. Note that at this point, itemsz doesn't
671  * include the ItemId.
672  *
673  * NOTE: if you change this, see also the similar code in _bt_buildadd().
674  */
675  if (itemsz > BTMaxItemSize(page))
676  ereport(ERROR,
677  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
678  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
679  itemsz, BTMaxItemSize(page),
680  RelationGetRelationName(rel)),
681  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
682  "Consider a function index of an MD5 hash of the value, "
683  "or use full text indexing."),
684  errtableconstraint(heapRel,
685  RelationGetRelationName(rel))));
686 
687  /*----------
688  * If we will need to split the page to put the item on this page,
689  * check whether we can put the tuple somewhere to the right,
690  * instead. Keep scanning right until we
691  * (a) find a page with enough free space,
692  * (b) reach the last page where the tuple can legally go, or
693  * (c) get tired of searching.
694  * (c) is not flippant; it is important because if there are many
695  * pages' worth of equal keys, it's better to split one of the early
696  * pages than to scan all the way to the end of the run of equal keys
697  * on every insert. We implement "get tired" as a random choice,
698  * since stopping after scanning a fixed number of pages wouldn't work
699  * well (we'd never reach the right-hand side of previously split
700  * pages). Currently the probability of moving right is set at 0.99,
701  * which may seem too high to change the behavior much, but it does an
702  * excellent job of preventing O(N^2) behavior with many equal keys.
703  *----------
704  */
705  movedright = false;
706  vacuumed = false;
707  while (PageGetFreeSpace(page) < itemsz)
708  {
709  Buffer rbuf;
710  BlockNumber rblkno;
711 
712  /*
713  * before considering moving right, see if we can obtain enough space
714  * by erasing LP_DEAD items
715  */
716  if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
717  {
718  _bt_vacuum_one_page(rel, buf, heapRel);
719 
720  /*
721  * remember that we vacuumed this page, because that makes the
722  * hint supplied by the caller invalid
723  */
724  vacuumed = true;
725 
726  if (PageGetFreeSpace(page) >= itemsz)
727  break; /* OK, now we have enough space */
728  }
729 
730  /*
731  * nope, so check conditions (b) and (c) enumerated above
732  */
733  if (P_RIGHTMOST(lpageop) ||
734  _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
735  random() <= (MAX_RANDOM_VALUE / 100))
736  break;
737 
738  /*
739  * step right to next non-dead page
740  *
741  * must write-lock that page before releasing write lock on current
742  * page; else someone else's _bt_check_unique scan could fail to see
743  * our insertion. write locks on intermediate dead pages won't do
744  * because we don't know when they will get de-linked from the tree.
745  */
746  rbuf = InvalidBuffer;
747 
748  rblkno = lpageop->btpo_next;
749  for (;;)
750  {
751  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
752  page = BufferGetPage(rbuf);
753  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
754 
755  /*
756  * If this page was incompletely split, finish the split now. We
757  * do this while holding a lock on the left sibling, which is not
758  * good because finishing the split could be a fairly lengthy
759  * operation. But this should happen very seldom.
760  */
761  if (P_INCOMPLETE_SPLIT(lpageop))
762  {
763  _bt_finish_split(rel, rbuf, stack);
764  rbuf = InvalidBuffer;
765  continue;
766  }
767 
768  if (!P_IGNORE(lpageop))
769  break;
770  if (P_RIGHTMOST(lpageop))
771  elog(ERROR, "fell off the end of index \"%s\"",
772  RelationGetRelationName(rel));
773 
774  rblkno = lpageop->btpo_next;
775  }
776  _bt_relbuf(rel, buf);
777  buf = rbuf;
778  movedright = true;
779  vacuumed = false;
780  }
781 
782  /*
783  * Now we are on the right page, so find the insert position. If we moved
784  * right at all, we know we should insert at the start of the page. If we
785  * didn't move right, we can use the firstlegaloff hint if the caller
786  * supplied one, unless we vacuumed the page which might have moved tuples
787  * around making the hint invalid. If we didn't move right or can't use
788  * the hint, find the position by searching.
789  */
790  if (movedright)
791  newitemoff = P_FIRSTDATAKEY(lpageop);
792  else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
793  newitemoff = firstlegaloff;
794  else
795  newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
796 
797  *bufptr = buf;
798  *offsetptr = newitemoff;
799 }
800 
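A note on the "get tired" test in the loop above: random() <= (MAX_RANDOM_VALUE / 100) fires with probability of roughly 1/100 at each page boundary, so, treating the draws as independent, the number of equal-key pages scanned before we give up and split is approximately geometric with mean about 1 / 0.01 = 100 pages. That keeps the per-insert work bounded while still letting inserts spread across a long run of equal keys, which is what prevents the O(N^2) behavior described in the comment above.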
801 /*----------
802  * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
803  *
804  * This recursive procedure does the following things:
805  *
806  * + if necessary, splits the target page (making sure that the
807  * split is equitable as far as post-insert free space goes).
808  * + inserts the tuple.
809  * + if the page was split, pops the parent stack, and finds the
810  * right place to insert the new child pointer (by walking
811  * right using information stored in the parent stack).
812  * + invokes itself with the appropriate tuple for the right
813  * child page on the parent.
814  * + updates the metapage if a true root or fast root is split.
815  *
816  * On entry, we must have the correct buffer in which to do the
817  * insertion, and the buffer must be pinned and write-locked. On return,
818  * we will have dropped both the pin and the lock on the buffer.
819  *
820  * This routine only performs retail tuple insertions. 'itup' should
821  * always be either a non-highkey leaf item, or a downlink (new high
822  * key items are created indirectly, when a page is split). When
823  * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
824  * we're inserting the downlink for. This function will clear the
825  * INCOMPLETE_SPLIT flag on it, and release the buffer.
826  *
827  * The locking interactions in this code are critical. You should
828  * grok Lehman and Yao's paper before making any changes. In addition,
829  * you need to understand how we disambiguate duplicate keys in this
830  * implementation, in order to be able to find our location using
831  * L&Y "move right" operations.
832  *----------
833  */
834 static void
835 _bt_insertonpg(Relation rel,
836  Buffer buf,
837  Buffer cbuf,
838  BTStack stack,
839  IndexTuple itup,
840  OffsetNumber newitemoff,
841  bool split_only_page)
842 {
843  Page page;
844  BTPageOpaque lpageop;
845  OffsetNumber firstright = InvalidOffsetNumber;
846  Size itemsz;
847 
848  page = BufferGetPage(buf);
849  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
850 
851  /* child buffer must be given iff inserting on an internal page */
852  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
853  /* tuple must have appropriate number of attributes */
854  Assert(!P_ISLEAF(lpageop) ||
855  BTreeTupleGetNAtts(itup, rel) ==
856  IndexRelationGetNumberOfAttributes(rel));
857  Assert(P_ISLEAF(lpageop) ||
858  BTreeTupleGetNAtts(itup, rel) ==
859  IndexRelationGetNumberOfKeyAttributes(rel));
860 
861  /* The caller should've finished any incomplete splits already. */
862  if (P_INCOMPLETE_SPLIT(lpageop))
863  elog(ERROR, "cannot insert to incompletely split page %u",
864  BufferGetBlockNumber(buf));
865 
866  itemsz = IndexTupleSize(itup);
867  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
868  * need to be consistent */
869 
870  /*
871  * Do we need to split the page to fit the item on it?
872  *
873  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
874  * so this comparison is correct even though we appear to be accounting
875  * only for the item and not for its line pointer.
876  */
877  if (PageGetFreeSpace(page) < itemsz)
878  {
879  bool is_root = P_ISROOT(lpageop);
880  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
881  bool newitemonleft;
882  Buffer rbuf;
883 
884  /*
885  * If we're here then a pagesplit is needed. We should never reach
886  * here if we're using the fastpath since we should have checked for
887  * all the required conditions, including the fact that this page has
888  * enough freespace. Note that this routine can in theory deal with
889  * the situation where a NULL stack pointer is passed (that's what
890  * would happen if the fastpath is taken), like it does during crash
891  * recovery. But that path is much slower, defeating the very purpose
892  * of the optimization. The following assertion should protect us
893  * from any future code changes that invalidate those assumptions.
894  *
895  * Note that whenever we fail to take the fastpath, we clear the
896  * cached block. Checking for a valid cached block at this point is
897  * enough to decide whether we're in a fastpath or not.
898  */
899  Assert(!(P_ISLEAF(lpageop) &&
900  BlockNumberIsValid(RelationGetTargetBlock(rel))));
901 
902  /* Choose the split point */
903  firstright = _bt_findsplitloc(rel, page,
904  newitemoff, itemsz,
905  &newitemonleft);
906 
907  /* split the buffer into left and right halves */
908  rbuf = _bt_split(rel, buf, cbuf, firstright,
909  newitemoff, itemsz, itup, newitemonleft);
910  PredicateLockPageSplit(rel,
911  BufferGetBlockNumber(buf),
912  BufferGetBlockNumber(rbuf));
913 
914  /*----------
915  * By here,
916  *
917  * + our target page has been split;
918  * + the original tuple has been inserted;
919  * + we have write locks on both the old (left half)
920  * and new (right half) buffers, after the split; and
921  * + we know the key we want to insert into the parent
922  * (it's the "high key" on the left child page).
923  *
924  * We're ready to do the parent insertion. We need to hold onto the
925  * locks for the child pages until we locate the parent, but we can
926  * release them before doing the actual insertion (see Lehman and Yao
927  * for the reasoning).
928  *----------
929  */
930  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
931  }
932  else
933  {
934  Buffer metabuf = InvalidBuffer;
935  Page metapg = NULL;
936  BTMetaPageData *metad = NULL;
937  OffsetNumber itup_off;
938  BlockNumber itup_blkno;
939  BlockNumber cachedBlock = InvalidBlockNumber;
940 
941  itup_off = newitemoff;
942  itup_blkno = BufferGetBlockNumber(buf);
943 
944  /*
945  * If we are doing this insert because we split a page that was the
946  * only one on its tree level, but was not the root, it may have been
947  * the "fast root". We need to ensure that the fast root link points
948  * at or above the current page. We can safely acquire a lock on the
949  * metapage here --- see comments for _bt_newroot().
950  */
951  if (split_only_page)
952  {
953  Assert(!P_ISLEAF(lpageop));
954 
955  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
956  metapg = BufferGetPage(metabuf);
957  metad = BTPageGetMeta(metapg);
958 
959  if (metad->btm_fastlevel >= lpageop->btpo.level)
960  {
961  /* no update wanted */
962  _bt_relbuf(rel, metabuf);
963  metabuf = InvalidBuffer;
964  }
965  }
966 
967  /*
968  * Every internal page should have exactly one negative infinity item
969  * at all times. Only _bt_split() and _bt_newroot() should add items
970  * that become negative infinity items through truncation, since
971  * they're the only routines that allocate new internal pages. Do not
972  * allow a retail insertion of a new item at the negative infinity
973  * offset.
974  */
975  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
976  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
977  itup_blkno, RelationGetRelationName(rel));
978 
979  /* Do the update. No ereport(ERROR) until changes are logged */
980  START_CRIT_SECTION();
981 
982  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
983  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
984  itup_blkno, RelationGetRelationName(rel));
985 
986  MarkBufferDirty(buf);
987 
988  if (BufferIsValid(metabuf))
989  {
990  /* upgrade meta-page if needed */
991  if (metad->btm_version < BTREE_VERSION)
992  _bt_upgrademetapage(metapg);
993  metad->btm_fastroot = itup_blkno;
994  metad->btm_fastlevel = lpageop->btpo.level;
995  MarkBufferDirty(metabuf);
996  }
997 
998  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
999  if (BufferIsValid(cbuf))
1000  {
1001  Page cpage = BufferGetPage(cbuf);
1002  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1003 
1004  Assert(P_INCOMPLETE_SPLIT(cpageop));
1005  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1006  MarkBufferDirty(cbuf);
1007  }
1008 
1009  /*
1010  * Cache the block information if we just inserted into the rightmost
1011  * leaf page of the index and it's not the root page. For a very small
1012  * index where the root is also the leaf, there is no point in trying
1013  * the optimization.
1014  */
1015  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1016  cachedBlock = BufferGetBlockNumber(buf);
1017 
1018  /* XLOG stuff */
1019  if (RelationNeedsWAL(rel))
1020  {
1021  xl_btree_insert xlrec;
1022  xl_btree_metadata xlmeta;
1023  uint8 xlinfo;
1024  XLogRecPtr recptr;
1025 
1026  xlrec.offnum = itup_off;
1027 
1028  XLogBeginInsert();
1029  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1030 
1031  if (P_ISLEAF(lpageop))
1032  xlinfo = XLOG_BTREE_INSERT_LEAF;
1033  else
1034  {
1035  /*
1036  * Register the left child whose INCOMPLETE_SPLIT flag was
1037  * cleared.
1038  */
1039  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1040 
1041  xlinfo = XLOG_BTREE_INSERT_UPPER;
1042  }
1043 
1044  if (BufferIsValid(metabuf))
1045  {
1046  xlmeta.root = metad->btm_root;
1047  xlmeta.level = metad->btm_level;
1048  xlmeta.fastroot = metad->btm_fastroot;
1049  xlmeta.fastlevel = metad->btm_fastlevel;
1050  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1051  xlmeta.last_cleanup_num_heap_tuples =
1052  metad->btm_last_cleanup_num_heap_tuples;
1053 
1054  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
1055  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1056 
1057  xlinfo = XLOG_BTREE_INSERT_META;
1058  }
1059 
1060  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1061  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1062 
1063  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1064 
1065  if (BufferIsValid(metabuf))
1066  {
1067  PageSetLSN(metapg, recptr);
1068  }
1069  if (BufferIsValid(cbuf))
1070  {
1071  PageSetLSN(BufferGetPage(cbuf), recptr);
1072  }
1073 
1074  PageSetLSN(page, recptr);
1075  }
1076 
1077  END_CRIT_SECTION();
1078 
1079  /* release buffers */
1080  if (BufferIsValid(metabuf))
1081  _bt_relbuf(rel, metabuf);
1082  if (BufferIsValid(cbuf))
1083  _bt_relbuf(rel, cbuf);
1084  _bt_relbuf(rel, buf);
1085 
1086  /*
1087  * If we decided to cache the insertion target block, then set it now.
1088  * But before that, check for the height of the tree and don't go for
1089  * the optimization for small indexes. We defer that check to this
1090  * point to ensure that we don't call _bt_getrootheight while holding
1091  * lock on any other block.
1092  *
1093  * We do this after dropping locks on all buffers. So the information
1094  * about whether the insertion block is still the rightmost block or
1095  * not may have changed in between. But we will deal with that during
1096  * next insert operation. No special care is required while setting
1097  * it.
1098  */
1099  if (BlockNumberIsValid(cachedBlock) &&
1100  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1101  RelationSetTargetBlock(rel, cachedBlock);
1102  }
1103 }
1104 
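Both the non-split path above and _bt_split() below follow the same WAL-logging discipline: modify pages only inside a critical section, emit one WAL record, and stamp every modified page with the record's LSN before leaving the section. A generic, condensed sketch of that pattern follows; it is a fragment (xlrec, xlinfo, buf and page are assumed to be set up as in the code above) and apply_page_changes() is a hypothetical placeholder for the actual page edits:

    START_CRIT_SECTION();

    apply_page_changes(page);       /* e.g. PageAddItem() plus flag updates */
    MarkBufferDirty(buf);

    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, sizeof(xlrec));
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_BTREE_ID, xlinfo);
        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();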
1105 /*
1106  * _bt_split() -- split a page in the btree.
1107  *
1108  * On entry, buf is the page to split, and is pinned and write-locked.
1109  * firstright is the item index of the first item to be moved to the
1110  * new right page. newitemoff etc. tell us about the new item that
1111  * must be inserted along with the data from the old page.
1112  *
1113  * When splitting a non-leaf page, 'cbuf' is the left-sibling of the
1114  * page we're inserting the downlink for. This function will clear the
1115  * INCOMPLETE_SPLIT flag on it, and release the buffer.
1116  *
1117  * Returns the new right sibling of buf, pinned and write-locked.
1118  * The pin and lock on buf are maintained.
1119  */
1120 static Buffer
1121 _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1122  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1123  bool newitemonleft)
1124 {
1125  Buffer rbuf;
1126  Page origpage;
1127  Page leftpage,
1128  rightpage;
1129  BlockNumber origpagenumber,
1130  rightpagenumber;
1131  BTPageOpaque ropaque,
1132  lopaque,
1133  oopaque;
1134  Buffer sbuf = InvalidBuffer;
1135  Page spage = NULL;
1136  BTPageOpaque sopaque = NULL;
1137  Size itemsz;
1138  ItemId itemid;
1139  IndexTuple item;
1140  OffsetNumber leftoff,
1141  rightoff;
1142  OffsetNumber maxoff;
1143  OffsetNumber i;
1144  bool isleaf;
1145  IndexTuple lefthikey;
1146  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1147  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1148 
1149  /* Acquire a new page to split into */
1150  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1151 
1152  /*
1153  * origpage is the original page to be split. leftpage is a temporary
1154  * buffer that receives the left-sibling data, which will be copied back
1155  * into origpage on success. rightpage is the new page that receives the
1156  * right-sibling data. If we fail before reaching the critical section,
1157  * origpage hasn't been modified and leftpage is only workspace. In
1158  * principle we shouldn't need to worry about rightpage either, because it
1159  * hasn't been linked into the btree page structure; but to avoid leaving
1160  * possibly-confusing junk behind, we are careful to rewrite rightpage as
1161  * zeroes before throwing any error.
1162  */
1163  origpage = BufferGetPage(buf);
1164  leftpage = PageGetTempPage(origpage);
1165  rightpage = BufferGetPage(rbuf);
1166 
1167  origpagenumber = BufferGetBlockNumber(buf);
1168  rightpagenumber = BufferGetBlockNumber(rbuf);
1169 
1170  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1171  /* rightpage was already initialized by _bt_getbuf */
1172 
1173  /*
1174  * Copy the original page's LSN into leftpage, which will become the
1175  * updated version of the page. We need this because XLogInsert will
1176  * examine the LSN and possibly dump it in a page image.
1177  */
1178  PageSetLSN(leftpage, PageGetLSN(origpage));
1179 
1180  /* init btree private data */
1181  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1182  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1183  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1184 
1185  isleaf = P_ISLEAF(oopaque);
1186 
1187  /* if we're splitting this page, it won't be the root when we're done */
1188  /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1189  lopaque->btpo_flags = oopaque->btpo_flags;
1190  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1191  ropaque->btpo_flags = lopaque->btpo_flags;
1192  /* set flag in left page indicating that the right page has no downlink */
1193  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1194  lopaque->btpo_prev = oopaque->btpo_prev;
1195  lopaque->btpo_next = rightpagenumber;
1196  ropaque->btpo_prev = origpagenumber;
1197  ropaque->btpo_next = oopaque->btpo_next;
1198  lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
1199  /* Since we already have write-lock on both pages, ok to read cycleid */
1200  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1201  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1202 
1203  /*
1204  * If the page we're splitting is not the rightmost page at its level in
1205  * the tree, then the first entry on the page is the high key for the
1206  * page. We need to copy that to the right half. Otherwise (meaning the
1207  * rightmost page case), all the items on the right half will be user
1208  * data.
1209  */
1210  rightoff = P_HIKEY;
1211 
1212  if (!P_RIGHTMOST(oopaque))
1213  {
1214  itemid = PageGetItemId(origpage, P_HIKEY);
1215  itemsz = ItemIdGetLength(itemid);
1216  item = (IndexTuple) PageGetItem(origpage, itemid);
1217  Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
1218  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1219  false, false) == InvalidOffsetNumber)
1220  {
1221  memset(rightpage, 0, BufferGetPageSize(rbuf));
1222  elog(ERROR, "failed to add hikey to the right sibling"
1223  " while splitting block %u of index \"%s\"",
1224  origpagenumber, RelationGetRelationName(rel));
1225  }
1226  rightoff = OffsetNumberNext(rightoff);
1227  }
1228 
1229  /*
1230  * The "high key" for the new left page will be the first key that's going
1231  * to go into the new right page. This might be either the existing data
1232  * item at position firstright, or the incoming tuple.
1233  */
1234  leftoff = P_HIKEY;
1235  if (!newitemonleft && newitemoff == firstright)
1236  {
1237  /* incoming tuple will become first on right page */
1238  itemsz = newitemsz;
1239  item = newitem;
1240  }
1241  else
1242  {
1243  /* existing item at firstright will become first on right page */
1244  itemid = PageGetItemId(origpage, firstright);
1245  itemsz = ItemIdGetLength(itemid);
1246  item = (IndexTuple) PageGetItem(origpage, itemid);
1247  }
1248 
1249  /*
1250  * Truncate non-key (INCLUDE) attributes of the high key item before
1251  * inserting it on the left page. This only needs to happen at the leaf
1252  * level, since in general all pivot tuple values originate from leaf
1253  * level high keys. This isn't just about avoiding unnecessary work,
1254  * though; truncating unneeded key attributes (more aggressive suffix
1255  * truncation) can only be performed at the leaf level anyway. This is
1256  * because a pivot tuple in a grandparent page must guide a search not
1257  * only to the correct parent page, but also to the correct leaf page.
1258  */
1259  if (indnatts != indnkeyatts && isleaf)
1260  {
1261  lefthikey = _bt_nonkey_truncate(rel, item);
1262  itemsz = IndexTupleSize(lefthikey);
1263  itemsz = MAXALIGN(itemsz);
1264  }
1265  else
1266  lefthikey = item;
1267 
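	/*
	 * For instance, in an index declared on (a) INCLUDE (b), the leaf-level
	 * high key built here keeps only the key column a; the included b value
	 * is stripped by _bt_nonkey_truncate(), since pivot tuples only need to
	 * guide searches, not to return data. (Illustrative example, not from
	 * the original source.)
	 */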
1268  Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
1269  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1270  false, false) == InvalidOffsetNumber)
1271  {
1272  memset(rightpage, 0, BufferGetPageSize(rbuf));
1273  elog(ERROR, "failed to add hikey to the left sibling"
1274  " while splitting block %u of index \"%s\"",
1275  origpagenumber, RelationGetRelationName(rel));
1276  }
1277  leftoff = OffsetNumberNext(leftoff);
1278  /* be tidy */
1279  if (lefthikey != item)
1280  pfree(lefthikey);
1281 
1282  /*
1283  * Now transfer all the data items to the appropriate page.
1284  *
1285  * Note: we *must* insert at least the right page's items in item-number
1286  * order, for the benefit of _bt_restore_page().
1287  */
1288  maxoff = PageGetMaxOffsetNumber(origpage);
1289 
1290  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1291  {
1292  itemid = PageGetItemId(origpage, i);
1293  itemsz = ItemIdGetLength(itemid);
1294  item = (IndexTuple) PageGetItem(origpage, itemid);
1295 
1296  /* does new item belong before this one? */
1297  if (i == newitemoff)
1298  {
1299  if (newitemonleft)
1300  {
1301  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1302  {
1303  memset(rightpage, 0, BufferGetPageSize(rbuf));
1304  elog(ERROR, "failed to add new item to the left sibling"
1305  " while splitting block %u of index \"%s\"",
1306  origpagenumber, RelationGetRelationName(rel));
1307  }
1308  leftoff = OffsetNumberNext(leftoff);
1309  }
1310  else
1311  {
1312  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1313  {
1314  memset(rightpage, 0, BufferGetPageSize(rbuf));
1315  elog(ERROR, "failed to add new item to the right sibling"
1316  " while splitting block %u of index \"%s\"",
1317  origpagenumber, RelationGetRelationName(rel));
1318  }
1319  rightoff = OffsetNumberNext(rightoff);
1320  }
1321  }
1322 
1323  /* decide which page to put it on */
1324  if (i < firstright)
1325  {
1326  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1327  {
1328  memset(rightpage, 0, BufferGetPageSize(rbuf));
1329  elog(ERROR, "failed to add old item to the left sibling"
1330  " while splitting block %u of index \"%s\"",
1331  origpagenumber, RelationGetRelationName(rel));
1332  }
1333  leftoff = OffsetNumberNext(leftoff);
1334  }
1335  else
1336  {
1337  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1338  {
1339  memset(rightpage, 0, BufferGetPageSize(rbuf));
1340  elog(ERROR, "failed to add old item to the right sibling"
1341  " while splitting block %u of index \"%s\"",
1342  origpagenumber, RelationGetRelationName(rel));
1343  }
1344  rightoff = OffsetNumberNext(rightoff);
1345  }
1346  }
1347 
1348  /* cope with possibility that newitem goes at the end */
1349  if (i <= newitemoff)
1350  {
1351  /*
1352  * Can't have newitemonleft here; that would imply we were told to put
1353  * *everything* on the left page, which cannot fit (if it could, we'd
1354  * not be splitting the page).
1355  */
1356  Assert(!newitemonleft);
1357  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1358  {
1359  memset(rightpage, 0, BufferGetPageSize(rbuf));
1360  elog(ERROR, "failed to add new item to the right sibling"
1361  " while splitting block %u of index \"%s\"",
1362  origpagenumber, RelationGetRelationName(rel));
1363  }
1364  rightoff = OffsetNumberNext(rightoff);
1365  }
1366 
1367  /*
1368  * We have to grab the right sibling (if any) and fix the prev pointer
1369  * there. We are guaranteed that this is deadlock-free since no other
1370  * writer will be holding a lock on that page and trying to move left, and
1371  * all readers release locks on a page before trying to fetch its
1372  * neighbors.
1373  */
1374 
1375  if (!P_RIGHTMOST(oopaque))
1376  {
1377  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1378  spage = BufferGetPage(sbuf);
1379  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1380  if (sopaque->btpo_prev != origpagenumber)
1381  {
1382  memset(rightpage, 0, BufferGetPageSize(rbuf));
1383  elog(ERROR, "right sibling's left-link doesn't match: "
1384  "block %u links to %u instead of expected %u in index \"%s\"",
1385  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1386  RelationGetRelationName(rel));
1387  }
1388 
1389  /*
1390  * Check to see if we can set the SPLIT_END flag in the right-hand
1391  * split page; this can save some I/O for vacuum since it need not
1392  * proceed to the right sibling. We can set the flag if the right
1393  * sibling has a different cycleid: that means it could not be part of
1394  * a group of pages that were all split off from the same ancestor
1395  * page. If you're confused, imagine that page A splits to A B and
1396  * then again, yielding A C B, while vacuum is in progress. Tuples
1397  * originally in A could now be in either B or C, hence vacuum must
1398  * examine both pages. But if D, our right sibling, has a different
1399  * cycleid then it could not contain any tuples that were in A when
1400  * the vacuum started.
1401  */
1402  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1403  ropaque->btpo_flags |= BTP_SPLIT_END;
1404  }
1405 
1406  /*
1407  * Right sibling is locked, new siblings are prepared, but original page
1408  * is not updated yet.
1409  *
1410  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1411  * not starting the critical section till here because we haven't been
1412  * scribbling on the original page yet; see comments above.
1413  */
1414  START_CRIT_SECTION();
1415 
1416  /*
1417  * By here, the original data page has been split into two new halves, and
1418  * these are correct. The algorithm requires that the left page never
1419  * move during a split, so we copy the new left page back on top of the
1420  * original. Note that this is not a waste of time, since we also require
1421  * (in the page management code) that the center of a page always be
1422  * clean, and the most efficient way to guarantee this is just to compact
1423  * the data by reinserting it into a new left page. (XXX the latter
1424  * comment is probably obsolete; but in any case it's good to not scribble
1425  * on the original page until we enter the critical section.)
1426  *
1427  * We need to do this before writing the WAL record, so that XLogInsert
1428  * can WAL log an image of the page if necessary.
1429  */
1430  PageRestoreTempPage(leftpage, origpage);
1431  /* leftpage, lopaque must not be used below here */
1432 
1433  MarkBufferDirty(buf);
1434  MarkBufferDirty(rbuf);
1435 
1436  if (!P_RIGHTMOST(ropaque))
1437  {
1438  sopaque->btpo_prev = rightpagenumber;
1439  MarkBufferDirty(sbuf);
1440  }
1441 
1442  /*
1443  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1444  * a split.
1445  */
1446  if (!isleaf)
1447  {
1448  Page cpage = BufferGetPage(cbuf);
1449  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1450 
1451  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1452  MarkBufferDirty(cbuf);
1453  }
1454 
1455  /* XLOG stuff */
1456  if (RelationNeedsWAL(rel))
1457  {
1458  xl_btree_split xlrec;
1459  uint8 xlinfo;
1460  XLogRecPtr recptr;
1461  bool loglhikey = false;
1462 
1463  xlrec.level = ropaque->btpo.level;
1464  xlrec.firstright = firstright;
1465  xlrec.newitemoff = newitemoff;
1466 
1467  XLogBeginInsert();
1468  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1469 
1470  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1471  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1472  /* Log the right sibling, because we've changed its prev-pointer. */
1473  if (!P_RIGHTMOST(ropaque))
1474  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1475  if (BufferIsValid(cbuf))
1476  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1477 
1478  /*
1479  * Log the new item, if it was inserted on the left page. (If it was
1480  * put on the right page, we don't need to explicitly WAL log it
1481  * because it's included with all the other items on the right page.)
1482  * Show the new item as belonging to the left page buffer, so that it
1483  * is not stored if XLogInsert decides it needs a full-page image of
1484  * the left page. We store the offset anyway, though, to support
1485  * archive compression of these records.
1486  */
1487  if (newitemonleft)
1488  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1489 
1490  /* Log left page */
1491  if (!isleaf || indnatts != indnkeyatts)
1492  {
1493  /*
1494  * We must also log the left page's high key. There are two
1495  * reasons for that: right page's leftmost key is suppressed on
1496  * non-leaf levels and in covering indexes included columns are
1497  * truncated from high keys. Show it as belonging to the left
1498  * page buffer, so that it is not stored if XLogInsert decides it
1499  * needs a full-page image of the left page.
1500  */
1501  itemid = PageGetItemId(origpage, P_HIKEY);
1502  item = (IndexTuple) PageGetItem(origpage, itemid);
1503  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1504  loglhikey = true;
1505  }
1506 
1507  /*
1508  * Log the contents of the right page in the format understood by
1509  * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1510  * because we're going to recreate the whole page anyway, so it should
1511  * never be stored by XLogInsert.
1512  *
1513  * Direct access to page is not good but faster - we should implement
1514  * some new func in page API. Note we only store the tuples
1515  * themselves, knowing that they were inserted in item-number order
1516  * and so the item pointers can be reconstructed. See comments for
1517  * _bt_restore_page().
1518  */
1519  XLogRegisterBufData(1,
1520  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1521  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1522 
1523  xlinfo = newitemonleft ?
1524  (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) :
1525  (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R);
1526  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1527 
1528  PageSetLSN(origpage, recptr);
1529  PageSetLSN(rightpage, recptr);
1530  if (!P_RIGHTMOST(ropaque))
1531  {
1532  PageSetLSN(spage, recptr);
1533  }
1534  if (!isleaf)
1535  {
1536  PageSetLSN(BufferGetPage(cbuf), recptr);
1537  }
1538  }
1539 
1540  END_CRIT_SECTION();
1541 
1542  /* release the old right sibling */
1543  if (!P_RIGHTMOST(ropaque))
1544  _bt_relbuf(rel, sbuf);
1545 
1546  /* release the child */
1547  if (!isleaf)
1548  _bt_relbuf(rel, cbuf);
1549 
1550  /* split's done */
1551  return rbuf;
1552 }
1553 
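The reason a concurrent search is not confused by the split just performed is the Lehman & Yao "move right" rule referenced from _bt_doinsert(): a descent that lands on the left half can notice that its key now sorts beyond the page's high key and follow btpo_next. A simplified sketch of that rule (a fragment; buf, page, opaque, keysz, scankey and access are assumed to be set up as in the search code, and the real _bt_moveright() additionally handles incomplete splits and the nextkey case):

    while (!P_RIGHTMOST(opaque) &&
           _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
    {
        /* key is beyond this page's high key; chase the right-link */
        buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }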
1554 /*
1555  * _bt_findsplitloc() -- find an appropriate place to split a page.
1556  *
1557  * The idea here is to equalize the free space that will be on each split
1558  * page, *after accounting for the inserted tuple*. (If we fail to account
1559  * for it, we might find ourselves with too little room on the page that
1560  * it needs to go into!)
1561  *
1562  * If the page is the rightmost page on its level, we instead try to arrange
1563  * to leave the left split page fillfactor% full. In this way, when we are
1564  * inserting successively increasing keys (consider sequences, timestamps,
1565  * etc) we will end up with a tree whose pages are about fillfactor% full,
1566  * instead of the 50% full result that we'd get without this special case.
1567  * This is the same as nbtsort.c produces for a newly-created tree. Note
1568  * that leaf and nonleaf pages use different fillfactors.
1569  *
1570  * We are passed the intended insert position of the new tuple, expressed as
1571  * the offsetnumber of the tuple it must go in front of. (This could be
1572  * maxoff+1 if the tuple is to go at the end.)
1573  *
1574  * We return the index of the first existing tuple that should go on the
1575  * righthand page, plus a boolean indicating whether the new tuple goes on
1576  * the left or right page. The bool is necessary to disambiguate the case
1577  * where firstright == newitemoff.
1578  */
1579 static OffsetNumber
1580 _bt_findsplitloc(Relation rel,
1581  Page page,
1582  OffsetNumber newitemoff,
1583  Size newitemsz,
1584  bool *newitemonleft)
1585 {
1586  BTPageOpaque opaque;
1587  OffsetNumber offnum;
1588  OffsetNumber maxoff;
1589  ItemId itemid;
1590  FindSplitData state;
1591  int leftspace,
1592  rightspace,
1593  goodenough,
1594  olddataitemstotal,
1595  olddataitemstoleft;
1596  bool goodenoughfound;
1597 
1598  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1599 
1600  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1601  newitemsz += sizeof(ItemIdData);
1602 
1603  /* Total free space available on a btree page, after fixed overhead */
1604  leftspace = rightspace =
1605  PageGetPageSize(page) - SizeOfPageHeaderData -
1606  MAXALIGN(sizeof(BTPageOpaqueData));
1607 
1608  /* The right page will have the same high key as the old page */
1609  if (!P_RIGHTMOST(opaque))
1610  {
1611  itemid = PageGetItemId(page, P_HIKEY);
1612  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1613  sizeof(ItemIdData));
1614  }
1615 
1616  /* Count up total space in data items without actually scanning 'em */
1617  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
1618 
1619  state.newitemsz = newitemsz;
1620  state.is_leaf = P_ISLEAF(opaque);
1621  state.is_rightmost = P_RIGHTMOST(opaque);
1622  state.have_split = false;
1623  if (state.is_leaf)
1624  state.fillfactor = RelationGetFillFactor(rel,
1625  BTREE_DEFAULT_FILLFACTOR);
1626  else
1627  state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
1628  state.newitemonleft = false; /* these just to keep compiler quiet */
1629  state.firstright = 0;
1630  state.best_delta = 0;
1631  state.leftspace = leftspace;
1632  state.rightspace = rightspace;
1633  state.olddataitemstotal = olddataitemstotal;
1634  state.newitemoff = newitemoff;
1635 
1636  /*
1637  * Finding the best possible split would require checking all the possible
1638  * split points, because of the high-key and left-key special cases.
1639  * That's probably more work than it's worth; instead, stop as soon as we
1640  * find a "good-enough" split, where good-enough is defined as an
1641  * imbalance in free space of no more than pagesize/16 (arbitrary...) This
1642  * should let us stop near the middle on most pages, instead of plowing to
1643  * the end.
1644  */
1645  goodenough = leftspace / 16;
1646 
1647  /*
1648  * Scan through the data items and calculate space usage for a split at
1649  * each possible position.
1650  */
1651  olddataitemstoleft = 0;
1652  goodenoughfound = false;
1653  maxoff = PageGetMaxOffsetNumber(page);
1654 
1655  for (offnum = P_FIRSTDATAKEY(opaque);
1656  offnum <= maxoff;
1657  offnum = OffsetNumberNext(offnum))
1658  {
1659  Size itemsz;
1660 
1661  itemid = PageGetItemId(page, offnum);
1662  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1663 
1664  /*
1665  * Will the new item go to left or right of split?
1666  */
1667  if (offnum > newitemoff)
1668  _bt_checksplitloc(&state, offnum, true,
1669  olddataitemstoleft, itemsz);
1670 
1671  else if (offnum < newitemoff)
1672  _bt_checksplitloc(&state, offnum, false,
1673  olddataitemstoleft, itemsz);
1674  else
1675  {
1676  /* need to try it both ways! */
1677  _bt_checksplitloc(&state, offnum, true,
1678  olddataitemstoleft, itemsz);
1679 
1680  _bt_checksplitloc(&state, offnum, false,
1681  olddataitemstoleft, itemsz);
1682  }
1683 
1684  /* Abort scan once we find a good-enough choice */
1685  if (state.have_split && state.best_delta <= goodenough)
1686  {
1687  goodenoughfound = true;
1688  break;
1689  }
1690 
1691  olddataitemstoleft += itemsz;
1692  }
1693 
1694  /*
1695  * If the new item goes as the last item, check for splitting so that all
1696  * the old items go to the left page and the new item goes to the right
1697  * page.
1698  */
1699  if (newitemoff > maxoff && !goodenoughfound)
1700  _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
1701 
1702  /*
1703  * I believe it is not possible to fail to find a feasible split, but just
1704  * in case ...
1705  */
1706  if (!state.have_split)
1707  elog(ERROR, "could not find a feasible split point for index \"%s\"",
1708  RelationGetRelationName(rel));
1709 
1710  *newitemonleft = state.newitemonleft;
1711  return state.firstright;
1712 }
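/*
 * Worked example for the "good enough" threshold used above (illustrative
 * only; assumes the default 8kB block size, a 24-byte page header, and a
 * 16-byte btree special area, so the exact figures are approximate):
 *
 *     leftspace = rightspace = 8192 - 24 - MAXALIGN(16) = 8152 bytes
 *     goodenough = 8152 / 16 = 509 bytes
 *
 * so the scan over candidate split points stops as soon as the free-space
 * imbalance between the two halves drops to roughly half a kilobyte.
 */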
1713 
1714 /*
1715  * Subroutine to analyze a particular possible split choice (ie, firstright
1716  * and newitemonleft settings), and record the best split so far in *state.
1717  *
1718  * firstoldonright is the offset of the first item on the original page
1719  * that goes to the right page, and firstoldonrightsz is the size of that
1720  * tuple. firstoldonright can be > max offset, which means that all the old
1721  * items go to the left page and only the new item goes to the right page.
1722  * In that case, firstoldonrightsz is not used.
1723  *
1724  * olddataitemstoleft is the total size of all old items to the left of
1725  * firstoldonright.
1726  */
1727 static void
1728 _bt_checksplitloc(FindSplitData *state,
1729  OffsetNumber firstoldonright,
1730  bool newitemonleft,
1731  int olddataitemstoleft,
1732  Size firstoldonrightsz)
1733 {
1734  int leftfree,
1735  rightfree;
1736  Size firstrightitemsz;
1737  bool newitemisfirstonright;
1738 
1739  /* Is the new item going to be the first item on the right page? */
1740  newitemisfirstonright = (firstoldonright == state->newitemoff
1741  && !newitemonleft);
1742 
1743  if (newitemisfirstonright)
1744  firstrightitemsz = state->newitemsz;
1745  else
1746  firstrightitemsz = firstoldonrightsz;
1747 
1748  /* Account for all the old tuples */
1749  leftfree = state->leftspace - olddataitemstoleft;
1750  rightfree = state->rightspace -
1751  (state->olddataitemstotal - olddataitemstoleft);
1752 
1753  /*
1754  * The first item on the right page becomes the high key of the left page;
1755  * therefore it counts against left space as well as right space. When the
1756  * index has included attributes, those attributes are truncated from the
1757  * left page's high key, leaving that page with slightly more free space.
1758  * However, that shouldn't affect our ability to find a valid split
1759  * location, because a valid split location must exist even without high
1760  * key truncation.
1761  */
1762  leftfree -= firstrightitemsz;
1763 
1764  /* account for the new item */
1765  if (newitemonleft)
1766  leftfree -= (int) state->newitemsz;
1767  else
1768  rightfree -= (int) state->newitemsz;
1769 
1770  /*
1771  * If we are not on the leaf level, we will be able to discard the key
1772  * data from the first item that winds up on the right page.
1773  */
1774  if (!state->is_leaf)
1775  rightfree += (int) firstrightitemsz -
1776  (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
1777 
1778  /*
1779  * If feasible split point, remember best delta.
1780  */
1781  if (leftfree >= 0 && rightfree >= 0)
1782  {
1783  int delta;
1784 
1785  if (state->is_rightmost)
1786  {
1787  /*
1788  * If splitting a rightmost page, try to put (100-fillfactor)% of
1789  * free space on left page. See comments for _bt_findsplitloc.
1790  */
1791  delta = (state->fillfactor * leftfree)
1792  - ((100 - state->fillfactor) * rightfree);
1793  }
1794  else
1795  {
1796  /* Otherwise, aim for equal free space on both sides */
1797  delta = leftfree - rightfree;
1798  }
1799 
1800  if (delta < 0)
1801  delta = -delta;
1802  if (!state->have_split || delta < state->best_delta)
1803  {
1804  state->have_split = true;
1805  state->newitemonleft = newitemonleft;
1806  state->firstright = firstoldonright;
1807  state->best_delta = delta;
1808  }
1809  }
1810 }
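/*
 * Worked example for the delta computed above (illustrative only, with
 * assumed byte counts). For a non-rightmost split the target is simply
 * |leftfree - rightfree|. For a rightmost leaf split with the default
 * fillfactor of 90, the weighted form
 *
 *     delta = |90 * leftfree - (100 - 90) * rightfree|
 *
 * reaches zero when leftfree : rightfree = 10 : 90; e.g. leftfree = 800 and
 * rightfree = 7200 give 90*800 - 10*7200 = 0, leaving the left page about
 * 90% full, as described in the _bt_findsplitloc comments.
 */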
1811 
1812 /*
1813  * _bt_insert_parent() -- Insert downlink into parent after a page split.
1814  *
1815  * On entry, buf and rbuf are the left and right split pages, which we
1816  * still hold write locks on per the L&Y algorithm. We release the
1817  * write locks once we have write lock on the parent page. (Any sooner,
1818  * and it'd be possible for some other process to try to split or delete
1819  * one of these pages, and get confused because it cannot find the downlink.)
1820  *
1821  * stack - stack showing how we got here. May be NULL in cases that don't
1822  * have to be efficient (concurrent ROOT split, WAL recovery)
1823  * is_root - we split the true root
1824  * is_only - we split a page alone on its level (might have been fast root)
1825  */
1826 static void
1827 _bt_insert_parent(Relation rel,
1828  Buffer buf,
1829  Buffer rbuf,
1830  BTStack stack,
1831  bool is_root,
1832  bool is_only)
1833 {
1834  /*
1835  * Here we have to do something Lehman and Yao don't talk about: deal with
1836  * a root split and construction of a new root. If our stack is empty
1837  * then we have just split a node on what had been the root level when we
1838  * descended the tree. If it was still the root then we perform a
1839  * new-root construction. If it *wasn't* the root anymore, search to find
1840  * the next higher level that someone constructed meanwhile, and find the
1841  * right place to insert as for the normal case.
1842  *
1843  * If we have to search for the parent level, we do so by re-descending
1844  * from the root. This is not super-efficient, but it's rare enough not
1845  * to matter.
1846  */
1847  if (is_root)
1848  {
1849  Buffer rootbuf;
1850 
1851  Assert(stack == NULL);
1852  Assert(is_only);
1853  /* create a new root node and update the metapage */
1854  rootbuf = _bt_newroot(rel, buf, rbuf);
1855  /* release the split buffers */
1856  _bt_relbuf(rel, rootbuf);
1857  _bt_relbuf(rel, rbuf);
1858  _bt_relbuf(rel, buf);
1859  }
1860  else
1861  {
1862  BlockNumber bknum = BufferGetBlockNumber(buf);
1863  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1864  Page page = BufferGetPage(buf);
1865  IndexTuple new_item;
1866  BTStackData fakestack;
1867  IndexTuple ritem;
1868  Buffer pbuf;
1869 
1870  if (stack == NULL)
1871  {
1872  BTPageOpaque lpageop;
1873 
1874  elog(DEBUG2, "concurrent ROOT page split");
1875  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1876  /* Find the leftmost page at the next level up */
1877  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1878  NULL);
1879  /* Set up a phony stack entry pointing there */
1880  stack = &fakestack;
1881  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1882  stack->bts_offset = InvalidOffsetNumber;
1883  stack->bts_btentry = InvalidBlockNumber;
1884  stack->bts_parent = NULL;
1885  _bt_relbuf(rel, pbuf);
1886  }
1887 
1888  /* get high key from left page == lower bound for new right page */
1889  ritem = (IndexTuple) PageGetItem(page,
1890  PageGetItemId(page, P_HIKEY));
1891 
1892  /* form an index tuple that points at the new right page */
1893  new_item = CopyIndexTuple(ritem);
1894  BTreeInnerTupleSetDownLink(new_item, rbknum);
1895 
1896  /*
1897  * Find the parent buffer and get the parent page.
1898  *
1899  * Oops - if we were moved right then we need to change stack item! We
1900  * want to find parent pointing to where we are, right ? - vadim
1901  * 05/27/97
1902  */
1903  stack->bts_btentry = bknum;
1904  pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1905 
1906  /*
1907  * Now we can unlock the right child. The left child will be unlocked
1908  * by _bt_insertonpg().
1909  */
1910  _bt_relbuf(rel, rbuf);
1911 
1912  /* Check for error only after writing children */
1913  if (pbuf == InvalidBuffer)
1914  elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1915  RelationGetRelationName(rel), bknum, rbknum);
1916 
1917  /* Recursively update the parent */
1918  _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
1919  new_item, stack->bts_offset + 1,
1920  is_only);
1921 
1922  /* be tidy */
1923  pfree(new_item);
1924  }
1925 }
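/*
 * Illustrative recap of the downlink built above (no additional logic, just
 * the shape of the two calls): the parent's new entry is the left page's
 * high key, re-pointed at the new right sibling,
 *
 *     new_item = CopyIndexTuple(ritem);             left page's high key
 *     BTreeInnerTupleSetDownLink(new_item, rbknum); right sibling's block
 *
 * so each parent entry, other than the implicit minus-infinity first one,
 * carries the lower bound of the child it points to.
 */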
1926 
1927 /*
1928  * _bt_finish_split() -- Finish an incomplete split
1929  *
1930  * A crash or other failure can leave a split incomplete. The insertion
1931  * routines won't allow insertion on a page that is incompletely split.
1932  * Before inserting on such a page, call _bt_finish_split().
1933  *
1934  * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
1935  * and unpinned.
1936  */
1937 void
1938 _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
1939 {
1940  Page lpage = BufferGetPage(lbuf);
1941  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1942  Buffer rbuf;
1943  Page rpage;
1944  BTPageOpaque rpageop;
1945  bool was_root;
1946  bool was_only;
1947 
1948  Assert(P_INCOMPLETE_SPLIT(lpageop));
1949 
1950  /* Lock right sibling, the one missing the downlink */
1951  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1952  rpage = BufferGetPage(rbuf);
1953  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1954 
1955  /* Could this be a root split? */
1956  if (!stack)
1957  {
1958  Buffer metabuf;
1959  Page metapg;
1960  BTMetaPageData *metad;
1961 
1962  /* acquire lock on the metapage */
1963  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1964  metapg = BufferGetPage(metabuf);
1965  metad = BTPageGetMeta(metapg);
1966 
1967  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1968 
1969  _bt_relbuf(rel, metabuf);
1970  }
1971  else
1972  was_root = false;
1973 
1974  /* Was this the only page on the level before split? */
1975  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1976 
1977  elog(DEBUG1, "finishing incomplete split of %u/%u",
1978  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1979 
1980  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1981 }
1982 
1983 /*
1984  * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
1985  * we last looked at in the parent.
1986  *
1987  * This is possible because we save the downlink from the parent item,
1988  * which is enough to uniquely identify it. Insertions into the parent
1989  * level could cause the item to move right; deletions could cause it
1990  * to move left, but not left of the page we previously found it in.
1991  *
1992  * Adjusts bts_blkno & bts_offset if changed.
1993  *
1994  * Returns InvalidBuffer if item not found (should not happen).
1995  */
1996 Buffer
1997 _bt_getstackbuf(Relation rel, BTStack stack, int access)
1998 {
1999  BlockNumber blkno;
2000  OffsetNumber start;
2001 
2002  blkno = stack->bts_blkno;
2003  start = stack->bts_offset;
2004 
2005  for (;;)
2006  {
2007  Buffer buf;
2008  Page page;
2009  BTPageOpaque opaque;
2010 
2011  buf = _bt_getbuf(rel, blkno, access);
2012  page = BufferGetPage(buf);
2013  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2014 
2015  if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
2016  {
2017  _bt_finish_split(rel, buf, stack->bts_parent);
2018  continue;
2019  }
2020 
2021  if (!P_IGNORE(opaque))
2022  {
2023  OffsetNumber offnum,
2024  minoff,
2025  maxoff;
2026  ItemId itemid;
2027  IndexTuple item;
2028 
2029  minoff = P_FIRSTDATAKEY(opaque);
2030  maxoff = PageGetMaxOffsetNumber(page);
2031 
2032  /*
2033  * start = InvalidOffsetNumber means "search the whole page". We
2034  * need this test anyway due to possibility that page has a high
2035  * key now when it didn't before.
2036  */
2037  if (start < minoff)
2038  start = minoff;
2039 
2040  /*
2041  * Need this check too, to guard against possibility that page
2042  * split since we visited it originally.
2043  */
2044  if (start > maxoff)
2045  start = OffsetNumberNext(maxoff);
2046 
2047  /*
2048  * These loops will check every item on the page --- but in an
2049  * order that's attuned to the probability of where it actually
2050  * is. Scan to the right first, then to the left.
2051  */
2052  for (offnum = start;
2053  offnum <= maxoff;
2054  offnum = OffsetNumberNext(offnum))
2055  {
2056  itemid = PageGetItemId(page, offnum);
2057  item = (IndexTuple) PageGetItem(page, itemid);
2058 
2059  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2060  {
2061  /* Return accurate pointer to where link is now */
2062  stack->bts_blkno = blkno;
2063  stack->bts_offset = offnum;
2064  return buf;
2065  }
2066  }
2067 
2068  for (offnum = OffsetNumberPrev(start);
2069  offnum >= minoff;
2070  offnum = OffsetNumberPrev(offnum))
2071  {
2072  itemid = PageGetItemId(page, offnum);
2073  item = (IndexTuple) PageGetItem(page, itemid);
2074 
2075  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2076  {
2077  /* Return accurate pointer to where link is now */
2078  stack->bts_blkno = blkno;
2079  stack->bts_offset = offnum;
2080  return buf;
2081  }
2082  }
2083  }
2084 
2085  /*
2086  * The item we're looking for moved right at least one page.
2087  */
2088  if (P_RIGHTMOST(opaque))
2089  {
2090  _bt_relbuf(rel, buf);
2091  return InvalidBuffer;
2092  }
2093  blkno = opaque->btpo_next;
2094  start = InvalidOffsetNumber;
2095  _bt_relbuf(rel, buf);
2096  }
2097 }
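/*
 * Illustrative usage sketch (mirrors _bt_insert_parent above; child_blkno is
 * a hypothetical variable name): callers re-find the parent entry for a
 * given child by storing the child's block number in the stack entry first,
 *
 *     stack->bts_btentry = child_blkno;
 *     pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
 *     if (pbuf == InvalidBuffer)
 *         elog(ERROR, "failed to re-find parent key");
 *
 * after which bts_blkno and bts_offset point at wherever the downlink
 * actually lives now.
 */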
2098 
2099 /*
2100  * _bt_newroot() -- Create a new root page for the index.
2101  *
2102  * We've just split the old root page and need to create a new one.
2103  * In order to do this, we add a new root page to the file, then lock
2104  * the metadata page and update it. This is guaranteed to be deadlock-
2105  * free, because all readers release their locks on the metadata page
2106  * before trying to lock the root, and all writers lock the root before
2107  * trying to lock the metadata page. We have a write lock on the old
2108  * root page, so we have not introduced any cycles into the waits-for
2109  * graph.
2110  *
2111  * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2112  * locked. On exit, a new root page exists with entries for the
2113  * two new children, metapage is updated and unlocked/unpinned.
2114  * The new root buffer is returned to caller which has to unlock/unpin
2115  * lbuf, rbuf & rootbuf.
2116  */
2117 static Buffer
2118 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
2119 {
2120  Buffer rootbuf;
2121  Page lpage,
2122  rootpage;
2123  BlockNumber lbkno,
2124  rbkno;
2125  BlockNumber rootblknum;
2126  BTPageOpaque rootopaque;
2127  BTPageOpaque lopaque;
2128  ItemId itemid;
2129  IndexTuple item;
2130  IndexTuple left_item;
2131  Size left_item_sz;
2132  IndexTuple right_item;
2133  Size right_item_sz;
2134  Buffer metabuf;
2135  Page metapg;
2136  BTMetaPageData *metad;
2137 
2138  lbkno = BufferGetBlockNumber(lbuf);
2139  rbkno = BufferGetBlockNumber(rbuf);
2140  lpage = BufferGetPage(lbuf);
2141  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2142 
2143  /* get a new root page */
2144  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2145  rootpage = BufferGetPage(rootbuf);
2146  rootblknum = BufferGetBlockNumber(rootbuf);
2147 
2148  /* acquire lock on the metapage */
2149  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2150  metapg = BufferGetPage(metabuf);
2151  metad = BTPageGetMeta(metapg);
2152 
2153  /*
2154  * Create downlink item for left page (old root). Since this will be the
2155  * first item in a non-leaf page, it implicitly has minus-infinity key
2156  * value, so we need not store any actual key in it.
2157  */
2158  left_item_sz = sizeof(IndexTupleData);
2159  left_item = (IndexTuple) palloc(left_item_sz);
2160  left_item->t_info = left_item_sz;
2161  BTreeInnerTupleSetDownLink(left_item, lbkno);
2162  BTreeTupleSetNAtts(left_item, 0);
2163 
2164  /*
2165  * Create downlink item for right page. The key for it is obtained from
2166  * the "high key" position in the left page.
2167  */
2168  itemid = PageGetItemId(lpage, P_HIKEY);
2169  right_item_sz = ItemIdGetLength(itemid);
2170  item = (IndexTuple) PageGetItem(lpage, itemid);
2171  right_item = CopyIndexTuple(item);
2172  BTreeInnerTupleSetDownLink(right_item, rbkno);
2173 
2174  /* NO EREPORT(ERROR) from here till newroot op is logged */
2175  START_CRIT_SECTION();
2176 
2177  /* upgrade metapage if needed */
2178  if (metad->btm_version < BTREE_VERSION)
2179  _bt_upgrademetapage(metapg);
2180 
2181  /* set btree special data */
2182  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2183  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2184  rootopaque->btpo_flags = BTP_ROOT;
2185  rootopaque->btpo.level =
2186  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2187  rootopaque->btpo_cycleid = 0;
2188 
2189  /* update metapage data */
2190  metad->btm_root = rootblknum;
2191  metad->btm_level = rootopaque->btpo.level;
2192  metad->btm_fastroot = rootblknum;
2193  metad->btm_fastlevel = rootopaque->btpo.level;
2194 
2195  /*
2196  * Insert the left page pointer into the new root page. The root page is
2197  * the rightmost page on its level so there is no "high key" in it; the
2198  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2199  *
2200  * Note: we *must* insert the two items in item-number order, for the
2201  * benefit of _bt_restore_page().
2202  */
2203  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2204  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2205  false, false) == InvalidOffsetNumber)
2206  elog(PANIC, "failed to add leftkey to new root page"
2207  " while splitting block %u of index \"%s\"",
2208  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2209 
2210  /*
2211  * insert the right page pointer into the new root page.
2212  */
2213  Assert(BTreeTupleGetNAtts(right_item, rel) ==
2214  IndexRelationGetNumberOfKeyAttributes(rel));
2215  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2216  false, false) == InvalidOffsetNumber)
2217  elog(PANIC, "failed to add rightkey to new root page"
2218  " while splitting block %u of index \"%s\"",
2219  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2220 
2221  /* Clear the incomplete-split flag in the left child */
2222  Assert(P_INCOMPLETE_SPLIT(lopaque));
2223  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2224  MarkBufferDirty(lbuf);
2225 
2226  MarkBufferDirty(rootbuf);
2227  MarkBufferDirty(metabuf);
2228 
2229  /* XLOG stuff */
2230  if (RelationNeedsWAL(rel))
2231  {
2232  xl_btree_newroot xlrec;
2233  XLogRecPtr recptr;
2234  xl_btree_metadata md;
2235 
2236  xlrec.rootblk = rootblknum;
2237  xlrec.level = metad->btm_level;
2238 
2239  XLogBeginInsert();
2240  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2241 
2242  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2243  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2244  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2245 
2246  md.root = rootblknum;
2247  md.level = metad->btm_level;
2248  md.fastroot = rootblknum;
2249  md.fastlevel = metad->btm_level;
2250  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2251  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2252 
2253  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2254 
2255  /*
2256  * Direct access to the page is not good style, but it is faster; we
2257  * should implement a new function in the page API.
2258  */
2259  XLogRegisterBufData(0,
2260  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2261  ((PageHeader) rootpage)->pd_special -
2262  ((PageHeader) rootpage)->pd_upper);
2263 
2264  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2265 
2266  PageSetLSN(lpage, recptr);
2267  PageSetLSN(rootpage, recptr);
2268  PageSetLSN(metapg, recptr);
2269  }
2270 
2271  END_CRIT_SECTION();
2272 
2273  /* done with metapage */
2274  _bt_relbuf(rel, metabuf);
2275 
2276  pfree(left_item);
2277  pfree(right_item);
2278 
2279  return rootbuf;
2280 }
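/*
 * Illustrative layout of the page built above: the new root is the rightmost
 * page of its level, so it has no high key, and its two downlinks occupy the
 * first two line pointers,
 *
 *     offset P_HIKEY    (1): downlink to the old root (lbuf), with an
 *                            implicit "minus infinity" key
 *     offset P_FIRSTKEY (2): downlink to the new right sibling (rbuf), keyed
 *                            by the old root's high key
 */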
2281 
2282 /*
2283  * _bt_pgaddtup() -- add a tuple to a particular page in the index.
2284  *
2285  * This routine adds the tuple to the page as requested. It does
2286  * not affect pin/lock status, but you'd better have a write lock
2287  * and pin on the target buffer! Don't forget to write and release
2288  * the buffer afterwards, either.
2289  *
2290  * The main difference between this routine and a bare PageAddItem call
2291  * is that this code knows that the leftmost index tuple on a non-leaf
2292  * btree page doesn't need to have a key. Therefore, it strips such
2293  * tuples down to just the tuple header. CAUTION: this works ONLY if
2294  * we insert the tuples in order, so that the given itup_off does
2295  * represent the final position of the tuple!
2296  */
2297 static bool
2298 _bt_pgaddtup(Page page,
2299  Size itemsize,
2300  IndexTuple itup,
2301  OffsetNumber itup_off)
2302 {
2303  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2304  IndexTupleData trunctuple;
2305 
2306  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2307  {
2308  trunctuple = *itup;
2309  trunctuple.t_info = sizeof(IndexTupleData);
2310  BTreeTupleSetNAtts(&trunctuple, 0);
2311  itup = &trunctuple;
2312  itemsize = sizeof(IndexTupleData);
2313  }
2314 
2315  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2316  false, false) == InvalidOffsetNumber)
2317  return false;
2318 
2319  return true;
2320 }
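/*
 * Worked example for the truncation above (illustrative sizes): a 40-byte
 * index tuple added at P_FIRSTDATAKEY of a non-leaf page is stored as just
 * its header, sizeof(IndexTupleData) = 8 bytes on typical platforms, because
 * the leftmost tuple of an internal page acts as a "minus infinity" key and
 * its key bytes are never examined.
 */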
2321 
2322 /*
2323  * _bt_isequal - used in _bt_doinsert in check for duplicates.
2324  *
2325  * This is very similar to _bt_compare, except for NULL handling.
2326  * The rule is simple: NOT_NULL does not equal NULL, and NULL does not equal NULL either.
2327  */
2328 static bool
2329 _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
2330  int keysz, ScanKey scankey)
2331 {
2332  IndexTuple itup;
2333  int i;
2334 
2335  /* Better be comparing to a leaf item */
2336  Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
2337 
2338  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2339 
2340  /*
2341  * It's okay that we might perform a comparison against a truncated page
2342  * high key when caller needs to determine if _bt_check_unique scan must
2343  * continue on to the next page. Caller never asks us to compare non-key
2344  * attributes within an INCLUDE index.
2345  */
2346  for (i = 1; i <= keysz; i++)
2347  {
2348  AttrNumber attno;
2349  Datum datum;
2350  bool isNull;
2351  int32 result;
2352 
2353  attno = scankey->sk_attno;
2354  Assert(attno == i);
2355  datum = index_getattr(itup, attno, itupdesc, &isNull);
2356 
2357  /* NULLs are never equal to anything */
2358  if (isNull || (scankey->sk_flags & SK_ISNULL))
2359  return false;
2360 
2361  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
2362  scankey->sk_collation,
2363  datum,
2364  scankey->sk_argument));
2365 
2366  if (result != 0)
2367  return false;
2368 
2369  scankey++;
2370  }
2371 
2372  /* if we get here, the keys are equal */
2373  return true;
2374 }
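/*
 * Illustrative consequence of the NULL rule above: because NULL is never
 * reported equal to NULL, _bt_check_unique treats index entries whose key
 * columns are NULL as distinct, which is why a unique btree index can hold
 * any number of NULL entries.
 */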
2375 
2376 /*
2377  * _bt_vacuum_one_page - vacuum just one index page.
2378  *
2379  * Try to remove LP_DEAD items from the given page. The passed buffer
2380  * must be exclusive-locked, but unlike a real VACUUM, we don't need a
2381  * super-exclusive "cleanup" lock (see nbtree/README).
2382  */
2383 static void
2384 _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
2385 {
2386  OffsetNumber deletable[MaxOffsetNumber];
2387  int ndeletable = 0;
2388  OffsetNumber offnum,
2389  minoff,
2390  maxoff;
2391  Page page = BufferGetPage(buffer);
2392  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2393 
2394  /*
2395  * Scan over all items to see which ones need to be deleted according to
2396  * LP_DEAD flags.
2397  */
2398  minoff = P_FIRSTDATAKEY(opaque);
2399  maxoff = PageGetMaxOffsetNumber(page);
2400  for (offnum = minoff;
2401  offnum <= maxoff;
2402  offnum = OffsetNumberNext(offnum))
2403  {
2404  ItemId itemId = PageGetItemId(page, offnum);
2405 
2406  if (ItemIdIsDead(itemId))
2407  deletable[ndeletable++] = offnum;
2408  }
2409 
2410  if (ndeletable > 0)
2411  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2412 
2413  /*
2414  * Note: if we didn't find any LP_DEAD items, then the page's
2415  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2416  * separate write to clear it, however. We will clear it when we split
2417  * the page.
2418  */
2419 }
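/*
 * Worked example for the scan above (illustrative offsets): on a page whose
 * offsets 3, 7 and 20 have LP_DEAD set, the loop accumulates
 * deletable[] = {3, 7, 20} with ndeletable = 3, and a single
 * _bt_delitems_delete() call then removes all three items at once.
 */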