PostgreSQL Source Code  git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
Include dependency graph for nbtinsert.c:

Go to the source code of this file.

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static BTStack _bt_search_insert (Relation rel, BTInsertState insertstate)
 
static TransactionId _bt_check_unique (Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static OffsetNumber _bt_findinsertloc (Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
 
static void _bt_stepright (Relation rel, BTInsertState insertstate, BTStack stack)
 
static void _bt_insertonpg (Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
 
static void _bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
 
static Buffer _bt_newroot (Relation rel, Buffer lbuf, Buffer rbuf)
 
static bool _bt_pgaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_vacuum_one_page (Relation rel, Buffer buffer, Relation heapRel)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 29 of file nbtinsert.c.

Referenced by _bt_insertonpg().

Function Documentation

◆ _bt_check_unique()

static TransactionId _bt_check_unique ( Relation  rel,
BTInsertState  insertstate,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool is_unique,
uint32 speculativeToken 
)
static

Definition at line 387 of file nbtinsert.c.

References _bt_binsrch_insert(), _bt_compare(), _bt_relandgetbuf(), _bt_relbuf(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BT_READ, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage, BuildIndexValueDescription(), CheckForSerializableConflictIn(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), index_deform_tuple(), INDEX_MAX_KEYS, InitDirtySnapshot, InvalidBuffer, InvalidTransactionId, ItemIdIsDead, ItemIdMarkDead, ItemPointerCompare(), BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MarkBufferDirtyHint(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, RelationGetRelationName, BTScanInsertData::scantid, SnapshotSelf, SnapshotData::speculativeToken, BTInsertStateData::stricthigh, IndexTupleData::t_tid, table_index_fetch_tuple_check(), TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_PARTIAL, values, SnapshotData::xmax, and SnapshotData::xmin.

Referenced by _bt_doinsert().

390 {
391  IndexTuple itup = insertstate->itup;
392  IndexTuple curitup = NULL;
393  ItemId curitemid;
394  BTScanInsert itup_key = insertstate->itup_key;
395  SnapshotData SnapshotDirty;
396  OffsetNumber offset;
397  OffsetNumber maxoff;
398  Page page;
399  BTPageOpaque opaque;
400  Buffer nbuf = InvalidBuffer;
401  bool found = false;
402  bool inposting = false;
403  bool prevalldead = true;
404  int curposti = 0;
405 
406  /* Assume unique until we find a duplicate */
407  *is_unique = true;
408 
409  InitDirtySnapshot(SnapshotDirty);
410 
411  page = BufferGetPage(insertstate->buf);
412  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
413  maxoff = PageGetMaxOffsetNumber(page);
414 
415  /*
416  * Find the first tuple with the same key.
417  *
418  * This also saves the binary search bounds in insertstate. We use them
419  * in the fastpath below, but also in the _bt_findinsertloc() call later.
420  */
421  Assert(!insertstate->bounds_valid);
422  offset = _bt_binsrch_insert(rel, insertstate);
423 
424  /*
425  * Scan over all equal tuples, looking for live conflicts.
426  */
427  Assert(!insertstate->bounds_valid || insertstate->low == offset);
428  Assert(!itup_key->anynullkeys);
429  Assert(itup_key->scantid == NULL);
430  for (;;)
431  {
432  /*
433  * Each iteration of the loop processes one heap TID, not one index
434  * tuple. Current offset number for page isn't usually advanced on
435  * iterations that process heap TIDs from posting list tuples.
436  *
437  * "inposting" state is set when _inside_ a posting list --- not when
438  * we're at the start (or end) of a posting list. We advance curposti
439  * at the end of the iteration when inside a posting list tuple. In
440  * general, every loop iteration either advances the page offset or
441  * advances curposti --- an iteration that handles the rightmost/max
442  * heap TID in a posting list finally advances the page offset (and
443  * unsets "inposting").
444  *
445  * Make sure the offset points to an actual index tuple before trying
446  * to examine it...
447  */
448  if (offset <= maxoff)
449  {
450  /*
451  * Fastpath: In most cases, we can use cached search bounds to
452  * limit our consideration to items that are definitely
453  * duplicates. This fastpath doesn't apply when the original page
454  * is empty, or when initial offset is past the end of the
455  * original page, which may indicate that we need to examine a
456  * second or subsequent page.
457  *
458  * Note that this optimization allows us to avoid calling
459  * _bt_compare() directly when there are no duplicates, as long as
460  * the offset where the key will go is not at the end of the page.
461  */
462  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
463  {
464  Assert(insertstate->bounds_valid);
465  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
466  Assert(insertstate->low <= insertstate->stricthigh);
467  Assert(_bt_compare(rel, itup_key, page, offset) < 0);
468  break;
469  }
470 
471  /*
472  * We can skip items that are already marked killed.
473  *
474  * In the presence of heavy update activity an index may contain
475  * many killed items with the same key; running _bt_compare() on
476  * each killed item gets expensive. Just advance over killed
477  * items as quickly as we can. We only apply _bt_compare() when
478  * we get to a non-killed item. We could reuse the bounds to
479  * avoid _bt_compare() calls for known equal tuples, but it
480  * doesn't seem worth it. Workloads with heavy update activity
481  * tend to have many deduplication passes, so we'll often avoid
482  * most of those comparisons, too (we call _bt_compare() when the
483  * posting list tuple is initially encountered, though not when
484  * processing later TIDs from the same tuple).
485  */
486  if (!inposting)
487  curitemid = PageGetItemId(page, offset);
488  if (inposting || !ItemIdIsDead(curitemid))
489  {
490  ItemPointerData htid;
491  bool all_dead = false;
492 
493  if (!inposting)
494  {
495  /* Plain tuple, or first TID in posting list tuple */
496  if (_bt_compare(rel, itup_key, page, offset) != 0)
497  break; /* we're past all the equal tuples */
498 
499  /* Advanced curitup */
500  curitup = (IndexTuple) PageGetItem(page, curitemid);
501  Assert(!BTreeTupleIsPivot(curitup));
502  }
503 
504  /* okay, we gotta fetch the heap tuple using htid ... */
505  if (!BTreeTupleIsPosting(curitup))
506  {
507  /* ... htid is from simple non-pivot tuple */
508  Assert(!inposting);
509  htid = curitup->t_tid;
510  }
511  else if (!inposting)
512  {
513  /* ... htid is first TID in new posting list */
514  inposting = true;
515  prevalldead = true;
516  curposti = 0;
517  htid = *BTreeTupleGetPostingN(curitup, 0);
518  }
519  else
520  {
521  /* ... htid is second or subsequent TID in posting list */
522  Assert(curposti > 0);
523  htid = *BTreeTupleGetPostingN(curitup, curposti);
524  }
525 
526  /*
527  * If we are doing a recheck, we expect to find the tuple we
528  * are rechecking. It's not a duplicate, but we have to keep
529  * scanning.
530  */
531  if (checkUnique == UNIQUE_CHECK_EXISTING &&
532  ItemPointerCompare(&htid, &itup->t_tid) == 0)
533  {
534  found = true;
535  }
536 
537  /*
538  * Check if there's any table tuples for this index entry
539  * satisfying SnapshotDirty. This is necessary because for AMs
540  * with optimizations like heap's HOT, we have just a single
541  * index entry for the entire chain.
542  */
543  else if (table_index_fetch_tuple_check(heapRel, &htid,
544  &SnapshotDirty,
545  &all_dead))
546  {
547  TransactionId xwait;
548 
549  /*
550  * It is a duplicate. If we are only doing a partial
551  * check, then don't bother checking if the tuple is being
552  * updated in another transaction. Just return the fact
553  * that it is a potential conflict and leave the full
554  * check till later. Don't invalidate binary search
555  * bounds.
556  */
557  if (checkUnique == UNIQUE_CHECK_PARTIAL)
558  {
559  if (nbuf != InvalidBuffer)
560  _bt_relbuf(rel, nbuf);
561  *is_unique = false;
562  return InvalidTransactionId;
563  }
564 
565  /*
566  * If this tuple is being updated by other transaction
567  * then we have to wait for its commit/abort.
568  */
569  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
570  SnapshotDirty.xmin : SnapshotDirty.xmax;
571 
572  if (TransactionIdIsValid(xwait))
573  {
574  if (nbuf != InvalidBuffer)
575  _bt_relbuf(rel, nbuf);
576  /* Tell _bt_doinsert to wait... */
577  *speculativeToken = SnapshotDirty.speculativeToken;
578  /* Caller releases lock on buf immediately */
579  insertstate->bounds_valid = false;
580  return xwait;
581  }
582 
583  /*
584  * Otherwise we have a definite conflict. But before
585  * complaining, look to see if the tuple we want to insert
586  * is itself now committed dead --- if so, don't complain.
587  * This is a waste of time in normal scenarios but we must
588  * do it to support CREATE INDEX CONCURRENTLY.
589  *
590  * We must follow HOT-chains here because during
591  * concurrent index build, we insert the root TID though
592  * the actual tuple may be somewhere in the HOT-chain.
593  * While following the chain we might not stop at the
594  * exact tuple which triggered the insert, but that's OK
595  * because if we find a live tuple anywhere in this chain,
596  * we have a unique key conflict. The other live tuple is
597  * not part of this chain because it had a different index
598  * entry.
599  */
600  htid = itup->t_tid;
601  if (table_index_fetch_tuple_check(heapRel, &htid,
602  SnapshotSelf, NULL))
603  {
604  /* Normal case --- it's still live */
605  }
606  else
607  {
608  /*
609  * It's been deleted, so no error, and no need to
610  * continue searching
611  */
612  break;
613  }
614 
615  /*
616  * Check for a conflict-in as we would if we were going to
617  * write to this page. We aren't actually going to write,
618  * but we want a chance to report SSI conflicts that would
619  * otherwise be masked by this unique constraint
620  * violation.
621  */
622  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
623 
624  /*
625  * This is a definite conflict. Break the tuple down into
626  * datums and report the error. But first, make sure we
627  * release the buffer locks we're holding ---
628  * BuildIndexValueDescription could make catalog accesses,
629  * which in the worst case might touch this same index and
630  * cause deadlocks.
631  */
632  if (nbuf != InvalidBuffer)
633  _bt_relbuf(rel, nbuf);
634  _bt_relbuf(rel, insertstate->buf);
635  insertstate->buf = InvalidBuffer;
636  insertstate->bounds_valid = false;
637 
638  {
639  Datum values[INDEX_MAX_KEYS];
640  bool isnull[INDEX_MAX_KEYS];
641  char *key_desc;
642 
643  index_deform_tuple(itup, RelationGetDescr(rel),
644  values, isnull);
645 
646  key_desc = BuildIndexValueDescription(rel, values,
647  isnull);
648 
649  ereport(ERROR,
650  (errcode(ERRCODE_UNIQUE_VIOLATION),
651  errmsg("duplicate key value violates unique constraint \"%s\"",
652  RelationGetRelationName(rel)),
653  key_desc ? errdetail("Key %s already exists.",
654  key_desc) : 0,
655  errtableconstraint(heapRel,
656  RelationGetRelationName(rel))));
657  }
658  }
659  else if (all_dead && (!inposting ||
660  (prevalldead &&
661  curposti == BTreeTupleGetNPosting(curitup) - 1)))
662  {
663  /*
664  * The conflicting tuple (or all HOT chains pointed to by
665  * all posting list TIDs) is dead to everyone, so mark the
666  * index entry killed.
667  */
668  ItemIdMarkDead(curitemid);
669  opaque->btpo_flags |= BTP_HAS_GARBAGE;
670 
671  /*
672  * Mark buffer with a dirty hint, since state is not
673  * crucial. Be sure to mark the proper buffer dirty.
674  */
675  if (nbuf != InvalidBuffer)
676  MarkBufferDirtyHint(nbuf, true);
677  else
678  MarkBufferDirtyHint(insertstate->buf, true);
679  }
680 
681  /*
682  * Remember if posting list tuple has even a single HOT chain
683  * whose members are not all dead
684  */
685  if (!all_dead && inposting)
686  prevalldead = false;
687  }
688  }
689 
690  if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
691  {
692  /* Advance to next TID in same posting list */
693  curposti++;
694  continue;
695  }
696  else if (offset < maxoff)
697  {
698  /* Advance to next tuple */
699  curposti = 0;
700  inposting = false;
701  offset = OffsetNumberNext(offset);
702  }
703  else
704  {
705  int highkeycmp;
706 
707  /* If scankey == hikey we gotta check the next page too */
708  if (P_RIGHTMOST(opaque))
709  break;
710  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
711  Assert(highkeycmp <= 0);
712  if (highkeycmp != 0)
713  break;
714  /* Advance to next non-dead page --- there must be one */
715  for (;;)
716  {
717  BlockNumber nblkno = opaque->btpo_next;
718 
719  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
720  page = BufferGetPage(nbuf);
721  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
722  if (!P_IGNORE(opaque))
723  break;
724  if (P_RIGHTMOST(opaque))
725  elog(ERROR, "fell off the end of index \"%s\"",
726  RelationGetRelationName(rel));
727  }
728  /* Will also advance to next tuple */
729  curposti = 0;
730  inposting = false;
731  maxoff = PageGetMaxOffsetNumber(page);
732  offset = P_FIRSTDATAKEY(opaque);
733  /* Don't invalidate binary search bounds */
734  }
735  }
736 
737  /*
738  * If we are doing a recheck then we should have found the tuple we are
739  * checking. Otherwise there's something very wrong --- probably, the
740  * index is on a non-immutable expression.
741  */
742  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
743  ereport(ERROR,
744  (errcode(ERRCODE_INTERNAL_ERROR),
745  errmsg("failed to re-find tuple within index \"%s\"",
746  RelationGetRelationName(rel)),
747  errhint("This may be because of a non-immutable index expression."),
748  errtableconstraint(heapRel,
749  RelationGetRelationName(rel))));
750 
751  if (nbuf != InvalidBuffer)
752  _bt_relbuf(rel, nbuf);
753 
754  return InvalidTransactionId;
755 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
BlockNumber btpo_next
Definition: nbtree.h:59
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:76
int errhint(const char *fmt,...)
Definition: elog.c:1071
#define P_IGNORE(opaque)
Definition: nbtree.h:219
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
bool bounds_valid
Definition: nbtree.h:696
uint32 TransactionId
Definition: c.h:513
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:928
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3553
#define RelationGetDescr(relation)
Definition: rel.h:482
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
ItemPointer scantid
Definition: nbtree.h:664
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
ItemPointerData t_tid
Definition: itup.h:37
#define InvalidBuffer
Definition: buf.h:25
OffsetNumber stricthigh
Definition: nbtree.h:698
int errcode(int sqlerrcode)
Definition: elog.c:610
uint32 BlockNumber
Definition: block.h:31
OffsetNumber low
Definition: nbtree.h:697
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5544
#define BT_READ
Definition: nbtree.h:587
#define ERROR
Definition: elog.h:43
#define SnapshotSelf
Definition: snapmgr.h:68
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:448
IndexTupleData * IndexTuple
Definition: itup.h:53
int errdetail(const char *fmt,...)
Definition: elog.c:957
#define InvalidTransactionId
Definition: transam.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:490
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:434
TransactionId xmax
Definition: snapshot.h:158
TransactionId xmin
Definition: snapshot.h:157
IndexTuple itup
Definition: nbtree.h:684
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:645
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:205
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
bool anynullkeys
Definition: nbtree.h:661
uintptr_t Datum
Definition: postgres.h:367
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4377
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define ereport(elevel,...)
Definition: elog.h:144
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:412
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
uint32 speculativeToken
Definition: snapshot.h:193
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:738
#define INDEX_MAX_KEYS
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define P_HIKEY
Definition: nbtree.h:242
static Datum values[MAXATTR]
Definition: bootstrap.c:167
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define elog(elevel,...)
Definition: elog.h:214
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:386
char * BuildIndexValueDescription(Relation indexRelation, Datum *values, bool *isnull)
Definition: genam.c:176
#define TransactionIdIsValid(xid)
Definition: transam.h:41
uint16 btpo_flags
Definition: nbtree.h:65
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 82 of file nbtinsert.c.

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

84 {
85  bool is_unique = false;
86  BTInsertStateData insertstate;
87  BTScanInsert itup_key;
88  BTStack stack;
89  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
90 
91  /* we need an insertion scan key to do our search, so build one */
92  itup_key = _bt_mkscankey(rel, itup);
93 
94  if (checkingunique)
95  {
96  if (!itup_key->anynullkeys)
97  {
98  /* No (heapkeyspace) scantid until uniqueness established */
99  itup_key->scantid = NULL;
100  }
101  else
102  {
103  /*
104  * Scan key for new tuple contains NULL key values. Bypass
105  * checkingunique steps. They are unnecessary because core code
106  * considers NULL unequal to every value, including NULL.
107  *
108  * This optimization avoids O(N^2) behavior within the
109  * _bt_findinsertloc() heapkeyspace path when a unique index has a
110  * large number of "duplicates" with NULL key values.
111  */
112  checkingunique = false;
113  /* Tuple is unique in the sense that core code cares about */
114  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
115  is_unique = true;
116  }
117  }
118 
119  /*
120  * Fill in the BTInsertState working area, to track the current page and
121  * position within the page to insert on.
122  *
123  * Note that itemsz is passed down to lower level code that deals with
124  * inserting the item. It must be MAXALIGN()'d. This ensures that space
125  * accounting code consistently considers the alignment overhead that we
126  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
127  * already conservative about alignment, but we don't rely on that from
128  * this distance. Besides, preserving the "true" tuple size in index
129  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
130  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
131  */
132  insertstate.itup = itup;
133  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
134  insertstate.itup_key = itup_key;
135  insertstate.bounds_valid = false;
136  insertstate.buf = InvalidBuffer;
137  insertstate.postingoff = 0;
138 
139 search:
140 
141  /*
142  * Find and lock the leaf page that the tuple should be added to by
143  * searching from the root page. insertstate.buf will hold a buffer that
144  * is locked in exclusive mode afterwards.
145  */
146  stack = _bt_search_insert(rel, &insertstate);
147 
148  /*
149  * checkingunique inserts are not allowed to go ahead when two tuples with
150  * equal key attribute values would be visible to new MVCC snapshots once
151  * the xact commits. Check for conflicts in the locked page/buffer (if
152  * needed) here.
153  *
154  * It might be necessary to check a page to the right in _bt_check_unique,
155  * though that should be very rare. In practice the first page the value
156  * could be on (with scantid omitted) is almost always also the only page
157  * that a matching tuple might be found on. This is due to the behavior
158  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
159  * only be allowed to cross a page boundary when there is no candidate
160  * leaf page split point that avoids it. Also, _bt_check_unique can use
161  * the leaf page high key to determine that there will be no duplicates on
162  * the right sibling without actually visiting it (it uses the high key in
163  * cases where the new item happens to belong at the far right of the leaf
164  * page).
165  *
166  * NOTE: obviously, _bt_check_unique can only detect keys that are already
167  * in the index; so it cannot defend against concurrent insertions of the
168  * same key. We protect against that by means of holding a write lock on
169  * the first page the value could be on, with omitted/-inf value for the
170  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
171  * the same key must acquire a write lock on the same page, so only one
172  * would-be inserter can be making the check at one time. Furthermore,
173  * once we are past the check we hold write locks continuously until we
174  * have performed our insertion, so no later inserter can fail to see our
175  * insertion. (This requires some care in _bt_findinsertloc.)
176  *
177  * If we must wait for another xact, we release the lock while waiting,
178  * and then must perform a new search.
179  *
180  * For a partial uniqueness check, we don't wait for the other xact. Just
181  * let the tuple in and return false for possibly non-unique, or true for
182  * definitely unique.
183  */
184  if (checkingunique)
185  {
186  TransactionId xwait;
187  uint32 speculativeToken;
188 
189  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
190  &is_unique, &speculativeToken);
191 
192  if (unlikely(TransactionIdIsValid(xwait)))
193  {
194  /* Have to wait for the other guy ... */
195  _bt_relbuf(rel, insertstate.buf);
196  insertstate.buf = InvalidBuffer;
197 
198  /*
199  * If it's a speculative insertion, wait for it to finish (ie. to
200  * go ahead with the insertion, or kill the tuple). Otherwise
201  * wait for the transaction to finish as usual.
202  */
203  if (speculativeToken)
204  SpeculativeInsertionWait(xwait, speculativeToken);
205  else
206  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
207 
208  /* start over... */
209  if (stack)
210  _bt_freestack(stack);
211  goto search;
212  }
213 
214  /* Uniqueness is established -- restore heap tid as scantid */
215  if (itup_key->heapkeyspace)
216  itup_key->scantid = &itup->t_tid;
217  }
218 
219  if (checkUnique != UNIQUE_CHECK_EXISTING)
220  {
221  OffsetNumber newitemoff;
222 
223  /*
224  * The only conflict predicate locking cares about for indexes is when
225  * an index tuple insert conflicts with an existing lock. We don't
226  * know the actual page we're going to insert on for sure just yet in
227  * checkingunique and !heapkeyspace cases, but it's okay to use the
228  * first page the value could be on (with scantid omitted) instead.
229  */
230  CheckForSerializableConflictIn(rel, NULL,
231  BufferGetBlockNumber(insertstate.buf));
232  /*
233  * Do the insertion. Note that insertstate contains cached binary
234  * search bounds established within _bt_check_unique when insertion is
235  * checkingunique.
236  */
237  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
238  stack, heapRel);
239  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
240  itup, insertstate.itemsz, newitemoff,
241  insertstate.postingoff, false);
242  }
243  else
244  {
245  /* just release the buffer */
246  _bt_relbuf(rel, insertstate.buf);
247  }
248 
249  /* be tidy */
250  if (stack)
251  _bt_freestack(stack);
252  pfree(itup_key);
253 
254  return is_unique;
255 }
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1101
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
bool bounds_valid
Definition: nbtree.h:696
uint32 TransactionId
Definition: c.h:513
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
ItemPointer scantid
Definition: nbtree.h:664
ItemPointerData t_tid
Definition: itup.h:37
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:790
#define InvalidBuffer
Definition: buf.h:25
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1056
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:781
unsigned int uint32
Definition: c.h:367
IndexTuple itup
Definition: nbtree.h:684
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:296
bool anynullkeys
Definition: nbtree.h:661
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4377
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:624
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:738
bool heapkeyspace
Definition: nbtree.h:659
#define MAXALIGN(LEN)
Definition: c.h:691
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:387
#define unlikely(x)
Definition: c.h:206
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_findinsertloc()

static OffsetNumber _bt_findinsertloc ( Relation  rel,
BTInsertState  insertstate,
bool  checkingunique,
BTStack  stack,
Relation  heapRel 
)
static

Definition at line 790 of file nbtinsert.c.

References _bt_binsrch_insert(), _bt_check_third_page(), _bt_compare(), _bt_dedup_one_page(), _bt_stepright(), _bt_vacuum_one_page(), BTScanInsertData::allequalimage, Assert, BTInsertStateData::bounds_valid, BTGetDeduplicateItems, BTMaxItemSize, BTInsertStateData::buf, BufferGetPage, BTScanInsertData::heapkeyspace, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MAX_RANDOM_VALUE, P_HAS_GARBAGE, P_HIKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::postingoff, random(), BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_doinsert().

795 {
	/*
	 * On return, insertstate->buf is pinned and write-locked on the leaf
	 * page the new tuple must go on, and the returned offset is the
	 * position within that page where the tuple should be inserted.
	 * Stepping right (when needed) is done via _bt_stepright(), which
	 * keeps the buffer lock coupling rules described in nbtree/README.
	 */
796  BTScanInsert itup_key = insertstate->itup_key;
797  Page page = BufferGetPage(insertstate->buf);
798  BTPageOpaque lpageop;
799  OffsetNumber newitemoff;
800 
801  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
802 
803  /* Check 1/3 of a page restriction */
804  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
805  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
806  insertstate->itup);
807 
808  Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
809  Assert(!insertstate->bounds_valid || checkingunique);
810  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
811  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
812  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
813 
814  if (itup_key->heapkeyspace)
815  {
816  /* Keep track of whether checkingunique duplicate seen */
817  bool uniquedup = false;
818 
819  /*
820  * If we're inserting into a unique index, we may have to walk right
821  * through leaf pages to find the one leaf page that we must insert on
822  * to.
823  *
824  * This is needed for checkingunique callers because a scantid was not
825  * used when we called _bt_search(). scantid can only be set after
826  * _bt_check_unique() has checked for duplicates. The buffer
827  * initially stored in insertstate->buf has the page where the first
828  * duplicate key might be found, which isn't always the page that new
829  * tuple belongs on. The heap TID attribute for new tuple (scantid)
830  * could force us to insert on a sibling page, though that should be
831  * very rare in practice.
832  */
833  if (checkingunique)
834  {
835  if (insertstate->low < insertstate->stricthigh)
836  {
837  /* Encountered a duplicate in _bt_check_unique() */
838  Assert(insertstate->bounds_valid);
839  uniquedup = true;
840  }
841 
842  for (;;)
843  {
844  /*
845  * Does the new tuple belong on this page?
846  *
847  * The earlier _bt_check_unique() call may well have
848  * established a strict upper bound on the offset for the new
849  * item. If it's not the last item of the page (i.e. if there
850  * is at least one tuple on the page that goes after the tuple
851  * we're inserting) then we know that the tuple belongs on
852  * this page. We can skip the high key check.
853  */
854  if (insertstate->bounds_valid &&
855  insertstate->low <= insertstate->stricthigh &&
856  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
857  break;
858 
859  /* Test '<=', not '!=', since scantid is set now */
860  if (P_RIGHTMOST(lpageop) ||
861  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
862  break;
863 
864  _bt_stepright(rel, insertstate, stack);
865  /* Update local state after stepping right */
866  page = BufferGetPage(insertstate->buf);
867  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
868  /* Assume duplicates (if checkingunique) */
869  uniquedup = true;
870  }
871  }
872 
873  /*
874  * If the target page is full, see if we can obtain enough space by
875  * erasing LP_DEAD items. If that fails to free enough space, see if
876  * we can avoid a page split by performing a deduplication pass over
877  * the page.
878  *
879  * We only perform a deduplication pass for a checkingunique caller
880  * when the incoming item is a duplicate of an existing item on the
881  * leaf page. This heuristic avoids wasting cycles -- we only expect
882  * to benefit from deduplicating a unique index page when most or all
883  * recently added items are duplicates. See nbtree/README.
884  */
885  if (PageGetFreeSpace(page) < insertstate->itemsz)
886  {
887  if (P_HAS_GARBAGE(lpageop))
888  {
889  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
890  insertstate->bounds_valid = false;
891 
892  /* Might as well assume duplicates (if checkingunique) */
893  uniquedup = true;
894  }
895 
896  if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
897  (!checkingunique || uniquedup) &&
898  PageGetFreeSpace(page) < insertstate->itemsz)
899  {
900  _bt_dedup_one_page(rel, insertstate->buf, heapRel,
901  insertstate->itup, insertstate->itemsz,
902  checkingunique);
903  insertstate->bounds_valid = false;
904  }
905  }
906  }
907  else
908  {
909  /*----------
910  * This is a !heapkeyspace (version 2 or 3) index. The current page
911  * is the first page that we could insert the new tuple to, but there
912  * may be other pages to the right that we could opt to use instead.
913  *
914  * If the new key is equal to one or more existing keys, we can
915  * legitimately place it anywhere in the series of equal keys. In
916  * fact, if the new key is equal to the page's "high key" we can place
917  * it on the next page. If it is equal to the high key, and there's
918  * not room to insert the new tuple on the current page without
919  * splitting, then we move right hoping to find more free space and
920  * avoid a split.
921  *
922  * Keep scanning right until we
923  * (a) find a page with enough free space,
924  * (b) reach the last page where the tuple can legally go, or
925  * (c) get tired of searching.
926  * (c) is not flippant; it is important because if there are many
927  * pages' worth of equal keys, it's better to split one of the early
928  * pages than to scan all the way to the end of the run of equal keys
929  * on every insert. We implement "get tired" as a random choice,
930  * since stopping after scanning a fixed number of pages wouldn't work
931  * well (we'd never reach the right-hand side of previously split
932  * pages). The probability of moving right is set at 0.99, which may
933  * seem too high to change the behavior much, but it does an excellent
934  * job of preventing O(N^2) behavior with many equal keys.
935  *----------
936  */
937  while (PageGetFreeSpace(page) < insertstate->itemsz)
938  {
939  /*
940  * Before considering moving right, see if we can obtain enough
941  * space by erasing LP_DEAD items
942  */
943  if (P_HAS_GARBAGE(lpageop))
944  {
945  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
946  insertstate->bounds_valid = false;
947 
948  if (PageGetFreeSpace(page) >= insertstate->itemsz)
949  break; /* OK, now we have enough space */
950  }
951 
952  /*
953  * Nope, so check conditions (b) and (c) enumerated above
954  *
955  * The earlier _bt_check_unique() call may well have established a
956  * strict upper bound on the offset for the new item. If it's not
957  * the last item of the page (i.e. if there is at least one tuple
958  * on the page that's greater than the tuple we're inserting to)
959  * then we know that the tuple belongs on this page. We can skip
960  * the high key check.
961  */
962  if (insertstate->bounds_valid &&
963  insertstate->low <= insertstate->stricthigh &&
964  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
965  break;
966 
967  if (P_RIGHTMOST(lpageop) ||
968  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
969  random() <= (MAX_RANDOM_VALUE / 100))
970  break;
971 
972  _bt_stepright(rel, insertstate, stack);
973  /* Update local state after stepping right */
974  page = BufferGetPage(insertstate->buf);
975  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
976  }
977  }
978 
979  /*
980  * We should now be on the correct page. Find the offset within the page
981  * for the new tuple. (Possibly reusing earlier search bounds.)
982  */
983  Assert(P_RIGHTMOST(lpageop) ||
984  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
985 
986  newitemoff = _bt_binsrch_insert(rel, insertstate);
987 
988  if (insertstate->postingoff == -1)
989  {
990  /*
991  * There is an overlapping posting list tuple with its LP_DEAD bit
992  * set. We don't want to unnecessarily unset its LP_DEAD bit while
993  * performing a posting list split, so delete all LP_DEAD items early.
994  * This is the only case where LP_DEAD deletes happen even though
995  * there is space for newitem on the page.
996  */
997  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
998 
999  /*
1000  * Do new binary search. New insert location cannot overlap with any
1001  * posting list now.
1002  */
1003  insertstate->bounds_valid = false;
1004  insertstate->postingoff = 0;
1005  newitemoff = _bt_binsrch_insert(rel, insertstate);
1006  Assert(insertstate->postingoff == 0);
1007  }
1008 
1009  return newitemoff;
1010 }
void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool checkingunique)
Definition: nbtdedup.c:56
bool bounds_valid
Definition: nbtree.h:696
ItemPointer scantid
Definition: nbtree.h:664
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:976
long random(void)
Definition: random.c:22
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:220
OffsetNumber stricthigh
Definition: nbtree.h:698
bool allequalimage
Definition: nbtree.h:660
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:1025
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
OffsetNumber low
Definition: nbtree.h:697
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:574
uint16 OffsetNumber
Definition: off.h:24
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
Definition: nbtinsert.c:2635
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:448
#define MAX_RANDOM_VALUE
IndexTuple itup
Definition: nbtree.h:684
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:645
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:738
bool heapkeyspace
Definition: nbtree.h:659
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:2633
#define BTMaxItemSize(page)
Definition: nbtree.h:157
#define P_HIKEY
Definition: nbtree.h:242
#define unlikely(x)
Definition: c.h:206
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2215 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

2216 {
	/*
	 * Finish an "incomplete split": lbuf is the write-locked left half of
	 * a split whose downlink for the right sibling was never inserted
	 * into the parent level.  Gather the state _bt_insert_parent() needs
	 * (right sibling, whether this was a root split, whether the level
	 * had only one page) and hand off to it.
	 */
2217  Page lpage = BufferGetPage(lbuf);
2218  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2219  Buffer rbuf;
2220  Page rpage;
2221  BTPageOpaque rpageop;
2222  bool was_root;
2223  bool was_only;
2224 
2225  Assert(P_INCOMPLETE_SPLIT(lpageop));
2226 
2227  /* Lock right sibling, the one missing the downlink */
2228  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2229  rpage = BufferGetPage(rbuf);
2230  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2231 
2232  /* Could this be a root split? */
2233  if (!stack)
2234  {
2235  Buffer metabuf;
2236  Page metapg;
2237  BTMetaPageData *metad;
2238 
2239  /* acquire lock on the metapage */
2240  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2241  metapg = BufferGetPage(metabuf);
2242  metad = BTPageGetMeta(metapg);
2243 
2244  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
2245 
2246  _bt_relbuf(rel, metabuf);
2247  }
2248  else
2249  was_root = false;
2250 
2251  /* Was this the only page on the level before split? */
2252  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2253 
2254  elog(DEBUG1, "finishing incomplete split of %u/%u",
2255  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2256 
2257  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
2258 }
BlockNumber btpo_next
Definition: nbtree.h:59
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:792
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:2078
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
#define BTPageGetMeta(p)
Definition: nbtree.h:114
#define P_LEFTMOST(opaque)
Definition: nbtree.h:212
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define BTREE_METAPAGE
Definition: nbtree.h:141
BlockNumber btm_root
Definition: nbtree.h:102
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define Assert(condition)
Definition: c.h:738
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
#define elog(elevel,...)
Definition: elog.h:214
#define BT_WRITE
Definition: nbtree.h:588
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
Pointer Page
Definition: bufpage.h:78

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2292 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().

2293 {
	/*
	 * Re-find the parent page holding the downlink to 'child', starting
	 * from the (possibly stale) position recorded in 'stack' and moving
	 * right as needed.  On success, the stack entry is updated in place
	 * to the downlink's current location and the write-locked parent
	 * buffer is returned.  Returns InvalidBuffer if the downlink cannot
	 * be re-found on this level (rightmost page reached without a match).
	 */
2294  BlockNumber blkno;
2295  OffsetNumber start;
2296 
2297  blkno = stack->bts_blkno;
2298  start = stack->bts_offset;
2299 
2300  for (;;)
2301  {
2302  Buffer buf;
2303  Page page;
2304  BTPageOpaque opaque;
2305 
2306  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2307  page = BufferGetPage(buf);
2308  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2309 
		/* Finish any incomplete split of this page before searching it */
2310  if (P_INCOMPLETE_SPLIT(opaque))
2311  {
2312  _bt_finish_split(rel, buf, stack->bts_parent);
2313  continue;
2314  }
2315 
2316  if (!P_IGNORE(opaque))
2317  {
2318  OffsetNumber offnum,
2319  minoff,
2320  maxoff;
2321  ItemId itemid;
2322  IndexTuple item;
2323 
2324  minoff = P_FIRSTDATAKEY(opaque);
2325  maxoff = PageGetMaxOffsetNumber(page);
2326 
2327  /*
2328  * start = InvalidOffsetNumber means "search the whole page". We
2329  * need this test anyway due to possibility that page has a high
2330  * key now when it didn't before.
2331  */
2332  if (start < minoff)
2333  start = minoff;
2334 
2335  /*
2336  * Need this check too, to guard against possibility that page
2337  * split since we visited it originally.
2338  */
2339  if (start > maxoff)
2340  start = OffsetNumberNext(maxoff);
2341 
2342  /*
2343  * These loops will check every item on the page --- but in an
2344  * order that's attuned to the probability of where it actually
2345  * is. Scan to the right first, then to the left.
2346  */
2347  for (offnum = start;
2348  offnum <= maxoff;
2349  offnum = OffsetNumberNext(offnum))
2350  {
2351  itemid = PageGetItemId(page, offnum);
2352  item = (IndexTuple) PageGetItem(page, itemid);
2353 
2354  if (BTreeTupleGetDownLink(item) == child)
2355  {
2356  /* Return accurate pointer to where link is now */
2357  stack->bts_blkno = blkno;
2358  stack->bts_offset = offnum;
2359  return buf;
2360  }
2361  }
2362 
2363  for (offnum = OffsetNumberPrev(start);
2364  offnum >= minoff;
2365  offnum = OffsetNumberPrev(offnum))
2366  {
2367  itemid = PageGetItemId(page, offnum);
2368  item = (IndexTuple) PageGetItem(page, itemid);
2369 
2370  if (BTreeTupleGetDownLink(item) == child)
2371  {
2372  /* Return accurate pointer to where link is now */
2373  stack->bts_blkno = blkno;
2374  stack->bts_offset = offnum;
2375  return buf;
2376  }
2377  }
2378  }
2379 
2380  /*
2381  * The item we're looking for moved right at least one page.
2382  *
2383  * Lehman and Yao couple/chain locks when moving right here, which we
2384  * can avoid. See nbtree/README.
2385  */
2386  if (P_RIGHTMOST(opaque))
2387  {
2388  _bt_relbuf(rel, buf);
2389  return InvalidBuffer;
2390  }
2391  blkno = opaque->btpo_next;
2392  start = InvalidOffsetNumber;
2393  _bt_relbuf(rel, buf);
2394  }
2395 }
BlockNumber btpo_next
Definition: nbtree.h:59
#define P_IGNORE(opaque)
Definition: nbtree.h:219
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:792
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:424
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2215
OffsetNumber bts_offset
Definition: nbtree.h:603
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:602
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
struct BTStackData * bts_parent
Definition: nbtree.h:604
#define BT_WRITE
Definition: nbtree.h:588
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_insert_parent()

static void _bt_insert_parent ( Relation  rel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  is_root,
bool  is_only 
)
static

Definition at line 2078 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BlockNumberIsValid, BTPageOpaqueData::btpo, BTreeTupleSetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ereport, errcode(), errmsg_internal(), ERROR, IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, BTPageOpaqueData::level, MAXALIGN, P_HIKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), RelationGetRelationName, and RelationGetTargetBlock.

Referenced by _bt_finish_split(), and _bt_insertonpg().

2084 {
2085  /*
2086  * Here we have to do something Lehman and Yao don't talk about: deal with
2087  * a root split and construction of a new root. If our stack is empty
2088  * then we have just split a node on what had been the root level when we
2089  * descended the tree. If it was still the root then we perform a
2090  * new-root construction. If it *wasn't* the root anymore, search to find
2091  * the next higher level that someone constructed meanwhile, and find the
2092  * right place to insert as for the normal case.
2093  *
2094  * If we have to search for the parent level, we do so by re-descending
2095  * from the root. This is not super-efficient, but it's rare enough not
2096  * to matter.
2097  */
2098  if (is_root)
2099  {
2100  Buffer rootbuf;
2101 
2102  Assert(stack == NULL);
2103  Assert(is_only);
2104  /* create a new root node and update the metapage */
2105  rootbuf = _bt_newroot(rel, buf, rbuf);
2106  /* release the split buffers */
2107  _bt_relbuf(rel, rootbuf);
2108  _bt_relbuf(rel, rbuf);
2109  _bt_relbuf(rel, buf);
2110  }
2111  else
2112  {
2113  BlockNumber bknum = BufferGetBlockNumber(buf);
2114  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2115  Page page = BufferGetPage(buf);
2116  IndexTuple new_item;
2117  BTStackData fakestack;
2118  IndexTuple ritem;
2119  Buffer pbuf;
2120 
2121  if (stack == NULL)
2122  {
2123  BTPageOpaque lpageop;
2124 
2125  elog(DEBUG2, "concurrent ROOT page split");
2126  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
2127 
2128  /*
2129  * We should never reach here when a leaf page split takes place
2130  * despite the insert of newitem being able to apply the fastpath
2131  * optimization. Make sure of that with an assertion.
2132  *
2133  * This is more of a performance issue than a correctness issue.
2134  * The fastpath won't have a descent stack. Using a phony stack
2135  * here works, but never rely on that. The fastpath should be
2136  * rejected within _bt_search_insert() when the rightmost leaf
2137  * page will split, since it's faster to go through _bt_search()
2138  * and get a stack in the usual way.
2139  */
2140  Assert(!(P_ISLEAF(lpageop) &&
2141  BlockNumberIsValid(RelationGetTargetBlock(rel))));
2142 
2143  /* Find the leftmost page at the next level up */
2144  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
2145  NULL);
2146  /* Set up a phony stack entry pointing there */
2147  stack = &fakestack;
2148  stack->bts_blkno = BufferGetBlockNumber(pbuf);
2149  stack->bts_offset = InvalidOffsetNumber;
2150  stack->bts_parent = NULL;
2151  _bt_relbuf(rel, pbuf);
2152  }
2153 
2154  /* get high key from left, a strict lower bound for new right page */
2155  ritem = (IndexTuple) PageGetItem(page,
2156  PageGetItemId(page, P_HIKEY));
2157 
2158  /* form an index tuple that points at the new right page */
2159  new_item = CopyIndexTuple(ritem);
2160  BTreeTupleSetDownLink(new_item, rbknum);
2161 
2162  /*
2163  * Re-find and write lock the parent of buf.
2164  *
2165  * It's possible that the location of buf's downlink has changed since
2166  * our initial _bt_search() descent. _bt_getstackbuf() will detect
2167  * and recover from this, updating the stack, which ensures that the
2168  * new downlink will be inserted at the correct offset. Even buf's
2169  * parent may have changed.
2170  */
2171  pbuf = _bt_getstackbuf(rel, stack, bknum);
2172 
2173  /*
2174  * Unlock the right child. The left child will be unlocked in
2175  * _bt_insertonpg().
2176  *
2177  * Unlocking the right child must be delayed until here to ensure that
2178  * no concurrent VACUUM operation can become confused. Page deletion
2179  * cannot be allowed to fail to re-find a downlink for the rbuf page.
2180  * (Actually, this is just a vestige of how things used to work. The
2181  * page deletion code is expected to check for the INCOMPLETE_SPLIT
2182  * flag on the left child. It won't attempt deletion of the right
2183  * child until the split is complete. Despite all this, we opt to
2184  * conservatively delay unlocking the right child until here.)
2185  */
2186  _bt_relbuf(rel, rbuf);
2187 
2188  if (pbuf == InvalidBuffer)
2189  ereport(ERROR,
2190  (errcode(ERRCODE_INDEX_CORRUPTED),
2191  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2192  RelationGetRelationName(rel), bknum, rbknum)));
2193 
2194  /* Recursively insert into the parent */
2195  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
2196  new_item, MAXALIGN(IndexTupleSize(new_item)),
2197  stack->bts_offset + 1, 0, is_only);
2198 
2199  /* be tidy */
2200  pfree(new_item);
2201  }
2202 }
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1101
union BTPageOpaqueData::@45 btpo
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:610
uint32 BlockNumber
Definition: block.h:31
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2416
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:2292
#define RelationGetTargetBlock(relation)
Definition: rel.h:541
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ERROR
Definition: elog.h:43
#define DEBUG2
Definition: elog.h:24
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:510
OffsetNumber bts_offset
Definition: nbtree.h:603
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:490
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:430
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:62
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:602
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define ereport(elevel,...)
Definition: elog.h:144
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
int errmsg_internal(const char *fmt,...)
Definition: elog.c:911
#define Assert(condition)
Definition: c.h:738
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:691
struct BTStackData * bts_parent
Definition: nbtree.h:604
#define P_HIKEY
Definition: nbtree.h:242
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
#define elog(elevel,...)
Definition: elog.h:214
int Buffer
Definition: buf.h:23
Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2292
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
Size  itemsz,
OffsetNumber  newitemoff,
int  postingoff,
bool  split_only_page 
)
static

Definition at line 1101 of file nbtinsert.c.

References _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_relbuf(), _bt_split(), _bt_swap_posting(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, BTScanInsertData::allequalimage, Assert, BlockNumberIsValid, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BTreeTupleIsPosting(), BufferGetBlockNumber(), BufferGetPage, BufferIsValid, CopyIndexTuple(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, BTScanInsertData::heapkeyspace, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, ItemIdIsDead, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, OffsetNumberNext, xl_btree_metadata::oldest_btpo_xact, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageAddItem, PageGetFreeSpace(), PageGetItem, PageGetItemId, PageGetSpecialPointer, PageSetLSN, PANIC, pfree(), PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_POST, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().

1111 {
1112  Page page;
1113  BTPageOpaque lpageop;
1114  IndexTuple oposting = NULL;
1115  IndexTuple origitup = NULL;
1116  IndexTuple nposting = NULL;
1117 
1118  page = BufferGetPage(buf);
1119  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1120 
1121  /* child buffer must be given iff inserting on an internal page */
1122  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
1123  /* tuple must have appropriate number of attributes */
1124  Assert(!P_ISLEAF(lpageop) ||
1125  BTreeTupleGetNAtts(itup, rel) ==
1126  IndexRelationGetNumberOfAttributes(rel));
1127  Assert(P_ISLEAF(lpageop) ||
1128  BTreeTupleGetNAtts(itup, rel) <=
1129  IndexRelationGetNumberOfKeyAttributes(rel));
1130  Assert(!BTreeTupleIsPosting(itup));
1131  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1132 
1133  /*
1134  * Every internal page should have exactly one negative infinity item at
1135  * all times. Only _bt_split() and _bt_newroot() should add items that
1136  * become negative infinity items through truncation, since they're the
1137  * only routines that allocate new internal pages.
1138  */
1139  Assert(P_ISLEAF(lpageop) || newitemoff > P_FIRSTDATAKEY(lpageop));
1140 
1141  /* The caller should've finished any incomplete splits already. */
1142  if (P_INCOMPLETE_SPLIT(lpageop))
1143  elog(ERROR, "cannot insert to incompletely split page %u",
1144  BufferGetBlockNumber(buf));
1145
1146  /*
1147  * Do we need to split an existing posting list item?
1148  */
1149  if (postingoff != 0)
1150  {
1151  ItemId itemid = PageGetItemId(page, newitemoff);
1152 
1153  /*
1154  * The new tuple is a duplicate with a heap TID that falls inside the
1155  * range of an existing posting list tuple on a leaf page. Prepare to
1156  * split an existing posting list. Overwriting the posting list with
1157  * its post-split version is treated as an extra step in either the
1158  * insert or page split critical section.
1159  */
1160  Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid));
1161  Assert(itup_key->heapkeyspace && itup_key->allequalimage);
1162  oposting = (IndexTuple) PageGetItem(page, itemid);
1163 
1164  /* use a mutable copy of itup as our itup from here on */
1165  origitup = itup;
1166  itup = CopyIndexTuple(origitup);
1167  nposting = _bt_swap_posting(itup, oposting, postingoff);
1168  /* itup now contains rightmost/max TID from oposting */
1169 
1170  /* Alter offset so that newitem goes after posting list */
1171  newitemoff = OffsetNumberNext(newitemoff);
1172  }
1173 
1174  /*
1175  * Do we need to split the page to fit the item on it?
1176  *
1177  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1178  * so this comparison is correct even though we appear to be accounting
1179  * only for the item and not for its line pointer.
1180  */
1181  if (PageGetFreeSpace(page) < itemsz)
1182  {
1183  bool is_root = P_ISROOT(lpageop);
1184  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
1185  Buffer rbuf;
1186 
1187  Assert(!split_only_page);
1188 
1189  /* split the buffer into left and right halves */
1190  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
1191  origitup, nposting, postingoff);
1192  PredicateLockPageSplit(rel,
1193  BufferGetBlockNumber(buf),
1194  BufferGetBlockNumber(rbuf));
1195 
1196  /*----------
1197  * By here,
1198  *
1199  * + our target page has been split;
1200  * + the original tuple has been inserted;
1201  * + we have write locks on both the old (left half)
1202  * and new (right half) buffers, after the split; and
1203  * + we know the key we want to insert into the parent
1204  * (it's the "high key" on the left child page).
1205  *
1206  * We're ready to do the parent insertion. We need to hold onto the
1207  * locks for the child pages until we locate the parent, but we can
1208  * at least release the lock on the right child before doing the
1209  * actual insertion. The lock on the left child will be released
1210  * last of all by parent insertion, where it is the 'cbuf' of parent
1211  * page.
1212  *----------
1213  */
1214  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
1215  }
1216  else
1217  {
1218  bool isleaf = P_ISLEAF(lpageop);
1219  bool isrightmost = P_RIGHTMOST(lpageop);
1220  Buffer metabuf = InvalidBuffer;
1221  Page metapg = NULL;
1222  BTMetaPageData *metad = NULL;
1223  BlockNumber blockcache;
1224 
1225  /*
1226  * If we are doing this insert because we split a page that was the
1227  * only one on its tree level, but was not the root, it may have been
1228  * the "fast root". We need to ensure that the fast root link points
1229  * at or above the current page. We can safely acquire a lock on the
1230  * metapage here --- see comments for _bt_newroot().
1231  */
1232  if (split_only_page)
1233  {
1234  Assert(!isleaf);
1235  Assert(BufferIsValid(cbuf));
1236 
1237  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1238  metapg = BufferGetPage(metabuf);
1239  metad = BTPageGetMeta(metapg);
1240 
1241  if (metad->btm_fastlevel >= lpageop->btpo.level)
1242  {
1243  /* no update wanted */
1244  _bt_relbuf(rel, metabuf);
1245  metabuf = InvalidBuffer;
1246  }
1247  }
1248 
1249  /* Do the update. No ereport(ERROR) until changes are logged */
1250  START_CRIT_SECTION();
1251
1252  if (postingoff != 0)
1253  memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1254 
1255  if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1256  false) == InvalidOffsetNumber)
1257  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1258  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1259
1260  MarkBufferDirty(buf);
1261
1262  if (BufferIsValid(metabuf))
1263  {
1264  /* upgrade meta-page if needed */
1265  if (metad->btm_version < BTREE_NOVAC_VERSION)
1266  _bt_upgrademetapage(metapg);
1267  metad->btm_fastroot = BufferGetBlockNumber(buf);
1268  metad->btm_fastlevel = lpageop->btpo.level;
1269  MarkBufferDirty(metabuf);
1270  }
1271 
1272  /*
1273  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1274  * finishes a split
1275  */
1276  if (!isleaf)
1277  {
1278  Page cpage = BufferGetPage(cbuf);
1279  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1280 
1281  Assert(P_INCOMPLETE_SPLIT(cpageop));
1282  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1283  MarkBufferDirty(cbuf);
1284  }
1285 
1286  /* XLOG stuff */
1287  if (RelationNeedsWAL(rel))
1288  {
1289  xl_btree_insert xlrec;
1290  xl_btree_metadata xlmeta;
1291  uint8 xlinfo;
1292  XLogRecPtr recptr;
1293  uint16 upostingoff;
1294 
1295  xlrec.offnum = newitemoff;
1296 
1297  XLogBeginInsert();
1298  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1299 
1300  if (isleaf && postingoff == 0)
1301  {
1302  /* Simple leaf insert */
1303  xlinfo = XLOG_BTREE_INSERT_LEAF;
1304  }
1305  else if (postingoff != 0)
1306  {
1307  /*
1308  * Leaf insert with posting list split. Must include
1309  * postingoff field before newitem/orignewitem.
1310  */
1311  Assert(isleaf);
1312  xlinfo = XLOG_BTREE_INSERT_POST;
1313  }
1314  else
1315  {
1316  /* Internal page insert, which finishes a split on cbuf */
1317  xlinfo = XLOG_BTREE_INSERT_UPPER;
1318  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1319
1320  if (BufferIsValid(metabuf))
1321  {
1322  /* Actually, it's an internal page insert + meta update */
1323  xlinfo = XLOG_BTREE_INSERT_META;
1324
1325  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1326  xlmeta.version = metad->btm_version;
1327  xlmeta.root = metad->btm_root;
1328  xlmeta.level = metad->btm_level;
1329  xlmeta.fastroot = metad->btm_fastroot;
1330  xlmeta.fastlevel = metad->btm_fastlevel;
1331  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1332  xlmeta.last_cleanup_num_heap_tuples =
1333  metad->btm_last_cleanup_num_heap_tuples;
1334  xlmeta.allequalimage = metad->btm_allequalimage;
1335 
1336  XLogRegisterBuffer(2, metabuf,
1337  REGBUF_WILL_INIT | REGBUF_STANDARD);
1338  XLogRegisterBufData(2, (char *) &xlmeta,
1339  sizeof(xl_btree_metadata));
1340  }
1341  }
1342 
1343  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1344  if (postingoff == 0)
1345  {
1346  /* Just log itup from caller */
1347  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1348  }
1349  else
1350  {
1351  /*
1352  * Insert with posting list split (XLOG_BTREE_INSERT_POST
1353  * record) case.
1354  *
1355  * Log postingoff. Also log origitup, not itup. REDO routine
1356  * must reconstruct final itup (as well as nposting) using
1357  * _bt_swap_posting().
1358  */
1359  upostingoff = postingoff;
1360 
1361  XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1362  XLogRegisterBufData(0, (char *) origitup,
1363  IndexTupleSize(origitup));
1364  }
1365 
1366  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1367 
1368  if (BufferIsValid(metabuf))
1369  PageSetLSN(metapg, recptr);
1370  if (!isleaf)
1371  PageSetLSN(BufferGetPage(cbuf), recptr);
1372 
1373  PageSetLSN(page, recptr);
1374  }
1375 
1376  END_CRIT_SECTION();
1377 
1378  /* Release subsidiary buffers */
1379  if (BufferIsValid(metabuf))
1380  _bt_relbuf(rel, metabuf);
1381  if (!isleaf)
1382  _bt_relbuf(rel, cbuf);
1383 
1384  /*
1385  * Cache the block number if this is the rightmost leaf page. Cache
1386  * may be used by a future inserter within _bt_search_insert().
1387  */
1388  blockcache = InvalidBlockNumber;
1389  if (isrightmost && isleaf && !P_ISROOT(lpageop))
1390  blockcache = BufferGetBlockNumber(buf);
1391 
1392  /* Release buffer for insertion target block */
1393  _bt_relbuf(rel, buf);
1394 
1395  /*
1396  * If we decided to cache the insertion target block before releasing
1397  * its buffer lock, then cache it now. Check the height of the tree
1398  * first, though. We don't go for the optimization with small
1399  * indexes. Defer final check to this point to ensure that we don't
1400  * call _bt_getrootheight while holding a buffer lock.
1401  */
1402  if (BlockNumberIsValid(blockcache) &&
1403  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1404  RelationSetTargetBlock(rel, blockcache);
1405  }
1406 
1407  /* be tidy */
1408  if (postingoff != 0)
1409  {
1410  /* itup is actually a modified copy of caller's original */
1411  pfree(nposting);
1412  pfree(itup);
1413  }
1414 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:362
bool allequalimage
Definition: nbtxlog.h:57
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:100
uint32 btm_version
Definition: nbtree.h:101
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:603
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:792
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1468
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:214
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BlockNumber root
Definition: nbtxlog.h:51
union BTPageOpaqueData::@45 btpo
unsigned char uint8
Definition: c.h:365
Pointer Item
Definition: item.h:17
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:28
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:79
bool allequalimage
Definition: nbtree.h:660
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
uint32 BlockNumber
Definition: block.h:31
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:2078
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
#define PANIC
Definition: elog.h:53
IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
Definition: nbtdedup.c:774
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:574
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition: nbtinsert.c:1444
BlockNumber btm_fastroot
Definition: nbtree.h:104
unsigned short uint16
Definition: c.h:366
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ERROR
Definition: elog.h:43
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:56
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:26
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:55
#define BTPageGetMeta(p)
Definition: nbtree.h:114
bool btm_allequalimage
Definition: nbtree.h:111
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:510
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:468
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define P_LEFTMOST(opaque)
Definition: nbtree.h:212
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:145
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define BTREE_METAPAGE
Definition: nbtree.h:141
#define P_ISROOT(opaque)
Definition: nbtree.h:215
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:29
uint32 version
Definition: nbtxlog.h:50
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 btm_fastlevel
Definition: nbtree.h:105
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:324
uint32 level
Definition: nbtree.h:62
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:416
OffsetNumber offnum
Definition: nbtxlog.h:81
BlockNumber btm_root
Definition: nbtree.h:102
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
bool heapkeyspace
Definition: nbtree.h:659
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:31
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:548
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:109
#define MAXALIGN(LEN)
Definition: c.h:691
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define RelationNeedsWAL(relation)
Definition: rel.h:562
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:27
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
uint32 fastlevel
Definition: nbtxlog.h:54
uint32 btm_level
Definition: nbtree.h:103
#define elog(elevel,...)
Definition: elog.h:214
uint32 level
Definition: nbtxlog.h:52
BlockNumber fastroot
Definition: nbtxlog.h:53
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:107
#define BT_WRITE
Definition: nbtree.h:588
void XLogBeginInsert(void)
Definition: xloginsert.c:121
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define SizeOfBtreeInsert
Definition: nbtxlog.h:87
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3098
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_newroot()

static Buffer _bt_newroot ( Relation  rel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 2416 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, MarkBufferDirty(), xl_btree_metadata::oldest_btpo_xact, P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NEW, P_NONE, PageAddItem, PageGetItem, PageGetItemId, PageGetSpecialPointer, PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().

2417 {
2418  Buffer rootbuf;
2419  Page lpage,
2420  rootpage;
2421  BlockNumber lbkno,
2422  rbkno;
2423  BlockNumber rootblknum;
2424  BTPageOpaque rootopaque;
2425  BTPageOpaque lopaque;
2426  ItemId itemid;
2427  IndexTuple item;
2428  IndexTuple left_item;
2429  Size left_item_sz;
2430  IndexTuple right_item;
2431  Size right_item_sz;
2432  Buffer metabuf;
2433  Page metapg;
2434  BTMetaPageData *metad;
2435 
2436  lbkno = BufferGetBlockNumber(lbuf);
2437  rbkno = BufferGetBlockNumber(rbuf);
2438  lpage = BufferGetPage(lbuf);
2439  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2440 
2441  /* get a new root page */
2442  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2443  rootpage = BufferGetPage(rootbuf);
2444  rootblknum = BufferGetBlockNumber(rootbuf);
2445 
2446  /* acquire lock on the metapage */
2447  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2448  metapg = BufferGetPage(metabuf);
2449  metad = BTPageGetMeta(metapg);
2450 
2451  /*
2452  * Create downlink item for left page (old root). The key value used is
2453  * "minus infinity", a sentinel value that's reliably less than any real
2454  * key value that could appear in the left page.
2455  */
2456  left_item_sz = sizeof(IndexTupleData);
2457  left_item = (IndexTuple) palloc(left_item_sz);
2458  left_item->t_info = left_item_sz;
2459  BTreeTupleSetDownLink(left_item, lbkno);
2460  BTreeTupleSetNAtts(left_item, 0, false);
2461 
2462  /*
2463  * Create downlink item for right page. The key for it is obtained from
2464  * the "high key" position in the left page.
2465  */
2466  itemid = PageGetItemId(lpage, P_HIKEY);
2467  right_item_sz = ItemIdGetLength(itemid);
2468  item = (IndexTuple) PageGetItem(lpage, itemid);
2469  right_item = CopyIndexTuple(item);
2470  BTreeTupleSetDownLink(right_item, rbkno);
2471 
2472  /* NO EREPORT(ERROR) from here till newroot op is logged */
2473  START_CRIT_SECTION();
2474
2475  /* upgrade metapage if needed */
2476  if (metad->btm_version < BTREE_NOVAC_VERSION)
2477  _bt_upgrademetapage(metapg);
2478 
2479  /* set btree special data */
2480  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2481  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2482  rootopaque->btpo_flags = BTP_ROOT;
2483  rootopaque->btpo.level =
2484  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2485  rootopaque->btpo_cycleid = 0;
2486 
2487  /* update metapage data */
2488  metad->btm_root = rootblknum;
2489  metad->btm_level = rootopaque->btpo.level;
2490  metad->btm_fastroot = rootblknum;
2491  metad->btm_fastlevel = rootopaque->btpo.level;
2492 
2493  /*
2494  * Insert the left page pointer into the new root page. The root page is
2495  * the rightmost page on its level so there is no "high key" in it; the
2496  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2497  *
2498  * Note: we *must* insert the two items in item-number order, for the
2499  * benefit of _bt_restore_page().
2500  */
2501  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2502  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2503  false, false) == InvalidOffsetNumber)
2504  elog(PANIC, "failed to add leftkey to new root page"
2505  " while splitting block %u of index \"%s\"",
2506  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2507
2508  /*
2509  * insert the right page pointer into the new root page.
2510  */
2511  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2512  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2513  IndexRelationGetNumberOfKeyAttributes(rel));
2514  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2515  false, false) == InvalidOffsetNumber)
2516  elog(PANIC, "failed to add rightkey to new root page"
2517  " while splitting block %u of index \"%s\"",
2518  BufferGetBlockNumber(rbuf), RelationGetRelationName(rel));
2519
2520  /* Clear the incomplete-split flag in the left child */
2521  Assert(P_INCOMPLETE_SPLIT(lopaque));
2522  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2523  MarkBufferDirty(lbuf);
2524 
2525  MarkBufferDirty(rootbuf);
2526  MarkBufferDirty(metabuf);
2527 
2528  /* XLOG stuff */
2529  if (RelationNeedsWAL(rel))
2530  {
2531  xl_btree_newroot xlrec;
2532  XLogRecPtr recptr;
2533  xl_btree_metadata md;
2534 
2535  xlrec.rootblk = rootblknum;
2536  xlrec.level = metad->btm_level;
2537 
2538  XLogBeginInsert();
2539  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2540 
2541  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2542  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2543  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2544
2545  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2546  md.version = metad->btm_version;
2547  md.root = rootblknum;
2548  md.level = metad->btm_level;
2549  md.fastroot = rootblknum;
2550  md.fastlevel = metad->btm_level;
2551  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2552  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2553  md.allequalimage = metad->btm_allequalimage;
2554 
2555  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2556 
2557  /*
2558  * Direct access to page is not good but faster - we should implement
2559  * some new func in page API.
2560  */
2561  XLogRegisterBufData(0,
2562  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2563  ((PageHeader) rootpage)->pd_special -
2564  ((PageHeader) rootpage)->pd_upper);
2565 
2566  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2567 
2568  PageSetLSN(lpage, recptr);
2569  PageSetLSN(rootpage, recptr);
2570  PageSetLSN(metapg, recptr);
2571  }
2572 
2573  END_CRIT_SECTION();
2574 
2575  /* done with metapage */
2576  _bt_relbuf(rel, metabuf);
2577 
2578  pfree(left_item);
2579  pfree(right_item);
2580 
2581  return rootbuf;
2582 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:362
bool allequalimage
Definition: nbtxlog.h:57
BlockNumber rootblk
Definition: nbtxlog.h:318
#define BTP_ROOT
Definition: nbtree.h:73
BlockNumber btpo_next
Definition: nbtree.h:59
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:100
uint32 btm_version
Definition: nbtree.h:101
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:792
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1468
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:214
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BlockNumber root
Definition: nbtxlog.h:51
union BTPageOpaqueData::@45 btpo
Pointer Item
Definition: item.h:17
#define P_NONE
Definition: nbtree.h:206
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
uint32 level
Definition: nbtxlog.h:319
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:79
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:91
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
#define PANIC
Definition: elog.h:53
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:463
BlockNumber btm_fastroot
Definition: nbtree.h:104
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:36
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:56
BTCycleId btpo_cycleid
Definition: nbtree.h:66
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:55
#define BTPageGetMeta(p)
Definition: nbtree.h:114
bool btm_allequalimage
Definition: nbtree.h:111
BlockNumber btpo_prev
Definition: nbtree.h:58
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:510
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define P_FIRSTKEY
Definition: nbtree.h:243
#define RelationGetRelationName(relation)
Definition: rel.h:490
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:430
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:145
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define BTREE_METAPAGE
Definition: nbtree.h:141
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:322
uint32 version
Definition: nbtxlog.h:50
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 btm_fastlevel
Definition: nbtree.h:105
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:324
uint32 level
Definition: nbtree.h:62
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:416
struct IndexTupleData IndexTupleData
BlockNumber btm_root
Definition: nbtree.h:102
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:109
#define RelationNeedsWAL(relation)
Definition: rel.h:562
#define P_HIKEY
Definition: nbtree.h:242
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
void * palloc(Size size)
Definition: mcxt.c:949
uint32 fastlevel
Definition: nbtxlog.h:54
uint32 btm_level
Definition: nbtree.h:103
#define elog(elevel,...)
Definition: elog.h:214
uint32 level
Definition: nbtxlog.h:52
BlockNumber fastroot
Definition: nbtxlog.h:53
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:107
unsigned short t_info
Definition: itup.h:49
#define BT_WRITE
Definition: nbtree.h:588
void XLogBeginInsert(void)
Definition: xloginsert.c:121
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
inlinestatic

Definition at line 2603 of file nbtinsert.c.

References BTreeTupleSetNAtts(), InvalidOffsetNumber, PageAddItem, IndexTupleData::t_info, and unlikely.

Referenced by _bt_split().

2608 {
2609  IndexTupleData trunctuple;
2610 
2611  if (newfirstdataitem)
2612  {
2613  trunctuple = *itup;
2614  trunctuple.t_info = sizeof(IndexTupleData);
2615  BTreeTupleSetNAtts(&trunctuple, 0, false);
2616  itup = &trunctuple;
2617  itemsize = sizeof(IndexTupleData);
2618  }
2619 
2620  if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2621  false) == InvalidOffsetNumber))
2622  return false;
2623 
2624  return true;
2625 }
Pointer Item
Definition: item.h:17
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:463
struct IndexTupleData IndexTupleData
#define InvalidOffsetNumber
Definition: off.h:26
#define unlikely(x)
Definition: c.h:206
unsigned short t_info
Definition: itup.h:49

◆ _bt_search_insert()

static BTStack _bt_search_insert ( Relation  rel,
BTInsertState  insertstate 
)
static

Definition at line 296 of file nbtinsert.c.

References _bt_checkpage(), _bt_compare(), _bt_relbuf(), _bt_search(), Assert, BTInsertStateData::bounds_valid, BT_WRITE, BTInsertStateData::buf, BufferGetPage, ConditionalLockBuffer(), InvalidBlockNumber, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup_key, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::postingoff, ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, and ReleaseBuffer().

Referenced by _bt_doinsert().

297 {
298  Assert(insertstate->buf == InvalidBuffer);
299  Assert(!insertstate->bounds_valid);
300  Assert(insertstate->postingoff == 0);
301 
302  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
303  {
304  /* Simulate a _bt_getbuf() call with conditional locking */
305  insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
306  if (ConditionalLockBuffer(insertstate->buf))
307  {
308  Page page;
309  BTPageOpaque lpageop;
310 
311  _bt_checkpage(rel, insertstate->buf);
312  page = BufferGetPage(insertstate->buf);
313  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
314 
315  /*
316  * Check if the page is still the rightmost leaf page and has
317  * enough free space to accommodate the new tuple. Also check
318  * that the insertion scan key is strictly greater than the first
319  * non-pivot tuple on the page. (Note that we expect itup_key's
320  * scantid to be unset when our caller is a checkingunique
321  * inserter.)
322  */
323  if (P_RIGHTMOST(lpageop) &&
324  P_ISLEAF(lpageop) &&
325  !P_IGNORE(lpageop) &&
326  PageGetFreeSpace(page) > insertstate->itemsz &&
327  PageGetMaxOffsetNumber(page) >= P_HIKEY &&
328  _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
329  {
330  /*
331  * Caller can use the fastpath optimization because cached
332  * block is still rightmost leaf page, which can fit caller's
333  * new tuple without splitting. Keep block in local cache for
334  * next insert, and have caller use NULL stack.
335  *
336  * Note that _bt_insert_parent() has an assertion that catches
337  * leaf page splits that somehow follow from a fastpath insert
338  * (it should only be passed a NULL stack when it must deal
339  * with a concurrent root page split, and never because a NULL
340  * stack was returned here).
341  */
342  return NULL;
343  }
344 
345  /* Page unsuitable for caller, drop lock and pin */
346  _bt_relbuf(rel, insertstate->buf);
347  }
348  else
349  {
350  /* Lock unavailable, drop pin */
351  ReleaseBuffer(insertstate->buf);
352  }
353 
354  /* Forget block, since cache doesn't appear to be useful */
355  RelationSetTargetBlock(rel, InvalidBlockNumber);
356  }
357 
358  /* Cannot use optimization -- descend tree, return proper descent stack */
359  return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
360  NULL);
361 }
#define P_IGNORE(opaque)
Definition: nbtree.h:219
bool bounds_valid
Definition: nbtree.h:696
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3483
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
#define RelationGetTargetBlock(relation)
Definition: rel.h:541
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:574
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:725
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:645
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3748
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:738
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:606
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:548
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define P_HIKEY
Definition: nbtree.h:242
#define BT_WRITE
Definition: nbtree.h:588
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:101
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
IndexTuple  orignewitem,
IndexTuple  nposting,
uint16  postingoff 
)
static

Definition at line 1444 of file nbtinsert.c.

References _bt_findsplitloc(), _bt_getbuf(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_truncate(), _bt_vacuum_cycleid(), Assert, BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BTreeTupleIsPosting(), BufferGetBlockNumber(), BufferGetPage, BufferGetPageSize, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_split::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, ItemPointerCompare(), BTPageOpaqueData::level, xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_NEW, P_RIGHTMOST, PageAddItem, PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPage(), PageRestoreTempPage(), PageSetLSN, pfree(), xl_btree_split::postingoff, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, IndexTupleData::t_tid, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_R, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().

1447 {
 /*
  * _bt_split -- split an overfull page (buf) into a left half (rewritten
  * in place over origpage) and a newly allocated right sibling, inserting
  * newitem on the appropriate side; returns the right sibling's buffer.
  *
  * NOTE(review): this listing came from a documentation generator that
  * dropped source lines consisting solely of a hyperlinked call (visible
  * as gaps in the embedded line numbers: orig lines 1675, 1743, 1911,
  * 1923, 1963-1964, 1967, 1969, 2028). Per-gap notes below. Confirm
  * against upstream nbtinsert.c before treating this text as compilable.
  */
1448  Buffer rbuf;
1449  Page origpage;
1450  Page leftpage,
1451  rightpage;
1452  BlockNumber origpagenumber,
1453  rightpagenumber;
1454  BTPageOpaque ropaque,
1455  lopaque,
1456  oopaque;
1457  Buffer sbuf = InvalidBuffer;
1458  Page spage = NULL;
1459  BTPageOpaque sopaque = NULL;
1460  Size itemsz;
1461  ItemId itemid;
1462  IndexTuple firstright,
1463  lefthighkey;
1464  OffsetNumber firstrightoff;
1465  OffsetNumber afterleftoff,
1466  afterrightoff,
1467  minusinfoff;
1468  OffsetNumber origpagepostingoff;
1469  OffsetNumber maxoff;
1470  OffsetNumber i;
1471  bool newitemonleft,
1472  isleaf,
1473  isrightmost;
1474 
1475  /*
1476  * origpage is the original page to be split. leftpage is a temporary
1477  * buffer that receives the left-sibling data, which will be copied back
1478  * into origpage on success. rightpage is the new page that will receive
1479  * the right-sibling data.
1480  *
1481  * leftpage is allocated after choosing a split point. rightpage's new
1482  * buffer isn't acquired until after leftpage is initialized and has new
1483  * high key, the last point where splitting the page may fail (barring
1484  * corruption). Failing before acquiring new buffer won't have lasting
1485  * consequences, since origpage won't have been modified and leftpage is
1486  * only workspace.
1487  */
1488  origpage = BufferGetPage(buf);
1489  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1490  isleaf = P_ISLEAF(oopaque);
1491  isrightmost = P_RIGHTMOST(oopaque);
1492  maxoff = PageGetMaxOffsetNumber(origpage);
1493  origpagenumber = BufferGetBlockNumber(buf);
1494 
1495  /*
1496  * Choose a point to split origpage at.
1497  *
1498  * A split point can be thought of as a point _between_ two existing data
1499  * items on origpage (the lastleft and firstright tuples), provided you
1500  * pretend that the new item that didn't fit is already on origpage.
1501  *
1502  * Since origpage does not actually contain newitem, the representation of
1503  * split points needs to work with two boundary cases: splits where
1504  * newitem is lastleft, and splits where newitem is firstright.
1505  * newitemonleft resolves the ambiguity that would otherwise exist when
1506  * newitemoff == firstrightoff. In all other cases it's clear which side
1507  * of the split every tuple goes on from context. newitemonleft is
1508  * usually (but not always) redundant information.
1509  *
1510  * firstrightoff is supposed to be an origpage offset number, but it's
1511  * possible that its value will be maxoff+1, which is "past the end" of
1512  * origpage. This happens in the rare case where newitem goes after all
1513  * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1514  * origpage at the point that leaves newitem alone on new right page. Any
1515  * "!newitemonleft && newitemoff == firstrightoff" split point makes
1516  * newitem the firstright tuple, though, so this case isn't a special
1517  * case.
1518  */
1519  firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1520  newitem, &newitemonleft);
1521 
1522  /* Allocate temp buffer for leftpage */
1523  leftpage = PageGetTempPage(origpage);
1524  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1525  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1526 
1527  /*
1528  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1529  * and HAS_GARBAGE flags.
1530  */
1531  lopaque->btpo_flags = oopaque->btpo_flags;
1532  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1533  /* set flag in leftpage indicating that rightpage has no downlink yet */
1534  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1535  lopaque->btpo_prev = oopaque->btpo_prev;
1536  /* handle btpo_next after rightpage buffer acquired */
1537  lopaque->btpo.level = oopaque->btpo.level;
1538  /* handle btpo_cycleid after rightpage buffer acquired */
1539 
1540  /*
1541  * Copy the original page's LSN into leftpage, which will become the
1542  * updated version of the page. We need this because XLogInsert will
1543  * examine the LSN and possibly dump it in a page image.
1544  */
1545  PageSetLSN(leftpage, PageGetLSN(origpage));
1546 
1547  /*
1548  * Determine page offset number of existing overlapped-with-orignewitem
1549  * posting list when it is necessary to perform a posting list split in
1550  * passing. Note that newitem was already changed by caller (newitem no
1551  * longer has the orignewitem TID).
1552  *
1553  * This page offset number (origpagepostingoff) will be used to pretend
1554  * that the posting split has already taken place, even though the
1555  * required modifications to origpage won't occur until we reach the
1556  * critical section. The lastleft and firstright tuples of our page split
1557  * point should, in effect, come from an imaginary version of origpage
1558  * that has the nposting tuple instead of the original posting list tuple.
1559  *
1560  * Note: _bt_findsplitloc() should have compensated for coinciding posting
1561  * list splits in just the same way, at least in theory. It doesn't
1562  * bother with that, though. In practice it won't affect its choice of
1563  * split point.
1564  */
1565  origpagepostingoff = InvalidOffsetNumber;
1566  if (postingoff != 0)
1567  {
1568  Assert(isleaf);
1569  Assert(ItemPointerCompare(&orignewitem->t_tid,
1570  &newitem->t_tid) < 0);
1571  Assert(BTreeTupleIsPosting(nposting));
1572  origpagepostingoff = OffsetNumberPrev(newitemoff);
1573  }
1574 
1575  /*
1576  * The high key for the new left page is a possibly-truncated copy of
1577  * firstright on the leaf level (it's "firstright itself" on internal
1578  * pages; see !isleaf comments below). This may seem to be contrary to
1579  * Lehman & Yao's approach of using a copy of lastleft as the new high key
1580  * when splitting on the leaf level. It isn't, though.
1581  *
1582  * Suffix truncation will leave the left page's high key fully equal to
1583  * lastleft when lastleft and firstright are equal prior to heap TID (that
1584  * is, the tiebreaker TID value comes from lastleft). It isn't actually
1585  * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1586  * "subtree" invariant to hold. It's sufficient to make sure that the new
1587  * leaf high key is strictly less than firstright, and greater than or
1588  * equal to (not necessarily equal to) lastleft. In other words, when
1589  * suffix truncation isn't possible during a leaf page split, we take
1590  * L&Y's exact approach to generating a new high key for the left page.
1591  * (Actually, that is slightly inaccurate. We don't just use a copy of
1592  * lastleft. A tuple with all the keys from firstright but the max heap
1593  * TID from lastleft is used, to avoid introducing a special case.)
1594  */
1595  if (!newitemonleft && newitemoff == firstrightoff)
1596  {
1597  /* incoming tuple becomes firstright */
1598  itemsz = newitemsz;
1599  firstright = newitem;
1600  }
1601  else
1602  {
1603  /* existing item at firstrightoff becomes firstright */
1604  itemid = PageGetItemId(origpage, firstrightoff);
1605  itemsz = ItemIdGetLength(itemid);
1606  firstright = (IndexTuple) PageGetItem(origpage, itemid);
1607  if (firstrightoff == origpagepostingoff)
1608  firstright = nposting;
1609  }
1610 
1611  if (isleaf)
1612  {
1613  IndexTuple lastleft;
1614 
1615  /* Attempt suffix truncation for leaf page splits */
1616  if (newitemonleft && newitemoff == firstrightoff)
1617  {
1618  /* incoming tuple becomes lastleft */
1619  lastleft = newitem;
1620  }
1621  else
1622  {
1623  OffsetNumber lastleftoff;
1624 
1625  /* existing item before firstrightoff becomes lastleft */
1626  lastleftoff = OffsetNumberPrev(firstrightoff);
1627  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1628  itemid = PageGetItemId(origpage, lastleftoff);
1629  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1630  if (lastleftoff == origpagepostingoff)
1631  lastleft = nposting;
1632  }
1633 
1634  lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1635  itemsz = IndexTupleSize(lefthighkey);
1636  }
1637  else
1638  {
1639  /*
1640  * Don't perform suffix truncation on a copy of firstright to make
1641  * left page high key for internal page splits. Must use firstright
1642  * as new high key directly.
1643  *
1644  * Each distinct separator key value originates as a leaf level high
1645  * key; all other separator keys/pivot tuples are copied from one
1646  * level down. A separator key in a grandparent page must be
1647  * identical to high key in rightmost parent page of the subtree to
1648  * its left, which must itself be identical to high key in rightmost
1649  * child page of that same subtree (this even applies to separator
1650  * from grandparent's high key). There must always be an unbroken
1651  * "seam" of identical separator keys that guide index scans at every
1652  * level, starting from the grandparent. That's why suffix truncation
1653  * is unsafe here.
1654  *
1655  * Internal page splits will truncate firstright into a "negative
1656  * infinity" data item when it gets inserted on the new right page
1657  * below, though. This happens during the call to _bt_pgaddtup() for
1658  * the new first data item for right page. Do not confuse this
1659  * mechanism with suffix truncation. It is just a convenient way of
1660  * implementing page splits that split the internal page "inside"
1661  * firstright. The lefthighkey separator key cannot appear a second
1662  * time in the right page (only firstright's downlink goes in right
1663  * page).
1664  */
1665  lefthighkey = firstright;
1666  }
1667 
1668  /*
1669  * Add new high key to leftpage
1670  */
1671  afterleftoff = P_HIKEY;
1672 
1673  Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1674  Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
 /*
  * NOTE(review): orig line 1675 -- presumably the continuation
  * "IndexRelationGetNumberOfKeyAttributes(rel));" -- was dropped by the
  * renderer (number gap 1674 -> 1676); confirm against upstream.
  */
1676  Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1677  if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1678  false) == InvalidOffsetNumber)
1679  elog(ERROR, "failed to add high key to the left sibling"
1680  " while splitting block %u of index \"%s\"",
1681  origpagenumber, RelationGetRelationName(rel));
1682  afterleftoff = OffsetNumberNext(afterleftoff);
1683 
1684  /*
1685  * Acquire a new right page to split into, now that left page has a new
1686  * high key. From here on, it's not okay to throw an error without
1687  * zeroing rightpage first. This coding rule ensures that we won't
1688  * confuse future VACUUM operations, which might otherwise try to re-find
1689  * a downlink to a leftover junk page as the page undergoes deletion.
1690  *
1691  * It would be reasonable to start the critical section just after the new
1692  * rightpage buffer is acquired instead; that would allow us to avoid
1693  * leftover junk pages without bothering to zero rightpage. We do it this
1694  * way because it avoids an unnecessary PANIC when either origpage or its
1695  * existing sibling page are corrupt.
1696  */
1697  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1698  rightpage = BufferGetPage(rbuf);
1699  rightpagenumber = BufferGetBlockNumber(rbuf);
1700  /* rightpage was initialized by _bt_getbuf */
1701  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1702 
1703  /*
1704  * Finish off remaining leftpage special area fields. They cannot be set
1705  * before both origpage (leftpage) and rightpage buffers are acquired and
1706  * locked.
1707  *
1708  * btpo_cycleid is only used with leaf pages, though we set it here in all
1709  * cases just to be consistent.
1710  */
1711  lopaque->btpo_next = rightpagenumber;
1712  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1713 
1714  /*
1715  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1716  * and HAS_GARBAGE flags.
1717  */
1718  ropaque->btpo_flags = oopaque->btpo_flags;
1719  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1720  ropaque->btpo_prev = origpagenumber;
1721  ropaque->btpo_next = oopaque->btpo_next;
1722  ropaque->btpo.level = oopaque->btpo.level;
1723  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1724 
1725  /*
1726  * Add new high key to rightpage where necessary.
1727  *
1728  * If the page we're splitting is not the rightmost page at its level in
1729  * the tree, then the first entry on the page is the high key from
1730  * origpage.
1731  */
1732  afterrightoff = P_HIKEY;
1733 
1734  if (!isrightmost)
1735  {
1736  IndexTuple righthighkey;
1737 
1738  itemid = PageGetItemId(origpage, P_HIKEY);
1739  itemsz = ItemIdGetLength(itemid);
1740  righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1741  Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1742  Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
 /*
  * NOTE(review): orig line 1743 -- presumably
  * "IndexRelationGetNumberOfKeyAttributes(rel));" -- dropped by the
  * renderer (number gap 1742 -> 1744); confirm against upstream.
  */
1744  if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1745  false, false) == InvalidOffsetNumber)
1746  {
1747  memset(rightpage, 0, BufferGetPageSize(rbuf));
1748  elog(ERROR, "failed to add high key to the right sibling"
1749  " while splitting block %u of index \"%s\"",
1750  origpagenumber, RelationGetRelationName(rel));
1751  }
1752  afterrightoff = OffsetNumberNext(afterrightoff);
1753  }
1754 
1755  /*
1756  * Internal page splits truncate first data item on right page -- it
1757  * becomes "minus infinity" item for the page. Set this up here.
1758  */
1759  minusinfoff = InvalidOffsetNumber;
1760  if (!isleaf)
1761  minusinfoff = afterrightoff;
1762 
1763  /*
1764  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1765  * additional pivot tuples in !isleaf case) to the appropriate page.
1766  *
1767  * Note: we *must* insert at least the right page's items in item-number
1768  * order, for the benefit of _bt_restore_page().
1769  */
1770  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1771  {
1772  IndexTuple dataitem;
1773 
1774  itemid = PageGetItemId(origpage, i);
1775  itemsz = ItemIdGetLength(itemid);
1776  dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1777 
1778  /* replace original item with nposting due to posting split? */
1779  if (i == origpagepostingoff)
1780  {
1781  Assert(BTreeTupleIsPosting(dataitem));
1782  Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1783  dataitem = nposting;
1784  }
1785 
1786  /* does new item belong before this one? */
1787  else if (i == newitemoff)
1788  {
1789  if (newitemonleft)
1790  {
1791  Assert(newitemoff <= firstrightoff);
1792  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1793  false))
1794  {
1795  memset(rightpage, 0, BufferGetPageSize(rbuf));
1796  elog(ERROR, "failed to add new item to the left sibling"
1797  " while splitting block %u of index \"%s\"",
1798  origpagenumber, RelationGetRelationName(rel));
1799  }
1800  afterleftoff = OffsetNumberNext(afterleftoff);
1801  }
1802  else
1803  {
1804  Assert(newitemoff >= firstrightoff);
1805  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1806  afterrightoff == minusinfoff))
1807  {
1808  memset(rightpage, 0, BufferGetPageSize(rbuf));
1809  elog(ERROR, "failed to add new item to the right sibling"
1810  " while splitting block %u of index \"%s\"",
1811  origpagenumber, RelationGetRelationName(rel));
1812  }
1813  afterrightoff = OffsetNumberNext(afterrightoff);
1814  }
1815  }
1816 
1817  /* decide which page to put it on */
1818  if (i < firstrightoff)
1819  {
1820  if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1821  {
1822  memset(rightpage, 0, BufferGetPageSize(rbuf));
1823  elog(ERROR, "failed to add old item to the left sibling"
1824  " while splitting block %u of index \"%s\"",
1825  origpagenumber, RelationGetRelationName(rel));
1826  }
1827  afterleftoff = OffsetNumberNext(afterleftoff);
1828  }
1829  else
1830  {
1831  if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1832  afterrightoff == minusinfoff))
1833  {
1834  memset(rightpage, 0, BufferGetPageSize(rbuf));
1835  elog(ERROR, "failed to add old item to the right sibling"
1836  " while splitting block %u of index \"%s\"",
1837  origpagenumber, RelationGetRelationName(rel));
1838  }
1839  afterrightoff = OffsetNumberNext(afterrightoff);
1840  }
1841  }
1842 
1843  /* Handle case where newitem goes at the end of rightpage */
1844  if (i <= newitemoff)
1845  {
1846  /*
1847  * Can't have newitemonleft here; that would imply we were told to put
1848  * *everything* on the left page, which cannot fit (if it could, we'd
1849  * not be splitting the page).
1850  */
1851  Assert(!newitemonleft && newitemoff == maxoff + 1);
1852  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1853  afterrightoff == minusinfoff))
1854  {
1855  memset(rightpage, 0, BufferGetPageSize(rbuf));
1856  elog(ERROR, "failed to add new item to the right sibling"
1857  " while splitting block %u of index \"%s\"",
1858  origpagenumber, RelationGetRelationName(rel));
1859  }
1860  afterrightoff = OffsetNumberNext(afterrightoff);
1861  }
1862 
1863  /*
1864  * We have to grab the right sibling (if any) and fix the prev pointer
1865  * there. We are guaranteed that this is deadlock-free since no other
1866  * writer will be holding a lock on that page and trying to move left, and
1867  * all readers release locks on a page before trying to fetch its
1868  * neighbors.
1869  */
1870  if (!isrightmost)
1871  {
1872  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1873  spage = BufferGetPage(sbuf);
1874  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1875  if (sopaque->btpo_prev != origpagenumber)
1876  {
1877  memset(rightpage, 0, BufferGetPageSize(rbuf));
1878  ereport(ERROR,
1879  (errcode(ERRCODE_INDEX_CORRUPTED),
1880  errmsg_internal("right sibling's left-link doesn't match: "
1881  "block %u links to %u instead of expected %u in index \"%s\"",
1882  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1883  RelationGetRelationName(rel))));
1884  }
1885 
1886  /*
1887  * Check to see if we can set the SPLIT_END flag in the right-hand
1888  * split page; this can save some I/O for vacuum since it need not
1889  * proceed to the right sibling. We can set the flag if the right
1890  * sibling has a different cycleid: that means it could not be part of
1891  * a group of pages that were all split off from the same ancestor
1892  * page. If you're confused, imagine that page A splits to A B and
1893  * then again, yielding A C B, while vacuum is in progress. Tuples
1894  * originally in A could now be in either B or C, hence vacuum must
1895  * examine both pages. But if D, our right sibling, has a different
1896  * cycleid then it could not contain any tuples that were in A when
1897  * the vacuum started.
1898  */
1899  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1900  ropaque->btpo_flags |= BTP_SPLIT_END;
1901  }
1902 
1903  /*
1904  * Right sibling is locked, new siblings are prepared, but original page
1905  * is not updated yet.
1906  *
1907  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1908  * not starting the critical section till here because we haven't been
1909  * scribbling on the original page yet; see comments above.
1910  */
 /*
  * NOTE(review): orig line 1911 -- presumably "START_CRIT_SECTION();" --
  * dropped by the renderer (number gap 1910 -> 1912; the matching
  * END_CRIT_SECTION() appears at orig line 2043); confirm upstream.
  */
1912 
1913  /*
1914  * By here, the original data page has been split into two new halves, and
1915  * these are correct. The algorithm requires that the left page never
1916  * move during a split, so we copy the new left page back on top of the
1917  * original. We need to do this before writing the WAL record, so that
1918  * XLogInsert can WAL log an image of the page if necessary.
1919  */
1920  PageRestoreTempPage(leftpage, origpage);
1921  /* leftpage, lopaque must not be used below here */
1922 
 /*
  * NOTE(review): orig line 1923 -- presumably "MarkBufferDirty(buf);" --
  * dropped by the renderer (number gap 1922 -> 1924); confirm upstream.
  */
1924  MarkBufferDirty(rbuf);
1925 
1926  if (!isrightmost)
1927  {
1928  sopaque->btpo_prev = rightpagenumber;
1929  MarkBufferDirty(sbuf);
1930  }
1931 
1932  /*
1933  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1934  * a split
1935  */
1936  if (!isleaf)
1937  {
1938  Page cpage = BufferGetPage(cbuf);
1939  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1940 
1941  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1942  MarkBufferDirty(cbuf);
1943  }
1944 
1945  /* XLOG stuff */
1946  if (RelationNeedsWAL(rel))
1947  {
1948  xl_btree_split xlrec;
1949  uint8 xlinfo;
1950  XLogRecPtr recptr;
1951 
1952  xlrec.level = ropaque->btpo.level;
1953  /* See comments below on newitem, orignewitem, and posting lists */
1954  xlrec.firstrightoff = firstrightoff;
1955  xlrec.newitemoff = newitemoff;
1956  xlrec.postingoff = 0;
1957  if (postingoff != 0 && origpagepostingoff < firstrightoff)
1958  xlrec.postingoff = postingoff;
1959 
1960  XLogBeginInsert();
1961  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1962 
 /*
  * NOTE(review): orig lines 1963-1964 -- presumably the
  * XLogRegisterBuffer() calls for buf (block 0) and rbuf (block 1) --
  * dropped by the renderer (number gap 1962 -> 1965); confirm upstream.
  */
1965  /* Log original right sibling, since we've changed its prev-pointer */
1966  if (!isrightmost)
 /*
  * NOTE(review): orig line 1967 -- presumably the XLogRegisterBuffer()
  * call for sbuf -- dropped by the renderer; confirm upstream.
  */
1968  if (!isleaf)
 /*
  * NOTE(review): orig line 1969 -- presumably the XLogRegisterBuffer()
  * call for cbuf -- dropped by the renderer; confirm upstream.
  */
1970 
1971  /*
1972  * Log the new item, if it was inserted on the left page. (If it was
1973  * put on the right page, we don't need to explicitly WAL log it
1974  * because it's included with all the other items on the right page.)
1975  * Show the new item as belonging to the left page buffer, so that it
1976  * is not stored if XLogInsert decides it needs a full-page image of
1977  * the left page. We always store newitemoff in the record, though.
1978  *
1979  * The details are sometimes slightly different for page splits that
1980  * coincide with a posting list split. If both the replacement
1981  * posting list and newitem go on the right page, then we don't need
1982  * to log anything extra, just like the simple !newitemonleft
1983  * no-posting-split case (postingoff is set to zero in the WAL record,
1984  * so recovery doesn't need to process a posting list split at all).
1985  * Otherwise, we set postingoff and log orignewitem instead of
1986  * newitem, despite having actually inserted newitem. REDO routine
1987  * must reconstruct nposting and newitem using _bt_swap_posting().
1988  *
1989  * Note: It's possible that our page split point is the point that
1990  * makes the posting list lastleft and newitem firstright. This is
1991  * the only case where we log orignewitem/newitem despite newitem
1992  * going on the right page. If XLogInsert decides that it can omit
1993  * orignewitem due to logging a full-page image of the left page,
1994  * everything still works out, since recovery only needs to log
1995  * orignewitem for items on the left page (just like the regular
1996  * newitem-logged case).
1997  */
1998  if (newitemonleft && xlrec.postingoff == 0)
1999  XLogRegisterBufData(0, (char *) newitem, newitemsz);
2000  else if (xlrec.postingoff != 0)
2001  {
2002  Assert(isleaf);
2003  Assert(newitemonleft || firstrightoff == newitemoff);
2004  Assert(newitemsz == IndexTupleSize(orignewitem));
2005  XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2006  }
2007 
2008  /* Log the left page's new high key */
2009  if (!isleaf)
2010  {
2011  /* lefthighkey isn't local copy, get current pointer */
2012  itemid = PageGetItemId(origpage, P_HIKEY);
2013  lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2014  }
2015  XLogRegisterBufData(0, (char *) lefthighkey,
2016  MAXALIGN(IndexTupleSize(lefthighkey)));
2017 
2018  /*
2019  * Log the contents of the right page in the format understood by
2020  * _bt_restore_page(). The whole right page will be recreated.
2021  *
2022  * Direct access to page is not good but faster - we should implement
2023  * some new func in page API. Note we only store the tuples
2024  * themselves, knowing that they were inserted in item-number order
2025  * and so the line pointers can be reconstructed. See comments for
2026  * _bt_restore_page().
2027  */
 /*
  * NOTE(review): orig line 2028 -- presumably the opening
  * "XLogRegisterBufData(1, ..." of this call -- dropped by the
  * renderer (number gap 2026 -> 2029); confirm upstream.
  */
2029  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2030  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2031 
2032  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2033  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2034 
2035  PageSetLSN(origpage, recptr);
2036  PageSetLSN(rightpage, recptr);
2037  if (!isrightmost)
2038  PageSetLSN(spage, recptr);
2039  if (!isleaf)
2040  PageSetLSN(BufferGetPage(cbuf), recptr);
2041  }
2042 
2043  END_CRIT_SECTION();
2044 
2045  /* release the old right sibling */
2046  if (!isrightmost)
2047  _bt_relbuf(rel, sbuf);
2048 
2049  /* release the child */
2050  if (!isleaf)
2051  _bt_relbuf(rel, cbuf);
2052 
2053  /* be tidy */
2054  if (isleaf)
2055  pfree(lefthighkey);
2056 
2057  /* split's done */
2058  return rbuf;
2059 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:362
#define BTP_ROOT
Definition: nbtree.h:73
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:1934
#define BTP_SPLIT_END
Definition: nbtree.h:77
BlockNumber btpo_next
Definition: nbtree.h:59
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:403
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:792
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1468
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:214
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define SizeOfBtreeSplit
Definition: nbtxlog.h:161
ItemPointerData t_tid
Definition: itup.h:37
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
union BTPageOpaqueData::@45 btpo
unsigned char uint8
Definition: c.h:365
Pointer Item
Definition: item.h:17
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
int errcode(int sqlerrcode)
Definition: elog.c:610
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:79
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:91
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2199
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ERROR
Definition: elog.h:43
OffsetNumber newitemoff
Definition: nbtxlog.h:157
BTCycleId btpo_cycleid
Definition: nbtree.h:66
BlockNumber btpo_prev
Definition: nbtree.h:58
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:130
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:324
uint32 level
Definition: nbtree.h:62
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:416
Page PageGetTempPage(Page page)
Definition: bufpage.c:344
uint32 level
Definition: nbtxlog.h:155
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:156
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:30
#define ereport(elevel,...)
Definition: elog.h:144
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
int errmsg_internal(const char *fmt,...)
Definition: elog.c:911
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define MAXALIGN(LEN)
Definition: c.h:691
#define RelationNeedsWAL(relation)
Definition: rel.h:562
#define PageGetLSN(page)
Definition: bufpage.h:366
#define P_HIKEY
Definition: nbtree.h:242
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
#define elog(elevel,...)
Definition: elog.h:214
int i
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:959
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2603
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:29
#define BT_WRITE
Definition: nbtree.h:588
uint16 postingoff
Definition: nbtxlog.h:158
void XLogBeginInsert(void)
Definition: xloginsert.c:121
uint16 btpo_flags
Definition: nbtree.h:65
OffsetNumber firstrightoff
Definition: nbtxlog.h:156
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_stepright()

static void _bt_stepright ( Relation  rel,
BTInsertState  insertstate,
BTStack  stack 
)
static

Definition at line 1025 of file nbtinsert.c.

References _bt_finish_split(), _bt_relandgetbuf(), _bt_relbuf(), BTInsertStateData::bounds_valid, BT_WRITE, BTPageOpaqueData::btpo_next, BTInsertStateData::buf, BufferGetPage, elog, ERROR, InvalidBuffer, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_findinsertloc().

1026 {
1027  Page page;
1028  BTPageOpaque lpageop;
1029  Buffer rbuf;
1030  BlockNumber rblkno;
1031 
1032  page = BufferGetPage(insertstate->buf);
1033  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1034 
1035  rbuf = InvalidBuffer;
1036  rblkno = lpageop->btpo_next;
1037  for (;;)
1038  {
1039  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1040  page = BufferGetPage(rbuf);
1041  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1042 
1043  /*
1044  * If this page was incompletely split, finish the split now. We do
1045  * this while holding a lock on the left sibling, which is not good
1046  * because finishing the split could be a fairly lengthy operation.
1047  * But this should happen very seldom.
1048  */
1049  if (P_INCOMPLETE_SPLIT(lpageop))
1050  {
1051  _bt_finish_split(rel, rbuf, stack);
1052  rbuf = InvalidBuffer;
1053  continue;
1054  }
1055 
1056  if (!P_IGNORE(lpageop))
1057  break;
1058  if (P_RIGHTMOST(lpageop))
1059  elog(ERROR, "fell off the end of index \"%s\"",
1061 
1062  rblkno = lpageop->btpo_next;
1063  }
1064  /* rbuf locked; unlock buf, update state for caller */
1065  _bt_relbuf(rel, insertstate->buf);
1066  insertstate->buf = rbuf;
1067  insertstate->bounds_valid = false;
1068 }
BlockNumber btpo_next
Definition: nbtree.h:59
#define P_IGNORE(opaque)
Definition: nbtree.h:219
bool bounds_valid
Definition: nbtree.h:696
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:928
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2215
#define ERROR
Definition: elog.h:43
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:947
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define elog(elevel,...)
Definition: elog.h:214
#define BT_WRITE
Definition: nbtree.h:588
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
Pointer Page
Definition: bufpage.h:78

◆ _bt_vacuum_one_page()

static void _bt_vacuum_one_page ( Relation  rel,
Buffer  buffer,
Relation  heapRel 
)
static

Definition at line 2635 of file nbtinsert.c.

References _bt_delitems_delete(), Assert, BufferGetPage, ItemIdIsDead, MaxIndexTuplesPerPage, OffsetNumberNext, P_FIRSTDATAKEY, P_ISLEAF, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc().

2636 {
2638  int ndeletable = 0;
2639  OffsetNumber offnum,
2640  minoff,
2641  maxoff;
2642  Page page = BufferGetPage(buffer);
2644 
2645  Assert(P_ISLEAF(opaque));
2646 
2647  /*
2648  * Scan over all items to see which ones need to be deleted according to
2649  * LP_DEAD flags.
2650  */
2651  minoff = P_FIRSTDATAKEY(opaque);
2652  maxoff = PageGetMaxOffsetNumber(page);
2653  for (offnum = minoff;
2654  offnum <= maxoff;
2655  offnum = OffsetNumberNext(offnum))
2656  {
2657  ItemId itemId = PageGetItemId(page, offnum);
2658 
2659  if (ItemIdIsDead(itemId))
2660  deletable[ndeletable++] = offnum;
2661  }
2662 
2663  if (ndeletable > 0)
2664  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2665 
2666  /*
2667  * Note: if we didn't find any LP_DEAD items, then the page's
2668  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2669  * separate write to clear it, however. We will clear it when we split
2670  * the page, or when deduplication runs.
2671  */
2672 }
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel)
Definition: nbtpage.c:1183
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define Assert(condition)
Definition: c.h:738
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MaxIndexTuplesPerPage
Definition: itup.h:145
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214