PostgreSQL Source Code  git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
Include dependency graph for nbtinsert.c:

Go to the source code of this file.

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static Buffer _bt_newroot (Relation rel, Buffer lbuf, Buffer rbuf)
 
static TransactionId _bt_check_unique (Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static OffsetNumber _bt_findinsertloc (Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
 
static void _bt_stepright (Relation rel, BTInsertState insertstate, BTStack stack)
 
static void _bt_insertonpg (Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
 
static void _bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
 
static bool _bt_pgaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
 
static void _bt_vacuum_one_page (Relation rel, Buffer buffer, Relation heapRel)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 29 of file nbtinsert.c.

Referenced by _bt_insertonpg().

Function Documentation

◆ _bt_check_unique()

/*
 *	_bt_check_unique() -- Check for violation of unique index constraint
 *
 * Returns InvalidTransactionId if there is no conflict, else an xact ID we
 * must wait for to see if it commits a conflicting tuple.   If an actual
 * conflict is detected, no return --- we raise an error instead.
 *
 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
 * InvalidTransactionId because we don't want to wait.  In this case we
 * set *is_unique to false if there is a potential conflict, and the
 * core code must redo the uniqueness check later.
 *
 * As a side effect, sets state in insertstate that can later be used by
 * _bt_findinsertloc() to reuse most of the binary search work we do here.
 *
 * Do not call here when there are NULL values in scan key.  NULL should be
 * considered unequal to NULL when checking for duplicates, but we are not
 * prepared to handle that correctly.
 */
static TransactionId
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
				 IndexUniqueCheck checkUnique, bool *is_unique,
				 uint32 *speculativeToken)
{
	IndexTuple	itup = insertstate->itup;
	BTScanInsert itup_key = insertstate->itup_key;
	SnapshotData SnapshotDirty;
	OffsetNumber offset;
	OffsetNumber maxoff;
	Page		page;
	BTPageOpaque opaque;
	Buffer		nbuf = InvalidBuffer;
	bool		found = false;

	/* Assume unique until we find a duplicate */
	*is_unique = true;

	InitDirtySnapshot(SnapshotDirty);

	page = BufferGetPage(insertstate->buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * Find the first tuple with the same key.
	 *
	 * This also saves the binary search bounds in insertstate.  We use them
	 * in the fastpath below, but also in the _bt_findinsertloc() call later.
	 */
	Assert(!insertstate->bounds_valid);
	offset = _bt_binsrch_insert(rel, insertstate);

	/*
	 * Scan over all equal tuples, looking for live conflicts.
	 */
	Assert(!insertstate->bounds_valid || insertstate->low == offset);
	Assert(!itup_key->anynullkeys);
	Assert(itup_key->scantid == NULL);
	for (;;)
	{
		ItemId		curitemid;
		IndexTuple	curitup;
		BlockNumber nblkno;

		/*
		 * make sure the offset points to an actual item before trying to
		 * examine it...
		 */
		if (offset <= maxoff)
		{
			/*
			 * Fastpath: In most cases, we can use cached search bounds to
			 * limit our consideration to items that are definitely
			 * duplicates.  This fastpath doesn't apply when the original page
			 * is empty, or when initial offset is past the end of the
			 * original page, which may indicate that we need to examine a
			 * second or subsequent page.
			 *
			 * Note that this optimization allows us to avoid calling
			 * _bt_compare() directly when there are no duplicates, as long as
			 * the offset where the key will go is not at the end of the page.
			 */
			if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
			{
				Assert(insertstate->bounds_valid);
				Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
				Assert(insertstate->low <= insertstate->stricthigh);
				Assert(_bt_compare(rel, itup_key, page, offset) < 0);
				break;
			}

			curitemid = PageGetItemId(page, offset);

			/*
			 * We can skip items that are marked killed.
			 *
			 * In the presence of heavy update activity an index may contain
			 * many killed items with the same key; running _bt_compare() on
			 * each killed item gets expensive.  Just advance over killed
			 * items as quickly as we can.  We only apply _bt_compare() when
			 * we get to a non-killed item.  Even those comparisons could be
			 * avoided (in the common case where there is only one page to
			 * visit) by reusing bounds, but just skipping dead items is fast
			 * enough.
			 */
			if (!ItemIdIsDead(curitemid))
			{
				ItemPointerData htid;
				bool		all_dead;

				if (_bt_compare(rel, itup_key, page, offset) != 0)
					break;		/* we're past all the equal tuples */

				/* okay, we gotta fetch the heap tuple ... */
				curitup = (IndexTuple) PageGetItem(page, curitemid);
				htid = curitup->t_tid;

				/*
				 * If we are doing a recheck, we expect to find the tuple we
				 * are rechecking.  It's not a duplicate, but we have to keep
				 * scanning.
				 */
				if (checkUnique == UNIQUE_CHECK_EXISTING &&
					ItemPointerCompare(&htid, &itup->t_tid) == 0)
				{
					found = true;
				}

				/*
				 * Check if there's any table tuples for this index entry
				 * satisfying SnapshotDirty. This is necessary because for AMs
				 * with optimizations like heap's HOT, we have just a single
				 * index entry for the entire chain.
				 */
				else if (table_index_fetch_tuple_check(heapRel, &htid,
													   &SnapshotDirty,
													   &all_dead))
				{
					TransactionId xwait;

					/*
					 * It is a duplicate. If we are only doing a partial
					 * check, then don't bother checking if the tuple is being
					 * updated in another transaction. Just return the fact
					 * that it is a potential conflict and leave the full
					 * check till later. Don't invalidate binary search
					 * bounds.
					 */
					if (checkUnique == UNIQUE_CHECK_PARTIAL)
					{
						if (nbuf != InvalidBuffer)
							_bt_relbuf(rel, nbuf);
						*is_unique = false;
						return InvalidTransactionId;
					}

					/*
					 * If this tuple is being updated by other transaction
					 * then we have to wait for its commit/abort.
					 */
					xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
						SnapshotDirty.xmin : SnapshotDirty.xmax;

					if (TransactionIdIsValid(xwait))
					{
						if (nbuf != InvalidBuffer)
							_bt_relbuf(rel, nbuf);
						/* Tell _bt_doinsert to wait... */
						*speculativeToken = SnapshotDirty.speculativeToken;
						/* Caller releases lock on buf immediately */
						insertstate->bounds_valid = false;
						return xwait;
					}

					/*
					 * Otherwise we have a definite conflict.  But before
					 * complaining, look to see if the tuple we want to insert
					 * is itself now committed dead --- if so, don't complain.
					 * This is a waste of time in normal scenarios but we must
					 * do it to support CREATE INDEX CONCURRENTLY.
					 *
					 * We must follow HOT-chains here because during
					 * concurrent index build, we insert the root TID though
					 * the actual tuple may be somewhere in the HOT-chain.
					 * While following the chain we might not stop at the
					 * exact tuple which triggered the insert, but that's OK
					 * because if we find a live tuple anywhere in this chain,
					 * we have a unique key conflict.  The other live tuple is
					 * not part of this chain because it had a different index
					 * entry.
					 */
					htid = itup->t_tid;
					if (table_index_fetch_tuple_check(heapRel, &htid,
													  SnapshotSelf, NULL))
					{
						/* Normal case --- it's still live */
					}
					else
					{
						/*
						 * It's been deleted, so no error, and no need to
						 * continue searching
						 */
						break;
					}

					/*
					 * Check for a conflict-in as we would if we were going to
					 * write to this page.  We aren't actually going to write,
					 * but we want a chance to report SSI conflicts that would
					 * otherwise be masked by this unique constraint
					 * violation.
					 */
					CheckForSerializableConflictIn(rel, NULL, insertstate->buf);

					/*
					 * This is a definite conflict.  Break the tuple down into
					 * datums and report the error.  But first, make sure we
					 * release the buffer locks we're holding ---
					 * BuildIndexValueDescription could make catalog accesses,
					 * which in the worst case might touch this same index and
					 * cause deadlocks.
					 */
					if (nbuf != InvalidBuffer)
						_bt_relbuf(rel, nbuf);
					_bt_relbuf(rel, insertstate->buf);
					insertstate->buf = InvalidBuffer;
					insertstate->bounds_valid = false;

					{
						Datum		values[INDEX_MAX_KEYS];
						bool		isnull[INDEX_MAX_KEYS];
						char	   *key_desc;

						index_deform_tuple(itup, RelationGetDescr(rel),
										   values, isnull);

						key_desc = BuildIndexValueDescription(rel, values,
															  isnull);

						ereport(ERROR,
								(errcode(ERRCODE_UNIQUE_VIOLATION),
								 errmsg("duplicate key value violates unique constraint \"%s\"",
										RelationGetRelationName(rel)),
								 key_desc ? errdetail("Key %s already exists.",
													  key_desc) : 0,
								 errtableconstraint(heapRel,
													RelationGetRelationName(rel))));
					}
				}
				else if (all_dead)
				{
					/*
					 * The conflicting tuple (or whole HOT chain) is dead to
					 * everyone, so we may as well mark the index entry
					 * killed.
					 */
					ItemIdMarkDead(curitemid);
					opaque->btpo_flags |= BTP_HAS_GARBAGE;

					/*
					 * Mark buffer with a dirty hint, since state is not
					 * crucial. Be sure to mark the proper buffer dirty.
					 */
					if (nbuf != InvalidBuffer)
						MarkBufferDirtyHint(nbuf, true);
					else
						MarkBufferDirtyHint(insertstate->buf, true);
				}
			}
		}

		/*
		 * Advance to next tuple to continue checking.
		 */
		if (offset < maxoff)
			offset = OffsetNumberNext(offset);
		else
		{
			int			highkeycmp;

			/* If scankey == hikey we gotta check the next page too */
			if (P_RIGHTMOST(opaque))
				break;
			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
			Assert(highkeycmp <= 0);
			if (highkeycmp != 0)
				break;
			/* Advance to next non-dead page --- there must be one */
			for (;;)
			{
				nblkno = opaque->btpo_next;
				nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
				page = BufferGetPage(nbuf);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_IGNORE(opaque))
					break;
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
			}
			maxoff = PageGetMaxOffsetNumber(page);
			offset = P_FIRSTDATAKEY(opaque);
			/* Don't invalidate binary search bounds */
		}
	}

	/*
	 * If we are doing a recheck then we should have found the tuple we are
	 * checking.  Otherwise there's something very wrong --- probably, the
	 * index is on a non-immutable expression.
	 */
	if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("failed to re-find tuple within index \"%s\"",
						RelationGetRelationName(rel)),
				 errhint("This may be because of a non-immutable index expression."),
				 errtableconstraint(heapRel,
									RelationGetRelationName(rel))));

	if (nbuf != InvalidBuffer)
		_bt_relbuf(rel, nbuf);

	return InvalidTransactionId;
}
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
BlockNumber btpo_next
Definition: nbtree.h:58
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:77
int errhint(const char *fmt,...)
Definition: elog.c:974
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool bounds_valid
Definition: nbtree.h:504
uint32 TransactionId
Definition: c.h:507
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3423
#define RelationGetDescr(relation)
Definition: rel.h:442
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
ItemPointer scantid
Definition: nbtree.h:472
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
#define InvalidBuffer
Definition: buf.h:25
OffsetNumber stricthigh
Definition: nbtree.h:506
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 BlockNumber
Definition: block.h:31
OffsetNumber low
Definition: nbtree.h:505
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4442
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5222
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
#define SnapshotSelf
Definition: snapmgr.h:69
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:438
IndexTupleData * IndexTuple
Definition: itup.h:53
int errdetail(const char *fmt,...)
Definition: elog.c:860
#define InvalidTransactionId
Definition: transam.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:450
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:426
TransactionId xmax
Definition: snapshot.h:158
TransactionId xmin
Definition: snapshot.h:157
IndexTuple itup
Definition: nbtree.h:492
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:552
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:201
#define ereport(elevel, rest)
Definition: elog.h:141
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
bool anynullkeys
Definition: nbtree.h:469
uintptr_t Datum
Definition: postgres.h:367
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
uint32 speculativeToken
Definition: snapshot.h:193
BTScanInsert itup_key
Definition: nbtree.h:494
#define Assert(condition)
Definition: c.h:732
#define INDEX_MAX_KEYS
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define P_HIKEY
Definition: nbtree.h:217
static Datum values[MAXATTR]
Definition: bootstrap.c:167
int errmsg(const char *fmt,...)
Definition: elog.c:784
#define elog(elevel,...)
Definition: elog.h:226
char * BuildIndexValueDescription(Relation indexRelation, Datum *values, bool *isnull)
Definition: genam.c:176
#define TransactionIdIsValid(xid)
Definition: transam.h:41
uint16 btpo_flags
Definition: nbtree.h:64
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_doinsert()

/*
 *	_bt_doinsert() -- Handle insertion of a single index tuple in the tree.
 *
 *		This routine is called by the public interface routine, btinsert.
 *		By here, itup is filled in, including the TID.
 *
 *		If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
 *		will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
 *		UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
 *		For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
 *		don't actually insert.
 *
 *		The result value is only significant for UNIQUE_CHECK_PARTIAL:
 *		it must be true if the entry is known unique, else false.
 *		(In the current implementation we'll also return true after a
 *		successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
 *		that's just a coding artifact.)
 */
bool
_bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, Relation heapRel)
{
	bool		is_unique = false;
	BTInsertStateData insertstate;
	BTScanInsert itup_key;
	BTStack		stack = NULL;
	Buffer		buf;
	bool		fastpath;
	bool		checkingunique = (checkUnique != UNIQUE_CHECK_NO);

	/* we need an insertion scan key to do our search, so build one */
	itup_key = _bt_mkscankey(rel, itup);

	if (checkingunique)
	{
		if (!itup_key->anynullkeys)
		{
			/* No (heapkeyspace) scantid until uniqueness established */
			itup_key->scantid = NULL;
		}
		else
		{
			/*
			 * Scan key for new tuple contains NULL key values.  Bypass
			 * checkingunique steps.  They are unnecessary because core code
			 * considers NULL unequal to every value, including NULL.
			 *
			 * This optimization avoids O(N^2) behavior within the
			 * _bt_findinsertloc() heapkeyspace path when a unique index has a
			 * large number of "duplicates" with NULL key values.
			 */
			checkingunique = false;
			/* Tuple is unique in the sense that core code cares about */
			Assert(checkUnique != UNIQUE_CHECK_EXISTING);
			is_unique = true;
		}
	}

	/*
	 * Fill in the BTInsertState working area, to track the current page and
	 * position within the page to insert on
	 */
	insertstate.itup = itup;
	/* PageAddItem will MAXALIGN(), but be consistent */
	insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
	insertstate.itup_key = itup_key;
	insertstate.bounds_valid = false;
	insertstate.buf = InvalidBuffer;

	/*
	 * It's very common to have an index on an auto-incremented or
	 * monotonically increasing value. In such cases, every insertion happens
	 * towards the end of the index. We try to optimize that case by caching
	 * the right-most leaf of the index. If our cached block is still the
	 * rightmost leaf, has enough free space to accommodate a new entry and
	 * the insertion key is strictly greater than the first key in this page,
	 * then we can safely conclude that the new key will be inserted in the
	 * cached block. So we simply search within the cached block and insert
	 * the key at the appropriate location. We call it a fastpath.
	 *
	 * Testing has revealed, though, that the fastpath can result in increased
	 * contention on the exclusive-lock on the rightmost leaf page. So we
	 * conditionally check if the lock is available. If it's not available
	 * then we simply abandon the fastpath and take the regular path. This
	 * makes sense because unavailability of the lock also signals that some
	 * other backend might be concurrently inserting into the page, thus
	 * reducing our chances to finding an insertion place in this page.
	 */
top:
	fastpath = false;
	if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
	{
		Page		page;
		BTPageOpaque lpageop;

		/*
		 * Conditionally acquire exclusive lock on the buffer before doing any
		 * checks. If we don't get the lock, we simply follow slowpath. If we
		 * do get the lock, this ensures that the index state cannot change,
		 * as far as the rightmost part of the index is concerned.
		 */
		buf = ReadBuffer(rel, RelationGetTargetBlock(rel));

		if (ConditionalLockBuffer(buf))
		{
			_bt_checkpage(rel, buf);

			page = BufferGetPage(buf);

			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

			/*
			 * Check if the page is still the rightmost leaf page, has enough
			 * free space to accommodate the new tuple, and the insertion scan
			 * key is strictly greater than the first key on the page.
			 */
			if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
				!P_IGNORE(lpageop) &&
				(PageGetFreeSpace(page) > insertstate.itemsz) &&
				PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
				_bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
			{
				/*
				 * The right-most block should never have an incomplete split.
				 * But be paranoid and check for it anyway.
				 */
				Assert(!P_INCOMPLETE_SPLIT(lpageop));
				fastpath = true;
			}
			else
			{
				_bt_relbuf(rel, buf);

				/*
				 * Something did not work out. Just forget about the cached
				 * block and follow the normal path. It might be set again if
				 * the conditions are favourable.
				 */
				RelationSetTargetBlock(rel, InvalidBlockNumber);
			}
		}
		else
		{
			ReleaseBuffer(buf);

			/*
			 * If someone's holding a lock, it's likely to change anyway, so
			 * don't try again until we get an updated rightmost leaf.
			 */
			RelationSetTargetBlock(rel, InvalidBlockNumber);
		}
	}

	if (!fastpath)
	{
		/*
		 * Find the first page containing this key.  Buffer returned by
		 * _bt_search() is locked in exclusive mode.
		 */
		stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
	}

	insertstate.buf = buf;
	buf = InvalidBuffer;		/* insertstate.buf now owns the buffer */

	/*
	 * If we're not allowing duplicates, make sure the key isn't already in
	 * the index.
	 *
	 * NOTE: obviously, _bt_check_unique can only detect keys that are already
	 * in the index; so it cannot defend against concurrent insertions of the
	 * same key.  We protect against that by means of holding a write lock on
	 * the first page the value could be on, with omitted/-inf value for the
	 * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
	 * the same key must acquire a write lock on the same page, so only one
	 * would-be inserter can be making the check at one time.  Furthermore,
	 * once we are past the check we hold write locks continuously until we
	 * have performed our insertion, so no later inserter can fail to see our
	 * insertion.  (This requires some care in _bt_findinsertloc.)
	 *
	 * If we must wait for another xact, we release the lock while waiting,
	 * and then must start over completely.
	 *
	 * For a partial uniqueness check, we don't wait for the other xact. Just
	 * let the tuple in and return false for possibly non-unique, or true for
	 * definitely unique.
	 */
	if (checkingunique)
	{
		TransactionId xwait;
		uint32		speculativeToken;

		xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
								 &is_unique, &speculativeToken);

		if (TransactionIdIsValid(xwait))
		{
			/* Have to wait for the other guy ... */
			_bt_relbuf(rel, insertstate.buf);
			insertstate.buf = InvalidBuffer;

			/*
			 * If it's a speculative insertion, wait for it to finish (ie. to
			 * go ahead with the insertion, or kill the tuple).  Otherwise
			 * wait for the transaction to finish as usual.
			 */
			if (speculativeToken)
				SpeculativeInsertionWait(xwait, speculativeToken);
			else
				XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);

			/* start over... */
			if (stack)
				_bt_freestack(stack);
			goto top;
		}

		/* Uniqueness is established -- restore heap tid as scantid */
		if (itup_key->heapkeyspace)
			itup_key->scantid = &itup->t_tid;
	}

	if (checkUnique != UNIQUE_CHECK_EXISTING)
	{
		OffsetNumber newitemoff;

		/*
		 * The only conflict predicate locking cares about for indexes is when
		 * an index tuple insert conflicts with an existing lock.  We don't
		 * know the actual page we're going to insert on for sure just yet in
		 * checkingunique and !heapkeyspace cases, but it's okay to use the
		 * first page the value could be on (with scantid omitted) instead.
		 */
		CheckForSerializableConflictIn(rel, NULL, insertstate.buf);

		/*
		 * Do the insertion.  Note that insertstate contains cached binary
		 * search bounds established within _bt_check_unique when insertion is
		 * checkingunique.
		 */
		newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
									   stack, heapRel);
		_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
					   itup, newitemoff, false);
	}
	else
	{
		/* just release the buffer */
		_bt_relbuf(rel, insertstate.buf);
	}

	/* be tidy */
	if (stack)
		_bt_freestack(stack);
	pfree(itup_key);

	return is_unique;
}
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:932
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool bounds_valid
Definition: nbtree.h:504
uint32 TransactionId
Definition: c.h:507
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:85
ItemPointer scantid
Definition: nbtree.h:472
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:683
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3353
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
#define RelationGetTargetBlock(relation)
Definition: rel.h:501
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4442
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:581
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1031
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:690
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:781
static char * buf
Definition: pg_test_fsync.c:68
unsigned int uint32
Definition: c.h:358
IndexTuple itup
Definition: nbtree.h:492
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:552
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3616
bool anynullkeys
Definition: nbtree.h:469
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:624
BTScanInsert itup_key
Definition: nbtree.h:494
#define Assert(condition)
Definition: c.h:732
bool heapkeyspace
Definition: nbtree.h:468
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:596
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:508
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define MAXALIGN(LEN)
Definition: c.h:685
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:343
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_findinsertloc()

static OffsetNumber _bt_findinsertloc ( Relation  rel,
BTInsertState  insertstate,
bool  checkingunique,
BTStack  stack,
Relation  heapRel 
)
static

Definition at line 683 of file nbtinsert.c.

References _bt_binsrch_insert(), _bt_check_third_page(), _bt_compare(), _bt_stepright(), _bt_vacuum_one_page(), Assert, BTInsertStateData::bounds_valid, BTMaxItemSize, BTInsertStateData::buf, BufferGetPage, BTScanInsertData::heapkeyspace, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MAX_RANDOM_VALUE, P_HAS_GARBAGE, P_HIKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, random(), BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_doinsert().

688 {
689  BTScanInsert itup_key = insertstate->itup_key;
690  Page page = BufferGetPage(insertstate->buf);
691  BTPageOpaque lpageop;
692 
693  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
694 
695  /* Check 1/3 of a page restriction */
696  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
697  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
698  insertstate->itup);
699 
700  Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
701  Assert(!insertstate->bounds_valid || checkingunique);
702  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
703  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
704 
705  if (itup_key->heapkeyspace)
706  {
707  /*
708  * If we're inserting into a unique index, we may have to walk right
709  * through leaf pages to find the one leaf page that we must insert on
710  * to.
711  *
712  * This is needed for checkingunique callers because a scantid was not
713  * used when we called _bt_search(). scantid can only be set after
714  * _bt_check_unique() has checked for duplicates. The buffer
715  * initially stored in insertstate->buf has the page where the first
716  * duplicate key might be found, which isn't always the page that new
717  * tuple belongs on. The heap TID attribute for new tuple (scantid)
718  * could force us to insert on a sibling page, though that should be
719  * very rare in practice.
720  */
721  if (checkingunique)
722  {
723  for (;;)
724  {
725  /*
726  * Does the new tuple belong on this page?
727  *
728  * The earlier _bt_check_unique() call may well have
729  * established a strict upper bound on the offset for the new
730  * item. If it's not the last item of the page (i.e. if there
731  * is at least one tuple on the page that goes after the tuple
732  * we're inserting) then we know that the tuple belongs on
733  * this page. We can skip the high key check.
734  */
735  if (insertstate->bounds_valid &&
736  insertstate->low <= insertstate->stricthigh &&
737  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
738  break;
739 
740  /* Test '<=', not '!=', since scantid is set now */
741  if (P_RIGHTMOST(lpageop) ||
742  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
743  break;
744 
745  _bt_stepright(rel, insertstate, stack);
746  /* Update local state after stepping right */
747  page = BufferGetPage(insertstate->buf);
748  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
749  }
750  }
751 
752  /*
753  * If the target page is full, see if we can obtain enough space by
754  * erasing LP_DEAD items
755  */
756  if (PageGetFreeSpace(page) < insertstate->itemsz &&
757  P_HAS_GARBAGE(lpageop))
758  {
759  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
760  insertstate->bounds_valid = false;
761  }
762  }
763  else
764  {
765  /*----------
766  * This is a !heapkeyspace (version 2 or 3) index. The current page
767  * is the first page that we could insert the new tuple to, but there
768  * may be other pages to the right that we could opt to use instead.
769  *
770  * If the new key is equal to one or more existing keys, we can
771  * legitimately place it anywhere in the series of equal keys. In
772  * fact, if the new key is equal to the page's "high key" we can place
773  * it on the next page. If it is equal to the high key, and there's
774  * not room to insert the new tuple on the current page without
775  * splitting, then we move right hoping to find more free space and
776  * avoid a split.
777  *
778  * Keep scanning right until we
779  * (a) find a page with enough free space,
780  * (b) reach the last page where the tuple can legally go, or
781  * (c) get tired of searching.
782  * (c) is not flippant; it is important because if there are many
783  * pages' worth of equal keys, it's better to split one of the early
784  * pages than to scan all the way to the end of the run of equal keys
785  * on every insert. We implement "get tired" as a random choice,
786  * since stopping after scanning a fixed number of pages wouldn't work
787  * well (we'd never reach the right-hand side of previously split
788  * pages). The probability of moving right is set at 0.99, which may
789  * seem too high to change the behavior much, but it does an excellent
790  * job of preventing O(N^2) behavior with many equal keys.
791  *----------
792  */
793  while (PageGetFreeSpace(page) < insertstate->itemsz)
794  {
795  /*
796  * Before considering moving right, see if we can obtain enough
797  * space by erasing LP_DEAD items
798  */
799  if (P_HAS_GARBAGE(lpageop))
800  {
801  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
802  insertstate->bounds_valid = false;
803 
804  if (PageGetFreeSpace(page) >= insertstate->itemsz)
805  break; /* OK, now we have enough space */
806  }
807 
808  /*
809  * Nope, so check conditions (b) and (c) enumerated above
810  *
811  * The earlier _bt_check_unique() call may well have established a
812  * strict upper bound on the offset for the new item. If it's not
813  * the last item of the page (i.e. if there is at least one tuple
814  * on the page that's greater than the tuple we're inserting to)
815  * then we know that the tuple belongs on this page. We can skip
816  * the high key check.
817  */
818  if (insertstate->bounds_valid &&
819  insertstate->low <= insertstate->stricthigh &&
820  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
821  break;
822 
823  if (P_RIGHTMOST(lpageop) ||
824  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
825  random() <= (MAX_RANDOM_VALUE / 100))
826  break;
827 
828  _bt_stepright(rel, insertstate, stack);
829  /* Update local state after stepping right */
830  page = BufferGetPage(insertstate->buf);
831  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
832  }
833  }
834 
835  /*
836  * We should now be on the correct page. Find the offset within the page
837  * for the new tuple. (Possibly reusing earlier search bounds.)
838  */
839  Assert(P_RIGHTMOST(lpageop) ||
840  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
841 
842  return _bt_binsrch_insert(rel, insertstate);
843 }
bool bounds_valid
Definition: nbtree.h:504
ItemPointer scantid
Definition: nbtree.h:472
long random(void)
Definition: random.c:22
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:195
OffsetNumber stricthigh
Definition: nbtree.h:506
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:858
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
OffsetNumber low
Definition: nbtree.h:505
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:581
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
Definition: nbtinsert.c:2269
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:438
#define MAX_RANDOM_VALUE
IndexTuple itup
Definition: nbtree.h:492
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:552
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
BTScanInsert itup_key
Definition: nbtree.h:494
#define Assert(condition)
Definition: c.h:732
bool heapkeyspace
Definition: nbtree.h:468
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:2532
#define BTMaxItemSize(page)
Definition: nbtree.h:147
#define P_HIKEY
Definition: nbtree.h:217
#define unlikely(x)
Definition: c.h:208
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 1856 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

1857 {
1858  Page lpage = BufferGetPage(lbuf);
1859  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1860  Buffer rbuf;
1861  Page rpage;
1862  BTPageOpaque rpageop;
1863  bool was_root;
1864  bool was_only;
1865 
1866  Assert(P_INCOMPLETE_SPLIT(lpageop));
1867 
1868  /* Lock right sibling, the one missing the downlink */
1869  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1870  rpage = BufferGetPage(rbuf);
1871  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1872 
1873  /* Could this be a root split? */
1874  if (!stack)
1875  {
1876  Buffer metabuf;
1877  Page metapg;
1878  BTMetaPageData *metad;
1879 
1880  /* acquire lock on the metapage */
1881  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1882  metapg = BufferGetPage(metabuf);
1883  metad = BTPageGetMeta(metapg);
1884 
1885  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1886 
1887  _bt_relbuf(rel, metabuf);
1888  }
1889  else
1890  was_root = false;
1891 
1892  /* Was this the only page on the level before split? */
1893  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1894 
1895  elog(DEBUG1, "finishing incomplete split of %u/%u",
1896  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1897 
1898  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1899 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1744
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define BTREE_METAPAGE
Definition: nbtree.h:131
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 1932 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_branch_parent().

1933 {
1934  BlockNumber blkno;
1935  OffsetNumber start;
1936 
1937  blkno = stack->bts_blkno;
1938  start = stack->bts_offset;
1939 
1940  for (;;)
1941  {
1942  Buffer buf;
1943  Page page;
1944  BTPageOpaque opaque;
1945 
1946  buf = _bt_getbuf(rel, blkno, BT_WRITE);
1947  page = BufferGetPage(buf);
1948  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1949 
1950  if (P_INCOMPLETE_SPLIT(opaque))
1951  {
1952  _bt_finish_split(rel, buf, stack->bts_parent);
1953  continue;
1954  }
1955 
1956  if (!P_IGNORE(opaque))
1957  {
1958  OffsetNumber offnum,
1959  minoff,
1960  maxoff;
1961  ItemId itemid;
1962  IndexTuple item;
1963 
1964  minoff = P_FIRSTDATAKEY(opaque);
1965  maxoff = PageGetMaxOffsetNumber(page);
1966 
1967  /*
1968  * start = InvalidOffsetNumber means "search the whole page". We
1969  * need this test anyway due to possibility that page has a high
1970  * key now when it didn't before.
1971  */
1972  if (start < minoff)
1973  start = minoff;
1974 
1975  /*
1976  * Need this check too, to guard against possibility that page
1977  * split since we visited it originally.
1978  */
1979  if (start > maxoff)
1980  start = OffsetNumberNext(maxoff);
1981 
1982  /*
1983  * These loops will check every item on the page --- but in an
1984  * order that's attuned to the probability of where it actually
1985  * is. Scan to the right first, then to the left.
1986  */
1987  for (offnum = start;
1988  offnum <= maxoff;
1989  offnum = OffsetNumberNext(offnum))
1990  {
1991  itemid = PageGetItemId(page, offnum);
1992  item = (IndexTuple) PageGetItem(page, itemid);
1993 
1994  if (BTreeInnerTupleGetDownLink(item) == child)
1995  {
1996  /* Return accurate pointer to where link is now */
1997  stack->bts_blkno = blkno;
1998  stack->bts_offset = offnum;
1999  return buf;
2000  }
2001  }
2002 
2003  for (offnum = OffsetNumberPrev(start);
2004  offnum >= minoff;
2005  offnum = OffsetNumberPrev(offnum))
2006  {
2007  itemid = PageGetItemId(page, offnum);
2008  item = (IndexTuple) PageGetItem(page, itemid);
2009 
2010  if (BTreeInnerTupleGetDownLink(item) == child)
2011  {
2012  /* Return accurate pointer to where link is now */
2013  stack->bts_blkno = blkno;
2014  stack->bts_offset = offnum;
2015  return buf;
2016  }
2017  }
2018  }
2019 
2020  /*
2021  * The item we're looking for moved right at least one page.
2022  */
2023  if (P_RIGHTMOST(opaque))
2024  {
2025  _bt_relbuf(rel, buf);
2026  return InvalidBuffer;
2027  }
2028  blkno = opaque->btpo_next;
2029  start = InvalidOffsetNumber;
2030  _bt_relbuf(rel, buf);
2031  }
2032 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1856
OffsetNumber bts_offset
Definition: nbtree.h:415
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:414
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
struct BTStackData * bts_parent
Definition: nbtree.h:416
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_insert_parent()

static void _bt_insert_parent ( Relation  rel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  is_root,
bool  is_only 
)
static

Definition at line 1744 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BTPageOpaqueData::btpo, BTreeInnerTupleSetDownLink, BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, InvalidOffsetNumber, BTPageOpaqueData::level, P_HIKEY, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), and RelationGetRelationName.

Referenced by _bt_finish_split(), and _bt_insertonpg().

1750 {
1751  /*
1752  * Here we have to do something Lehman and Yao don't talk about: deal with
1753  * a root split and construction of a new root. If our stack is empty
1754  * then we have just split a node on what had been the root level when we
1755  * descended the tree. If it was still the root then we perform a
1756  * new-root construction. If it *wasn't* the root anymore, search to find
1757  * the next higher level that someone constructed meanwhile, and find the
1758  * right place to insert as for the normal case.
1759  *
1760  * If we have to search for the parent level, we do so by re-descending
1761  * from the root. This is not super-efficient, but it's rare enough not
1762  * to matter.
1763  */
1764  if (is_root)
1765  {
1766  Buffer rootbuf;
1767 
1768  Assert(stack == NULL);
1769  Assert(is_only);
1770  /* create a new root node and update the metapage */
1771  rootbuf = _bt_newroot(rel, buf, rbuf);
1772  /* release the split buffers */
1773  _bt_relbuf(rel, rootbuf);
1774  _bt_relbuf(rel, rbuf);
1775  _bt_relbuf(rel, buf);
1776  }
1777  else
1778  {
1780  BlockNumber bknum = BufferGetBlockNumber(buf);
1781  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1781  Page page = BufferGetPage(buf);
1782  IndexTuple new_item;
1783  BTStackData fakestack;
1784  IndexTuple ritem;
1785  Buffer pbuf;
1786 
1787  if (stack == NULL)
1788  {
1789  BTPageOpaque lpageop;
1790 
1791  elog(DEBUG2, "concurrent ROOT page split");
1792  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1793  /* Find the leftmost page at the next level up */
1794  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1795  NULL);
1796  /* Set up a phony stack entry pointing there */
1797  stack = &fakestack;
1798  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1799  stack->bts_offset = InvalidOffsetNumber;
1800  stack->bts_parent = NULL;
1801  _bt_relbuf(rel, pbuf);
1802  }
1803 
1804  /* get high key from left, a strict lower bound for new right page */
1805  ritem = (IndexTuple) PageGetItem(page,
1806  PageGetItemId(page, P_HIKEY));
1807 
1808  /* form an index tuple that points at the new right page */
1809  new_item = CopyIndexTuple(ritem);
1810  BTreeInnerTupleSetDownLink(new_item, rbknum);
1811 
1812  /*
1813  * Re-find and write lock the parent of buf.
1814  *
1815  * It's possible that the location of buf's downlink has changed since
1816  * our initial _bt_search() descent. _bt_getstackbuf() will detect
1817  * and recover from this, updating the stack, which ensures that the
1818  * new downlink will be inserted at the correct offset. Even buf's
1819  * parent may have changed.
1820  */
1821  pbuf = _bt_getstackbuf(rel, stack, bknum);
1822 
1823  /*
1824  * Now we can unlock the right child. The left child will be unlocked
1825  * by _bt_insertonpg().
1826  */
1827  _bt_relbuf(rel, rbuf);
1828 
1829  if (pbuf == InvalidBuffer)
1830  ereport(ERROR,
1831  (errcode(ERRCODE_INDEX_CORRUPTED),
1832  errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
1833  RelationGetRelationName(rel), bknum, rbknum)));
1834 
1835  /* Recursively insert into the parent */
1836  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
1837  new_item, stack->bts_offset + 1,
1838  is_only);
1839 
1840  /* be tidy */
1841  pfree(new_item);
1842  }
1843 }
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:932
union BTPageOpaqueData::@46 btpo
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 BlockNumber
Definition: block.h:31
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2053
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:2057
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ERROR
Definition: elog.h:43
#define DEBUG2
Definition: elog.h:24
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:502
OffsetNumber bts_offset
Definition: nbtree.h:415
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:61
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:414
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
int errmsg_internal(const char *fmt,...)
Definition: elog.c:814
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
struct BTStackData * bts_parent
Definition: nbtree.h:416
#define BTreeInnerTupleSetDownLink(itup, blkno)
Definition: nbtree.h:303
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
int Buffer
Definition: buf.h:23
Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:1932
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
OffsetNumber  newitemoff,
bool  split_only_page 
)
static

Definition at line 932 of file nbtinsert.c.

References _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_pgaddtup(), _bt_relbuf(), _bt_split(), _bt_upgrademetapage(), Assert, BlockNumberIsValid, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, xl_btree_metadata::oldest_btpo_xact, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageGetFreeSpace(), PageGetSpecialPointer, PageSetLSN, PANIC, PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationGetTargetBlock, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().

940 {
941  Page page;
942  BTPageOpaque lpageop;
943  Size itemsz;
944 
945  page = BufferGetPage(buf);
946  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
947 
948  /* child buffer must be given iff inserting on an internal page */
949  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
950  /* tuple must have appropriate number of attributes */
951  Assert(!P_ISLEAF(lpageop) ||
952  BTreeTupleGetNAtts(itup, rel) ==
953  IndexRelationGetNumberOfAttributes(rel));
954  Assert(P_ISLEAF(lpageop) ||
955  BTreeTupleGetNAtts(itup, rel) <=
956  IndexRelationGetNumberOfKeyAttributes(rel));
957 
958  /* The caller should've finished any incomplete splits already. */
959  if (P_INCOMPLETE_SPLIT(lpageop))
960  elog(ERROR, "cannot insert to incompletely split page %u",
961  BufferGetBlockNumber(buf));
962 
963  itemsz = IndexTupleSize(itup);
964  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
965  * need to be consistent */
966 
967  /*
968  * Do we need to split the page to fit the item on it?
969  *
970  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
971  * so this comparison is correct even though we appear to be accounting
972  * only for the item and not for its line pointer.
973  */
974  if (PageGetFreeSpace(page) < itemsz)
975  {
976  bool is_root = P_ISROOT(lpageop);
977  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
978  Buffer rbuf;
979 
980  /*
981  * If we're here then a pagesplit is needed. We should never reach
982  * here if we're using the fastpath since we should have checked for
983  * all the required conditions, including the fact that this page has
984  * enough freespace. Note that this routine can in theory deal with
985  * the situation where a NULL stack pointer is passed (that's what
986  * would happen if the fastpath is taken). But that path is much
987  * slower, defeating the very purpose of the optimization. The
988  * following assertion should protect us from any future code changes
989  * that invalidate those assumptions.
990  *
991  * Note that whenever we fail to take the fastpath, we clear the
992  * cached block. Checking for a valid cached block at this point is
993  * enough to decide whether we're in a fastpath or not.
994  */
995  Assert(!(P_ISLEAF(lpageop) &&
996  BlockNumberIsValid(RelationGetTargetBlock(rel))));
997 
998  /* split the buffer into left and right halves */
999  rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
1000  PredicateLockPageSplit(rel,
1001  BufferGetBlockNumber(buf),
1002  BufferGetBlockNumber(rbuf));
1003 
1004  /*----------
1005  * By here,
1006  *
1007  * + our target page has been split;
1008  * + the original tuple has been inserted;
1009  * + we have write locks on both the old (left half)
1010  * and new (right half) buffers, after the split; and
1011  * + we know the key we want to insert into the parent
1012  * (it's the "high key" on the left child page).
1013  *
1014  * We're ready to do the parent insertion. We need to hold onto the
1015  * locks for the child pages until we locate the parent, but we can
1016  * at least release the lock on the right child before doing the
1017  * actual insertion. The lock on the left child will be released
1018  * last of all by parent insertion, where it is the 'cbuf' of parent
1019  * page.
1020  *----------
1021  */
1022  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
1023  }
1024  else
1025  {
1026  Buffer metabuf = InvalidBuffer;
1027  Page metapg = NULL;
1028  BTMetaPageData *metad = NULL;
1029  OffsetNumber itup_off;
1030  BlockNumber itup_blkno;
1031  BlockNumber cachedBlock = InvalidBlockNumber;
1032 
1033  itup_off = newitemoff;
1034  itup_blkno = BufferGetBlockNumber(buf);
1035 
1036  /*
1037  * If we are doing this insert because we split a page that was the
1038  * only one on its tree level, but was not the root, it may have been
1039  * the "fast root". We need to ensure that the fast root link points
1040  * at or above the current page. We can safely acquire a lock on the
1041  * metapage here --- see comments for _bt_newroot().
1042  */
1043  if (split_only_page)
1044  {
1045  Assert(!P_ISLEAF(lpageop));
1046 
1047  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1048  metapg = BufferGetPage(metabuf);
1049  metad = BTPageGetMeta(metapg);
1050 
1051  if (metad->btm_fastlevel >= lpageop->btpo.level)
1052  {
1053  /* no update wanted */
1054  _bt_relbuf(rel, metabuf);
1055  metabuf = InvalidBuffer;
1056  }
1057  }
1058 
1059  /*
1060  * Every internal page should have exactly one negative infinity item
1061  * at all times. Only _bt_split() and _bt_newroot() should add items
1062  * that become negative infinity items through truncation, since
1063  * they're the only routines that allocate new internal pages. Do not
1064  * allow a retail insertion of a new item at the negative infinity
1065  * offset.
1066  */
1067  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
1068  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
1069  itup_blkno, RelationGetRelationName(rel));
1070 
1071  /* Do the update. No ereport(ERROR) until changes are logged */
1072  START_CRIT_SECTION();
1073 
1074  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
1075  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1076  itup_blkno, RelationGetRelationName(rel));
1077 
1078  MarkBufferDirty(buf);
1079 
1080  if (BufferIsValid(metabuf))
1081  {
1082  /* upgrade meta-page if needed */
1083  if (metad->btm_version < BTREE_NOVAC_VERSION)
1084  _bt_upgrademetapage(metapg);
1085  metad->btm_fastroot = itup_blkno;
1086  metad->btm_fastlevel = lpageop->btpo.level;
1087  MarkBufferDirty(metabuf);
1088  }
1089 
1090  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
1091  if (BufferIsValid(cbuf))
1092  {
1093  Page cpage = BufferGetPage(cbuf);
1094  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1095 
1096  Assert(P_INCOMPLETE_SPLIT(cpageop));
1097  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1098  MarkBufferDirty(cbuf);
1099  }
1100 
1101  /*
1102  * Cache the block information if we just inserted into the rightmost
1103  * leaf page of the index and it's not the root page. For very small
1104  * index where root is also the leaf, there is no point trying for any
1105  * optimization.
1106  */
1107  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1108  cachedBlock = BufferGetBlockNumber(buf);
1109 
1110  /* XLOG stuff */
1111  if (RelationNeedsWAL(rel))
1112  {
1113  xl_btree_insert xlrec;
1114  xl_btree_metadata xlmeta;
1115  uint8 xlinfo;
1116  XLogRecPtr recptr;
1117 
1118  xlrec.offnum = itup_off;
1119 
1120  XLogBeginInsert();
1121  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1122 
1123  if (P_ISLEAF(lpageop))
1124  xlinfo = XLOG_BTREE_INSERT_LEAF;
1125  else
1126  {
1127  /*
1128  * Register the left child whose INCOMPLETE_SPLIT flag was
1129  * cleared.
1130  */
1131  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1132 
1133  xlinfo = XLOG_BTREE_INSERT_UPPER;
1134  }
1135 
1136  if (BufferIsValid(metabuf))
1137  {
1139  xlmeta.version = metad->btm_version;
1140  xlmeta.root = metad->btm_root;
1141  xlmeta.level = metad->btm_level;
1142  xlmeta.fastroot = metad->btm_fastroot;
1143  xlmeta.fastlevel = metad->btm_fastlevel;
1144  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1145  xlmeta.last_cleanup_num_heap_tuples =
1146  metad->btm_last_cleanup_num_heap_tuples;
1147 
1148  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
1149  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1150 
1151  xlinfo = XLOG_BTREE_INSERT_META;
1152  }
1153 
1154  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1155  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1156 
1157  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1158 
1159  if (BufferIsValid(metabuf))
1160  {
1161  PageSetLSN(metapg, recptr);
1162  }
1163  if (BufferIsValid(cbuf))
1164  {
1165  PageSetLSN(BufferGetPage(cbuf), recptr);
1166  }
1167 
1168  PageSetLSN(page, recptr);
1169  }
1170 
1171  END_CRIT_SECTION();
1172 
1173  /* release buffers */
1174  if (BufferIsValid(metabuf))
1175  _bt_relbuf(rel, metabuf);
1176  if (BufferIsValid(cbuf))
1177  _bt_relbuf(rel, cbuf);
1178  _bt_relbuf(rel, buf);
1179 
1180  /*
1181  * If we decided to cache the insertion target block, then set it now.
1182  * But before that, check for the height of the tree and don't go for
1183  * the optimization for small indexes. We defer that check to this
1184  * point to ensure that we don't call _bt_getrootheight while holding
1185  * lock on any other block.
1186  *
1187  * We do this after dropping locks on all buffers. So the information
1188  * about whether the insertion block is still the rightmost block or
1189  * not may have changed in between. But we will deal with that during
1190  * next insert operation. No special care is required while setting
1191  * it.
1192  */
1193  if (BlockNumberIsValid(cachedBlock) &&
1194  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1195  RelationSetTargetBlock(rel, cachedBlock);
1196  }
1197 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:88
uint32 btm_version
Definition: nbtree.h:100
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:584
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BlockNumber root
Definition: nbtxlog.h:50
unsigned char uint8
Definition: c.h:356
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:28
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:78
uint32 BlockNumber
Definition: block.h:31
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1744
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
#define PANIC
Definition: elog.h:53
#define RelationGetTargetBlock(relation)
Definition: rel.h:501
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:581
uint16 OffsetNumber
Definition: off.h:24
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
Definition: nbtinsert.c:1216
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define ERROR
Definition: elog.h:43
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:26
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
#define BTPageGetMeta(p)
Definition: nbtree.h:112
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:428
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:435
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define BTREE_METAPAGE
Definition: nbtree.h:131
#define P_ISROOT(opaque)
Definition: nbtree.h:190
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:29
uint32 version
Definition: nbtxlog.h:49
uint32 btm_fastlevel
Definition: nbtree.h:104
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
Definition: nbtinsert.c:2236
OffsetNumber offnum
Definition: nbtxlog.h:70
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:508
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
#define MAXALIGN(LEN)
Definition: c.h:685
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define RelationNeedsWAL(relation)
Definition: rel.h:518
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:27
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:102
#define elog(elevel,...)
Definition: elog.h:226
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
#define BT_WRITE
Definition: nbtree.h:403
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define SizeOfBtreeInsert
Definition: nbtxlog.h:73
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3118
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_newroot()

static Buffer _bt_newroot ( Relation  rel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 2053 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), Assert, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeInnerTupleSetDownLink, BTreeTupleGetNAtts, BTreeTupleSetNAtts, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, MarkBufferDirty(), xl_btree_metadata::oldest_btpo_xact, P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NEW, P_NONE, PageAddItem, PageGetItem, PageGetItemId, PageGetSpecialPointer, PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().

2054 {
2055  Buffer rootbuf;
2056  Page lpage,
2057  rootpage;
2058  BlockNumber lbkno,
2059  rbkno;
2060  BlockNumber rootblknum;
2061  BTPageOpaque rootopaque;
2062  BTPageOpaque lopaque;
2063  ItemId itemid;
2064  IndexTuple item;
2065  IndexTuple left_item;
2066  Size left_item_sz;
2067  IndexTuple right_item;
2068  Size right_item_sz;
2069  Buffer metabuf;
2070  Page metapg;
2071  BTMetaPageData *metad;
2072 
2073  lbkno = BufferGetBlockNumber(lbuf);
2074  rbkno = BufferGetBlockNumber(rbuf);
2075  lpage = BufferGetPage(lbuf);
2076  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2077 
2078  /* get a new root page */
2079  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2080  rootpage = BufferGetPage(rootbuf);
2081  rootblknum = BufferGetBlockNumber(rootbuf);
2082 
2083  /* acquire lock on the metapage */
2084  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2085  metapg = BufferGetPage(metabuf);
2086  metad = BTPageGetMeta(metapg);
2087 
2088  /*
2089  * Create downlink item for left page (old root). Since this will be the
2090  * first item in a non-leaf page, it implicitly has minus-infinity key
2091  * value, so we need not store any actual key in it.
2092  */
2093  left_item_sz = sizeof(IndexTupleData);
2094  left_item = (IndexTuple) palloc(left_item_sz);
2095  left_item->t_info = left_item_sz;
2096  BTreeInnerTupleSetDownLink(left_item, lbkno);
2097  BTreeTupleSetNAtts(left_item, 0);
2098 
2099  /*
2100  * Create downlink item for right page. The key for it is obtained from
2101  * the "high key" position in the left page.
2102  */
2103  itemid = PageGetItemId(lpage, P_HIKEY);
2104  right_item_sz = ItemIdGetLength(itemid);
2105  item = (IndexTuple) PageGetItem(lpage, itemid);
2106  right_item = CopyIndexTuple(item);
2107  BTreeInnerTupleSetDownLink(right_item, rbkno);
2108 
2109  /* NO EREPORT(ERROR) from here till newroot op is logged */
2111 
2112  /* upgrade metapage if needed */
2113  if (metad->btm_version < BTREE_NOVAC_VERSION)
2114  _bt_upgrademetapage(metapg);
2115 
2116  /* set btree special data */
2117  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2118  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2119  rootopaque->btpo_flags = BTP_ROOT;
2120  rootopaque->btpo.level =
2121  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2122  rootopaque->btpo_cycleid = 0;
2123 
2124  /* update metapage data */
2125  metad->btm_root = rootblknum;
2126  metad->btm_level = rootopaque->btpo.level;
2127  metad->btm_fastroot = rootblknum;
2128  metad->btm_fastlevel = rootopaque->btpo.level;
2129 
2130  /*
2131  * Insert the left page pointer into the new root page. The root page is
2132  * the rightmost page on its level so there is no "high key" in it; the
2133  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2134  *
2135  * Note: we *must* insert the two items in item-number order, for the
2136  * benefit of _bt_restore_page().
2137  */
2138  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2139  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2140  false, false) == InvalidOffsetNumber)
2141  elog(PANIC, "failed to add leftkey to new root page"
2142  " while splitting block %u of index \"%s\"",
2144 
2145  /*
2146  * insert the right page pointer into the new root page.
2147  */
2148  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2149  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2151  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2152  false, false) == InvalidOffsetNumber)
2153  elog(PANIC, "failed to add rightkey to new root page"
2154  " while splitting block %u of index \"%s\"",
2156 
2157  /* Clear the incomplete-split flag in the left child */
2158  Assert(P_INCOMPLETE_SPLIT(lopaque));
2159  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2160  MarkBufferDirty(lbuf);
2161 
2162  MarkBufferDirty(rootbuf);
2163  MarkBufferDirty(metabuf);
2164 
2165  /* XLOG stuff */
2166  if (RelationNeedsWAL(rel))
2167  {
2168  xl_btree_newroot xlrec;
2169  XLogRecPtr recptr;
2170  xl_btree_metadata md;
2171 
2172  xlrec.rootblk = rootblknum;
2173  xlrec.level = metad->btm_level;
2174 
2175  XLogBeginInsert();
2176  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2177 
2178  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2181 
2183  md.version = metad->btm_version;
2184  md.root = rootblknum;
2185  md.level = metad->btm_level;
2186  md.fastroot = rootblknum;
2187  md.fastlevel = metad->btm_level;
2190 
2191  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2192 
2193  /*
2194  * Direct access to page is not good but faster - we should implement
2195  * some new func in page API.
2196  */
2198  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2199  ((PageHeader) rootpage)->pd_special -
2200  ((PageHeader) rootpage)->pd_upper);
2201 
2202  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2203 
2204  PageSetLSN(lpage, recptr);
2205  PageSetLSN(rootpage, recptr);
2206  PageSetLSN(metapg, recptr);
2207  }
2208 
2209  END_CRIT_SECTION();
2210 
2211  /* done with metapage */
2212  _bt_relbuf(rel, metabuf);
2213 
2214  pfree(left_item);
2215  pfree(right_item);
2216 
2217  return rootbuf;
2218 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
BlockNumber rootblk
Definition: nbtxlog.h:247
#define BTP_ROOT
Definition: nbtree.h:72
BlockNumber btpo_next
Definition: nbtree.h:58
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:88
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BlockNumber root
Definition: nbtxlog.h:50
Pointer Item
Definition: item.h:17
#define P_NONE
Definition: nbtree.h:181
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
uint32 level
Definition: nbtxlog.h:248
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:78
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:81
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
#define PANIC
Definition: elog.h:53
#define BTreeTupleSetNAtts(itup, n)
Definition: nbtree.h:336
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:35
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
BTCycleId btpo_cycleid
Definition: nbtree.h:65
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
#define BTPageGetMeta(p)
Definition: nbtree.h:112
BlockNumber btpo_prev
Definition: nbtree.h:57
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:502
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define P_FIRSTKEY
Definition: nbtree.h:218
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:435
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define BTREE_METAPAGE
Definition: nbtree.h:131
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:251
uint32 version
Definition: nbtxlog.h:49
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 btm_fastlevel
Definition: nbtree.h:104
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
struct IndexTupleData IndexTupleData
BlockNumber btm_root
Definition: nbtree.h:101
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
#define RelationNeedsWAL(relation)
Definition: rel.h:518
#define BTreeInnerTupleSetDownLink(itup, blkno)
Definition: nbtree.h:303
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
void * palloc(Size size)
Definition: mcxt.c:924
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:102
#define elog(elevel,...)
Definition: elog.h:226
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
unsigned short t_info
Definition: itup.h:49
#define BT_WRITE
Definition: nbtree.h:403
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off 
)
static

Definition at line 2236 of file nbtinsert.c.

References BTreeTupleSetNAtts, InvalidOffsetNumber, P_FIRSTDATAKEY, P_ISLEAF, PageAddItem, PageGetSpecialPointer, and IndexTupleData::t_info.

Referenced by _bt_insertonpg(), and _bt_split().

2240 {
2242  IndexTupleData trunctuple;
2243 
2244  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2245  {
2246  trunctuple = *itup;
2247  trunctuple.t_info = sizeof(IndexTupleData);
2248  /* Deliberately zero INDEX_ALT_TID_MASK bits */
2249  BTreeTupleSetNAtts(&trunctuple, 0);
2250  itup = &trunctuple;
2251  itemsize = sizeof(IndexTupleData);
2252  }
2253 
2254  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2255  false, false) == InvalidOffsetNumber)
2256  return false;
2257 
2258  return true;
2259 }
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
Pointer Item
Definition: item.h:17
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
#define BTreeTupleSetNAtts(itup, n)
Definition: nbtree.h:336
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
struct IndexTupleData IndexTupleData
#define InvalidOffsetNumber
Definition: off.h:26
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
unsigned short t_info
Definition: itup.h:49
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem 
)
static

Definition at line 1216 of file nbtinsert.c.

References _bt_findsplitloc(), _bt_getbuf(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_truncate(), _bt_vacuum_cycleid(), Assert, BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferGetPageSize, BufferIsValid, elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_split::firstright, BTScanInsertData::heapkeyspace, i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, BTPageOpaqueData::level, xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_NEW, P_RIGHTMOST, PageAddItem, PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPage(), PageRestoreTempPage(), PageSetLSN, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_R, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().

1218 {
1219  Buffer rbuf;
1220  Page origpage;
1221  Page leftpage,
1222  rightpage;
1223  BlockNumber origpagenumber,
1224  rightpagenumber;
1225  BTPageOpaque ropaque,
1226  lopaque,
1227  oopaque;
1228  Buffer sbuf = InvalidBuffer;
1229  Page spage = NULL;
1230  BTPageOpaque sopaque = NULL;
1231  Size itemsz;
1232  ItemId itemid;
1233  IndexTuple item;
1234  OffsetNumber leftoff,
1235  rightoff;
1236  OffsetNumber firstright;
1237  OffsetNumber maxoff;
1238  OffsetNumber i;
1239  bool newitemonleft,
1240  isleaf;
1241  IndexTuple lefthikey;
1242  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1243  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1244 
1245  /*
1246  * origpage is the original page to be split. leftpage is a temporary
1247  * buffer that receives the left-sibling data, which will be copied back
1248  * into origpage on success. rightpage is the new page that will receive
1249  * the right-sibling data.
1250  *
1251  * leftpage is allocated after choosing a split point. rightpage's new
1252  * buffer isn't acquired until after leftpage is initialized and has new
1253  * high key, the last point where splitting the page may fail (barring
1254  * corruption). Failing before acquiring new buffer won't have lasting
1255  * consequences, since origpage won't have been modified and leftpage is
1256  * only workspace.
1257  */
1258  origpage = BufferGetPage(buf);
1259  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1260  origpagenumber = BufferGetBlockNumber(buf);
1261 
1262  /*
1263  * Choose a point to split origpage at.
1264  *
1265  * A split point can be thought of as a point _between_ two existing
1266  * tuples on origpage (lastleft and firstright tuples), provided you
1267  * pretend that the new item that didn't fit is already on origpage.
1268  *
1269  * Since origpage does not actually contain newitem, the representation of
1270  * split points needs to work with two boundary cases: splits where
1271  * newitem is lastleft, and splits where newitem is firstright.
1272  * newitemonleft resolves the ambiguity that would otherwise exist when
1273  * newitemoff == firstright. In all other cases it's clear which side of
1274  * the split every tuple goes on from context. newitemonleft is usually
1275  * (but not always) redundant information.
1276  */
1277  firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1278  newitem, &newitemonleft);
1279 
1280  /* Allocate temp buffer for leftpage */
1281  leftpage = PageGetTempPage(origpage);
1282  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1283  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1284 
1285  /*
1286  * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1287  * and HAS_GARBAGE flags.
1288  */
1289  lopaque->btpo_flags = oopaque->btpo_flags;
1290  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1291  /* set flag in leftpage indicating that rightpage has no downlink yet */
1292  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1293  lopaque->btpo_prev = oopaque->btpo_prev;
1294  /* handle btpo_next after rightpage buffer acquired */
1295  lopaque->btpo.level = oopaque->btpo.level;
1296  /* handle btpo_cycleid after rightpage buffer acquired */
1297 
1298  /*
1299  * Copy the original page's LSN into leftpage, which will become the
1300  * updated version of the page. We need this because XLogInsert will
1301  * examine the LSN and possibly dump it in a page image.
1302  */
1303  PageSetLSN(leftpage, PageGetLSN(origpage));
1304  isleaf = P_ISLEAF(oopaque);
1305 
1306  /*
1307  * The "high key" for the new left page will be the first key that's going
1308  * to go into the new right page, or a truncated version if this is a leaf
1309  * page split.
1310  *
1311  * The high key for the left page is formed using the first item on the
1312  * right page, which may seem to be contrary to Lehman & Yao's approach of
1313  * using the left page's last item as its new high key when splitting on
1314  * the leaf level. It isn't, though: suffix truncation will leave the
1315  * left page's high key fully equal to the last item on the left page when
1316  * two tuples with equal key values (excluding heap TID) enclose the split
1317  * point. It isn't actually necessary for a new leaf high key to be equal
1318  * to the last item on the left for the L&Y "subtree" invariant to hold.
1319  * It's sufficient to make sure that the new leaf high key is strictly
1320  * less than the first item on the right leaf page, and greater than or
1321  * equal to (not necessarily equal to) the last item on the left leaf
1322  * page.
1323  *
1324  * In other words, when suffix truncation isn't possible, L&Y's exact
1325  * approach to leaf splits is taken. (Actually, even that is slightly
1326  * inaccurate. A tuple with all the keys from firstright but the heap TID
1327  * from lastleft will be used as the new high key, since the last left
1328  * tuple could be physically larger despite being opclass-equal in respect
1329  * of all attributes prior to the heap TID attribute.)
1330  */
1331  if (!newitemonleft && newitemoff == firstright)
1332  {
1333  /* incoming tuple will become first on right page */
1334  itemsz = newitemsz;
1335  item = newitem;
1336  }
1337  else
1338  {
1339  /* existing item at firstright will become first on right page */
1340  itemid = PageGetItemId(origpage, firstright);
1341  itemsz = ItemIdGetLength(itemid);
1342  item = (IndexTuple) PageGetItem(origpage, itemid);
1343  }
1344 
1345  /*
1346  * Truncate unneeded key and non-key attributes of the high key item
1347  * before inserting it on the left page. This can only happen at the leaf
1348  * level, since in general all pivot tuple values originate from leaf
1349  * level high keys. A pivot tuple in a grandparent page must guide a
1350  * search not only to the correct parent page, but also to the correct
1351  * leaf page.
1352  */
1353  if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
1354  {
1355  IndexTuple lastleft;
1356 
1357  /*
1358  * Determine which tuple will become the last on the left page. This
1359  * is needed to decide how many attributes from the first item on the
1360  * right page must remain in new high key for left page.
1361  */
1362  if (newitemonleft && newitemoff == firstright)
1363  {
1364  /* incoming tuple will become last on left page */
1365  lastleft = newitem;
1366  }
1367  else
1368  {
1369  OffsetNumber lastleftoff;
1370 
1371  /* item just before firstright will become last on left page */
1372  lastleftoff = OffsetNumberPrev(firstright);
1373  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1374  itemid = PageGetItemId(origpage, lastleftoff);
1375  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1376  }
1377 
1378  Assert(lastleft != item);
1379  lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
1380  itemsz = IndexTupleSize(lefthikey);
1381  itemsz = MAXALIGN(itemsz);
1382  }
1383  else
1384  lefthikey = item;
1385 
1386  /*
1387  * Add new high key to leftpage
1388  */
1389  leftoff = P_HIKEY;
1390 
1391  Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
1392  Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
1393  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1394  false, false) == InvalidOffsetNumber)
1395  elog(ERROR, "failed to add hikey to the left sibling"
1396  " while splitting block %u of index \"%s\"",
1397  origpagenumber, RelationGetRelationName(rel));
1398  leftoff = OffsetNumberNext(leftoff);
1399  /* be tidy */
1400  if (lefthikey != item)
1401  pfree(lefthikey);
1402 
1403  /*
1404  * Acquire a new right page to split into, now that left page has a new
1405  * high key. From here on, it's not okay to throw an error without
1406  * zeroing rightpage first. This coding rule ensures that we won't
1407  * confuse future VACUUM operations, which might otherwise try to re-find
1408  * a downlink to a leftover junk page as the page undergoes deletion.
1409  *
1410  * It would be reasonable to start the critical section just after the new
1411  * rightpage buffer is acquired instead; that would allow us to avoid
1412  * leftover junk pages without bothering to zero rightpage. We do it this
1413  * way because it avoids an unnecessary PANIC when either origpage or its
1414  * existing sibling page are corrupt.
1415  */
1416  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1417  rightpage = BufferGetPage(rbuf);
1418  rightpagenumber = BufferGetBlockNumber(rbuf);
1419  /* rightpage was initialized by _bt_getbuf */
1420  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1421 
1422  /*
1423  * Finish off remaining leftpage special area fields. They cannot be set
1424  * before both origpage (leftpage) and rightpage buffers are acquired and
1425  * locked.
1426  */
1427  lopaque->btpo_next = rightpagenumber;
1428  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1429 
1430  /*
1431  * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1432  * and HAS_GARBAGE flags.
1433  */
1434  ropaque->btpo_flags = oopaque->btpo_flags;
1435  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1436  ropaque->btpo_prev = origpagenumber;
1437  ropaque->btpo_next = oopaque->btpo_next;
1438  ropaque->btpo.level = oopaque->btpo.level;
1439  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1440 
1441  /*
1442  * Add new high key to rightpage where necessary.
1443  *
1444  * If the page we're splitting is not the rightmost page at its level in
1445  * the tree, then the first entry on the page is the high key from
1446  * origpage.
1447  */
1448  rightoff = P_HIKEY;
1449 
1450  if (!P_RIGHTMOST(oopaque))
1451  {
1452  itemid = PageGetItemId(origpage, P_HIKEY);
1453  itemsz = ItemIdGetLength(itemid);
1454  item = (IndexTuple) PageGetItem(origpage, itemid);
1455  Assert(BTreeTupleGetNAtts(item, rel) > 0);
1456  Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
1457  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1458  false, false) == InvalidOffsetNumber)
1459  {
1460  memset(rightpage, 0, BufferGetPageSize(rbuf));
1461  elog(ERROR, "failed to add hikey to the right sibling"
1462  " while splitting block %u of index \"%s\"",
1463  origpagenumber, RelationGetRelationName(rel));
1464  }
1465  rightoff = OffsetNumberNext(rightoff);
1466  }
1467 
1468  /*
1469  * Now transfer all the data items (non-pivot tuples in isleaf case, or
1470  * additional pivot tuples in !isleaf case) to the appropriate page.
1471  *
1472  * Note: we *must* insert at least the right page's items in item-number
1473  * order, for the benefit of _bt_restore_page().
1474  */
1475  maxoff = PageGetMaxOffsetNumber(origpage);
1476 
1477  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1478  {
1479  itemid = PageGetItemId(origpage, i);
1480  itemsz = ItemIdGetLength(itemid);
1481  item = (IndexTuple) PageGetItem(origpage, itemid);
1482 
1483  /* does new item belong before this one? */
1484  if (i == newitemoff)
1485  {
1486  if (newitemonleft)
1487  {
1488  Assert(newitemoff <= firstright);
1489  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1490  {
1491  memset(rightpage, 0, BufferGetPageSize(rbuf));
1492  elog(ERROR, "failed to add new item to the left sibling"
1493  " while splitting block %u of index \"%s\"",
1494  origpagenumber, RelationGetRelationName(rel));
1495  }
1496  leftoff = OffsetNumberNext(leftoff);
1497  }
1498  else
1499  {
1500  Assert(newitemoff >= firstright);
1501  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1502  {
1503  memset(rightpage, 0, BufferGetPageSize(rbuf));
1504  elog(ERROR, "failed to add new item to the right sibling"
1505  " while splitting block %u of index \"%s\"",
1506  origpagenumber, RelationGetRelationName(rel));
1507  }
1508  rightoff = OffsetNumberNext(rightoff);
1509  }
1510  }
1511 
1512  /* decide which page to put it on */
1513  if (i < firstright)
1514  {
1515  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1516  {
1517  memset(rightpage, 0, BufferGetPageSize(rbuf));
1518  elog(ERROR, "failed to add old item to the left sibling"
1519  " while splitting block %u of index \"%s\"",
1520  origpagenumber, RelationGetRelationName(rel));
1521  }
1522  leftoff = OffsetNumberNext(leftoff);
1523  }
1524  else
1525  {
1526  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1527  {
1528  memset(rightpage, 0, BufferGetPageSize(rbuf));
1529  elog(ERROR, "failed to add old item to the right sibling"
1530  " while splitting block %u of index \"%s\"",
1531  origpagenumber, RelationGetRelationName(rel));
1532  }
1533  rightoff = OffsetNumberNext(rightoff);
1534  }
1535  }
1536 
1537  /* cope with possibility that newitem goes at the end */
1538  if (i <= newitemoff)
1539  {
1540  /*
1541  * Can't have newitemonleft here; that would imply we were told to put
1542  * *everything* on the left page, which cannot fit (if it could, we'd
1543  * not be splitting the page).
1544  */
1545  Assert(!newitemonleft);
1546  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1547  {
1548  memset(rightpage, 0, BufferGetPageSize(rbuf));
1549  elog(ERROR, "failed to add new item to the right sibling"
1550  " while splitting block %u of index \"%s\"",
1551  origpagenumber, RelationGetRelationName(rel));
1552  }
1553  rightoff = OffsetNumberNext(rightoff);
1554  }
1555 
1556  /*
1557  * We have to grab the right sibling (if any) and fix the prev pointer
1558  * there. We are guaranteed that this is deadlock-free since no other
1559  * writer will be holding a lock on that page and trying to move left, and
1560  * all readers release locks on a page before trying to fetch its
1561  * neighbors.
1562  */
1563  if (!P_RIGHTMOST(oopaque))
1564  {
1565  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1566  spage = BufferGetPage(sbuf);
1567  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1568  if (sopaque->btpo_prev != origpagenumber)
1569  {
1570  memset(rightpage, 0, BufferGetPageSize(rbuf));
1571  ereport(ERROR,
1572  (errcode(ERRCODE_INDEX_CORRUPTED),
1573  errmsg_internal("right sibling's left-link doesn't match: "
1574  "block %u links to %u instead of expected %u in index \"%s\"",
1575  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1576  RelationGetRelationName(rel))));
1577  }
1578 
1579  /*
1580  * Check to see if we can set the SPLIT_END flag in the right-hand
1581  * split page; this can save some I/O for vacuum since it need not
1582  * proceed to the right sibling. We can set the flag if the right
1583  * sibling has a different cycleid: that means it could not be part of
1584  * a group of pages that were all split off from the same ancestor
1585  * page. If you're confused, imagine that page A splits to A B and
1586  * then again, yielding A C B, while vacuum is in progress. Tuples
1587  * originally in A could now be in either B or C, hence vacuum must
1588  * examine both pages. But if D, our right sibling, has a different
1589  * cycleid then it could not contain any tuples that were in A when
1590  * the vacuum started.
1591  */
1592  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1593  ropaque->btpo_flags |= BTP_SPLIT_END;
1594  }
1595 
1596  /*
1597  * Right sibling is locked, new siblings are prepared, but original page
1598  * is not updated yet.
1599  *
1600  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1601  * not starting the critical section till here because we haven't been
1602  * scribbling on the original page yet; see comments above.
1603  */
1605 
1606  /*
1607  * By here, the original data page has been split into two new halves, and
1608  * these are correct. The algorithm requires that the left page never
1609  * move during a split, so we copy the new left page back on top of the
1610  * original. Note that this is not a waste of time, since we also require
1611  * (in the page management code) that the center of a page always be
1612  * clean, and the most efficient way to guarantee this is just to compact
1613  * the data by reinserting it into a new left page. (XXX the latter
1614  * comment is probably obsolete; but in any case it's good to not scribble
1615  * on the original page until we enter the critical section.)
1616  *
1617  * We need to do this before writing the WAL record, so that XLogInsert
1618  * can WAL log an image of the page if necessary.
1619  */
1620  PageRestoreTempPage(leftpage, origpage);
1621  /* leftpage, lopaque must not be used below here */
1622 
1624  MarkBufferDirty(rbuf);
1625 
1626  if (!P_RIGHTMOST(ropaque))
1627  {
1628  sopaque->btpo_prev = rightpagenumber;
1629  MarkBufferDirty(sbuf);
1630  }
1631 
1632  /*
1633  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1634  * a split.
1635  */
1636  if (!isleaf)
1637  {
1638  Page cpage = BufferGetPage(cbuf);
1639  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1640 
1641  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1642  MarkBufferDirty(cbuf);
1643  }
1644 
1645  /* XLOG stuff */
1646  if (RelationNeedsWAL(rel))
1647  {
1648  xl_btree_split xlrec;
1649  uint8 xlinfo;
1650  XLogRecPtr recptr;
1651 
1652  xlrec.level = ropaque->btpo.level;
1653  xlrec.firstright = firstright;
1654  xlrec.newitemoff = newitemoff;
1655 
1656  XLogBeginInsert();
1657  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1658 
1661  /* Log the right sibling, because we've changed its prev-pointer. */
1662  if (!P_RIGHTMOST(ropaque))
1664  if (BufferIsValid(cbuf))
1666 
1667  /*
1668  * Log the new item, if it was inserted on the left page. (If it was
1669  * put on the right page, we don't need to explicitly WAL log it
1670  * because it's included with all the other items on the right page.)
1671  * Show the new item as belonging to the left page buffer, so that it
1672  * is not stored if XLogInsert decides it needs a full-page image of
1673  * the left page. We store the offset anyway, though, to support
1674  * archive compression of these records.
1675  */
1676  if (newitemonleft)
1677  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1678 
1679  /* Log the left page's new high key */
1680  itemid = PageGetItemId(origpage, P_HIKEY);
1681  item = (IndexTuple) PageGetItem(origpage, itemid);
1682  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1683 
1684  /*
1685  * Log the contents of the right page in the format understood by
1686  * _bt_restore_page(). The whole right page will be recreated.
1687  *
1688  * Direct access to page is not good but faster - we should implement
1689  * some new func in page API. Note we only store the tuples
1690  * themselves, knowing that they were inserted in item-number order
1691  * and so the line pointers can be reconstructed. See comments for
1692  * _bt_restore_page().
1693  */
1695  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1696  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1697 
1698  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
1699  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1700 
1701  PageSetLSN(origpage, recptr);
1702  PageSetLSN(rightpage, recptr);
1703  if (!P_RIGHTMOST(ropaque))
1704  {
1705  PageSetLSN(spage, recptr);
1706  }
1707  if (!isleaf)
1708  {
1709  PageSetLSN(BufferGetPage(cbuf), recptr);
1710  }
1711  }
1712 
1713  END_CRIT_SECTION();
1714 
1715  /* release the old right sibling */
1716  if (!P_RIGHTMOST(ropaque))
1717  _bt_relbuf(rel, sbuf);
1718 
1719  /* release the child */
1720  if (!isleaf)
1721  _bt_relbuf(rel, cbuf);
1722 
1723  /* split's done */
1724  return rbuf;
1725 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
#define BTP_ROOT
Definition: nbtree.h:72
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:1859
#define BTP_SPLIT_END
Definition: nbtree.h:76
BlockNumber btpo_next
Definition: nbtree.h:58
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:410
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define SizeOfBtreeSplit
Definition: nbtxlog.h:118
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
unsigned char uint8
Definition: c.h:356
Pointer Item
Definition: item.h:17
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
int errcode(int sqlerrcode)
Definition: elog.c:570
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:78
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:81
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2116
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ERROR
Definition: elog.h:43
OffsetNumber newitemoff
Definition: nbtxlog.h:115
BTCycleId btpo_cycleid
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:57
static char * buf
Definition: pg_test_fsync.c:68
OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:127
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:428
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:435
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
Page PageGetTempPage(Page page)
Definition: bufpage.c:351
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
Definition: nbtinsert.c:2236
uint32 level
Definition: nbtxlog.h:113
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:146
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:30
int errmsg_internal(const char *fmt,...)
Definition: elog.c:814
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
OffsetNumber firstright
Definition: nbtxlog.h:114
bool heapkeyspace
Definition: nbtree.h:468
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define MAXALIGN(LEN)
Definition: c.h:685
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define RelationNeedsWAL(relation)
Definition: rel.h:518
#define PageGetLSN(page)
Definition: bufpage.h:366
#define P_HIKEY
Definition: nbtree.h:217
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
int i
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:924
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:29
#define BT_WRITE
Definition: nbtree.h:403
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_stepright()

/*
 * _bt_stepright() -- Step right to the next non-dead leaf page, during an
 *		insertion descent.
 *
 * On entry, insertstate->buf holds a write lock on the current leaf page.
 * On exit, insertstate->buf holds a write lock on its right sibling (or a
 * page further right, if intermediate pages were half-dead/deleted), and
 * the lock on the original page has been released.  bounds_valid is
 * cleared because the cached binary-search bounds no longer apply to the
 * new page.
 *
 * rel: the index relation.
 * insertstate: insertion working state; buf is updated in place.
 * stack: parent-page stack, needed only to finish an incomplete split
 * encountered along the way.
 */
static void
_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
{
	Page		page;
	BTPageOpaque lpageop;
	Buffer		rbuf;
	BlockNumber rblkno;

	page = BufferGetPage(insertstate->buf);
	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

	rbuf = InvalidBuffer;
	rblkno = lpageop->btpo_next;
	for (;;)
	{
		rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
		page = BufferGetPage(rbuf);
		lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this page was incompletely split, finish the split now.  We do
		 * this while holding a lock on the left sibling, which is not good
		 * because finishing the split could be a fairly lengthy operation.
		 * But this should happen very seldom.
		 */
		if (P_INCOMPLETE_SPLIT(lpageop))
		{
			/* _bt_finish_split releases the lock on rbuf; re-fetch it */
			_bt_finish_split(rel, rbuf, stack);
			rbuf = InvalidBuffer;
			continue;
		}

		/* a live (non-ignorable) page is our destination */
		if (!P_IGNORE(lpageop))
			break;
		if (P_RIGHTMOST(lpageop))
			elog(ERROR, "fell off the end of index \"%s\"",
				 RelationGetRelationName(rel));

		rblkno = lpageop->btpo_next;
	}
	/* rbuf locked; unlock buf, update state for caller */
	_bt_relbuf(rel, insertstate->buf);
	insertstate->buf = rbuf;
	insertstate->bounds_valid = false;
}
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool bounds_valid
Definition: nbtree.h:504
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1856
#define ERROR
Definition: elog.h:43
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define elog(elevel,...)
Definition: elog.h:226
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_vacuum_one_page()

/*
 * _bt_vacuum_one_page() -- Try to remove LP_DEAD items from the given leaf
 *		page, to reclaim space ahead of an insertion.
 *
 * Caller must hold a write lock (and pin) on "buffer", which must be a
 * leaf page.  All items whose line pointers are marked LP_DEAD are handed
 * to _bt_delitems_delete() for physical removal in one WAL-logged batch.
 *
 * rel: the index relation.
 * buffer: the exclusively-locked leaf page to prune.
 * heapRel: the heap relation the index is on (needed by the deletion
 * machinery for recovery-conflict purposes).
 */
static void
_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
{
	OffsetNumber deletable[MaxOffsetNumber];
	int			ndeletable = 0;
	OffsetNumber offnum,
				minoff,
				maxoff;
	Page		page = BufferGetPage(buffer);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	Assert(P_ISLEAF(opaque));

	/*
	 * Scan over all items to see which ones need to be deleted according to
	 * LP_DEAD flags.
	 */
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
		_bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);

	/*
	 * Note: if we didn't find any LP_DEAD items, then the page's
	 * BTP_HAS_GARBAGE hint bit is falsely set.  We do not bother expending a
	 * separate write to clear it, however.  We will clear it when we split
	 * the page.
	 */
}
#define MaxOffsetNumber
Definition: off.h:28
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define Assert(condition)
Definition: c.h:732
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
Definition: nbtpage.c:1057
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189