PostgreSQL Source Code  git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
Include dependency graph for nbtinsert.c:

Go to the source code of this file.

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static Buffer _bt_newroot (Relation rel, Buffer lbuf, Buffer rbuf)
 
static TransactionId _bt_check_unique (Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static OffsetNumber _bt_findinsertloc (Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
 
static void _bt_stepright (Relation rel, BTInsertState insertstate, BTStack stack)
 
static void _bt_insertonpg (Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft)
 
static void _bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
 
static bool _bt_pgaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
 
static bool _bt_isequal (TupleDesc itupdesc, BTScanInsert itup_key, Page page, OffsetNumber offnum)
 
static void _bt_vacuum_one_page (Relation rel, Buffer buffer, Relation heapRel)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 29 of file nbtinsert.c.

Referenced by _bt_insertonpg().

Function Documentation

◆ _bt_check_unique()

static TransactionId _bt_check_unique ( Relation  rel,
BTInsertState  insertstate,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool *  is_unique,
uint32 *  speculativeToken 
)
static

Definition at line 320 of file nbtinsert.c.

References _bt_binsrch_insert(), _bt_compare(), _bt_isequal(), _bt_relandgetbuf(), _bt_relbuf(), Assert, BTInsertStateData::bounds_valid, BT_READ, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTInsertStateData::buf, BufferGetPage, BuildIndexValueDescription(), CheckForSerializableConflictIn(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), heap_hot_search(), index_deform_tuple(), INDEX_MAX_KEYS, InitDirtySnapshot, InvalidBuffer, InvalidTransactionId, ItemIdIsDead, ItemIdMarkDead, ItemPointerCompare(), BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MarkBufferDirtyHint(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, RelationGetRelationName, BTScanInsertData::scantid, SnapshotSelf, SnapshotData::speculativeToken, BTInsertStateData::stricthigh, IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_PARTIAL, values, SnapshotData::xmax, and SnapshotData::xmin.

Referenced by _bt_doinsert().

323 {
324  TupleDesc itupdesc = RelationGetDescr(rel);
325  IndexTuple itup = insertstate->itup;
326  BTScanInsert itup_key = insertstate->itup_key;
327  SnapshotData SnapshotDirty;
328  OffsetNumber offset;
329  OffsetNumber maxoff;
330  Page page;
331  BTPageOpaque opaque;
332  Buffer nbuf = InvalidBuffer;
333  bool found = false;
334 
335  /* Assume unique until we find a duplicate */
336  *is_unique = true;
337 
338  InitDirtySnapshot(SnapshotDirty);
339 
340  page = BufferGetPage(insertstate->buf);
341  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
342  maxoff = PageGetMaxOffsetNumber(page);
343 
344  /*
345  * Find the first tuple with the same key.
346  *
347  * This also saves the binary search bounds in insertstate. We use them
348  * in the fastpath below, but also in the _bt_findinsertloc() call later.
349  */
350  offset = _bt_binsrch_insert(rel, insertstate);
351 
352  /*
353  * Scan over all equal tuples, looking for live conflicts.
354  */
355  Assert(!insertstate->bounds_valid || insertstate->low == offset);
356  Assert(itup_key->scantid == NULL);
357  for (;;)
358  {
359  ItemId curitemid;
360  IndexTuple curitup;
361  BlockNumber nblkno;
362 
363  /*
364  * make sure the offset points to an actual item before trying to
365  * examine it...
366  */
367  if (offset <= maxoff)
368  {
369  /*
370  * Fastpath: In most cases, we can use cached search bounds to
371  * limit our consideration to items that are definitely
372  * duplicates. This fastpath doesn't apply when the original page
373  * is empty, or when initial offset is past the end of the
374  * original page, which may indicate that we need to examine a
375  * second or subsequent page.
376  *
377  * Note that this optimization avoids calling _bt_isequal()
378  * entirely when there are no duplicates, as long as the offset
379  * where the key will go is not at the end of the page.
380  */
381  if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
382  {
383  Assert(insertstate->bounds_valid);
384  Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
385  Assert(insertstate->low <= insertstate->stricthigh);
386  Assert(!_bt_isequal(itupdesc, itup_key, page, offset));
387  break;
388  }
389 
390  curitemid = PageGetItemId(page, offset);
391 
392  /*
393  * We can skip items that are marked killed.
394  *
395  * In the presence of heavy update activity an index may contain
396  * many killed items with the same key; running _bt_isequal() on
397  * each killed item gets expensive. Just advance over killed
398  * items as quickly as we can. We only apply _bt_isequal() when
399  * we get to a non-killed item. Even those comparisons could be
400  * avoided (in the common case where there is only one page to
401  * visit) by reusing bounds, but just skipping dead items is fast
402  * enough.
403  */
404  if (!ItemIdIsDead(curitemid))
405  {
406  ItemPointerData htid;
407  bool all_dead;
408 
409  /*
410  * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
411  * how we handling NULLs - and so we must not use _bt_compare
412  * in real comparison, but only for ordering/finding items on
413  * pages. - vadim 03/24/97
414  */
415  if (!_bt_isequal(itupdesc, itup_key, page, offset))
416  break; /* we're past all the equal tuples */
417 
418  /* okay, we gotta fetch the heap tuple ... */
419  curitup = (IndexTuple) PageGetItem(page, curitemid);
420  htid = curitup->t_tid;
421 
422  /*
423  * If we are doing a recheck, we expect to find the tuple we
424  * are rechecking. It's not a duplicate, but we have to keep
425  * scanning.
426  */
427  if (checkUnique == UNIQUE_CHECK_EXISTING &&
428  ItemPointerCompare(&htid, &itup->t_tid) == 0)
429  {
430  found = true;
431  }
432 
433  /*
434  * We check the whole HOT-chain to see if there is any tuple
435  * that satisfies SnapshotDirty. This is necessary because we
436  * have just a single index entry for the entire chain.
437  */
438  else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
439  &all_dead))
440  {
441  TransactionId xwait;
442 
443  /*
444  * It is a duplicate. If we are only doing a partial
445  * check, then don't bother checking if the tuple is being
446  * updated in another transaction. Just return the fact
447  * that it is a potential conflict and leave the full
448  * check till later.
449  */
450  if (checkUnique == UNIQUE_CHECK_PARTIAL)
451  {
452  if (nbuf != InvalidBuffer)
453  _bt_relbuf(rel, nbuf);
454  *is_unique = false;
455  return InvalidTransactionId;
456  }
457 
458  /*
459  * If this tuple is being updated by other transaction
460  * then we have to wait for its commit/abort.
461  */
462  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
463  SnapshotDirty.xmin : SnapshotDirty.xmax;
464 
465  if (TransactionIdIsValid(xwait))
466  {
467  if (nbuf != InvalidBuffer)
468  _bt_relbuf(rel, nbuf);
469  /* Tell _bt_doinsert to wait... */
470  *speculativeToken = SnapshotDirty.speculativeToken;
471  return xwait;
472  }
473 
474  /*
475  * Otherwise we have a definite conflict. But before
476  * complaining, look to see if the tuple we want to insert
477  * is itself now committed dead --- if so, don't complain.
478  * This is a waste of time in normal scenarios but we must
479  * do it to support CREATE INDEX CONCURRENTLY.
480  *
481  * We must follow HOT-chains here because during
482  * concurrent index build, we insert the root TID though
483  * the actual tuple may be somewhere in the HOT-chain.
484  * While following the chain we might not stop at the
485  * exact tuple which triggered the insert, but that's OK
486  * because if we find a live tuple anywhere in this chain,
487  * we have a unique key conflict. The other live tuple is
488  * not part of this chain because it had a different index
489  * entry.
490  */
491  htid = itup->t_tid;
492  if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
493  {
494  /* Normal case --- it's still live */
495  }
496  else
497  {
498  /*
499  * It's been deleted, so no error, and no need to
500  * continue searching
501  */
502  break;
503  }
504 
505  /*
506  * Check for a conflict-in as we would if we were going to
507  * write to this page. We aren't actually going to write,
508  * but we want a chance to report SSI conflicts that would
509  * otherwise be masked by this unique constraint
510  * violation.
511  */
512  CheckForSerializableConflictIn(rel, NULL, insertstate->buf);
513 
514  /*
515  * This is a definite conflict. Break the tuple down into
516  * datums and report the error. But first, make sure we
517  * release the buffer locks we're holding ---
518  * BuildIndexValueDescription could make catalog accesses,
519  * which in the worst case might touch this same index and
520  * cause deadlocks.
521  */
522  if (nbuf != InvalidBuffer)
523  _bt_relbuf(rel, nbuf);
524  _bt_relbuf(rel, insertstate->buf);
525  insertstate->buf = InvalidBuffer;
526 
527  {
528  Datum values[INDEX_MAX_KEYS];
529  bool isnull[INDEX_MAX_KEYS];
530  char *key_desc;
531 
532  index_deform_tuple(itup, RelationGetDescr(rel),
533  values, isnull);
534 
535  key_desc = BuildIndexValueDescription(rel, values,
536  isnull);
537 
538  ereport(ERROR,
539  (errcode(ERRCODE_UNIQUE_VIOLATION),
540  errmsg("duplicate key value violates unique constraint \"%s\"",
541  RelationGetRelationName(rel)),
542  key_desc ? errdetail("Key %s already exists.",
543  key_desc) : 0,
544  errtableconstraint(heapRel,
545  RelationGetRelationName(rel))));
546  }
547  }
548  else if (all_dead)
549  {
550  /*
551  * The conflicting tuple (or whole HOT chain) is dead to
552  * everyone, so we may as well mark the index entry
553  * killed.
554  */
555  ItemIdMarkDead(curitemid);
556  opaque->btpo_flags |= BTP_HAS_GARBAGE;
557 
558  /*
559  * Mark buffer with a dirty hint, since state is not
560  * crucial. Be sure to mark the proper buffer dirty.
561  */
562  if (nbuf != InvalidBuffer)
563  MarkBufferDirtyHint(nbuf, true);
564  else
565  MarkBufferDirtyHint(insertstate->buf, true);
566  }
567  }
568  }
569 
570  /*
571  * Advance to next tuple to continue checking.
572  */
573  if (offset < maxoff)
574  offset = OffsetNumberNext(offset);
575  else
576  {
577  int highkeycmp;
578 
579  /* If scankey == hikey we gotta check the next page too */
580  if (P_RIGHTMOST(opaque))
581  break;
582  highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
583  Assert(highkeycmp <= 0);
584  if (highkeycmp != 0)
585  break;
586  /* Advance to next non-dead page --- there must be one */
587  for (;;)
588  {
589  nblkno = opaque->btpo_next;
590  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
591  page = BufferGetPage(nbuf);
592  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
593  if (!P_IGNORE(opaque))
594  break;
595  if (P_RIGHTMOST(opaque))
 596  elog(ERROR, "fell off the end of index \"%s\"",
 597  RelationGetRelationName(rel));
 598  }
599  maxoff = PageGetMaxOffsetNumber(page);
600  offset = P_FIRSTDATAKEY(opaque);
601  }
602  }
603 
604  /*
605  * If we are doing a recheck then we should have found the tuple we are
606  * checking. Otherwise there's something very wrong --- probably, the
607  * index is on a non-immutable expression.
608  */
609  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
610  ereport(ERROR,
611  (errcode(ERRCODE_INTERNAL_ERROR),
 612  errmsg("failed to re-find tuple within index \"%s\"",
 613  RelationGetRelationName(rel)),
 614  errhint("This may be because of a non-immutable index expression."),
615  errtableconstraint(heapRel,
616  RelationGetRelationName(rel))));
617 
618  if (nbuf != InvalidBuffer)
619  _bt_relbuf(rel, nbuf);
620 
621  return InvalidTransactionId;
622 }
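
The core of the duplicate check above is the dirty-snapshot probe against the heap: for each equal index entry, the whole HOT chain is tested with a snapshot that also sees uncommitted tuples, and any in-flight inserter or deleter is reported back through the snapshot's xmin/xmax so the caller can wait and retry. The helper below is only a minimal sketch of that pattern; probe_duplicate() is a hypothetical name and is not part of nbtinsert.c.

#include "postgres.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "storage/itemptr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/*
 * Hypothetical helper (illustration only): probe the HOT chain behind one
 * index entry the way _bt_check_unique() does.  Returns an XID the caller
 * must wait for, or InvalidTransactionId; *conflict is set when a committed
 * live duplicate exists, *all_dead when the whole chain is dead to everyone.
 */
static TransactionId
probe_duplicate(Relation heapRel, ItemPointer htid, bool *conflict, bool *all_dead)
{
	SnapshotData SnapshotDirty;

	InitDirtySnapshot(SnapshotDirty);
	*conflict = false;

	if (heap_hot_search(htid, heapRel, &SnapshotDirty, all_dead))
	{
		/* inserted or deleted by a transaction still in flight? */
		TransactionId xwait = TransactionIdIsValid(SnapshotDirty.xmin) ?
			SnapshotDirty.xmin : SnapshotDirty.xmax;

		if (TransactionIdIsValid(xwait))
			return xwait;		/* caller waits, then restarts its insert */

		/* committed and live: a definite unique-key violation */
		*conflict = true;
	}
	return InvalidTransactionId;
}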

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 81 of file nbtinsert.c.

References _bt_check_unique(), _bt_checkpage(), _bt_compare(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), Assert, BTInsertStateData::bounds_valid, BT_WRITE, buf, BTInsertStateData::buf, BufferGetPage, CheckForSerializableConflictIn(), ConditionalLockBuffer(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, pfree(), ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, ReleaseBuffer(), BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

83 {
84  bool is_unique = false;
85  BTInsertStateData insertstate;
86  BTScanInsert itup_key;
87  BTStack stack = NULL;
88  Buffer buf;
89  bool fastpath;
90  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
91 
92  /* we need an insertion scan key to do our search, so build one */
93  itup_key = _bt_mkscankey(rel, itup);
94  /* No scantid until uniqueness established in checkingunique case */
95  if (checkingunique && itup_key->heapkeyspace)
96  itup_key->scantid = NULL;
97 
98  /*
99  * Fill in the BTInsertState working area, to track the current page and
100  * position within the page to insert on
101  */
102  insertstate.itup = itup;
103  /* PageAddItem will MAXALIGN(), but be consistent */
104  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
105  insertstate.itup_key = itup_key;
106  insertstate.bounds_valid = false;
107  insertstate.buf = InvalidBuffer;
108 
109  /*
110  * It's very common to have an index on an auto-incremented or
111  * monotonically increasing value. In such cases, every insertion happens
112  * towards the end of the index. We try to optimize that case by caching
113  * the right-most leaf of the index. If our cached block is still the
114  * rightmost leaf, has enough free space to accommodate a new entry and
115  * the insertion key is strictly greater than the first key in this page,
116  * then we can safely conclude that the new key will be inserted in the
117  * cached block. So we simply search within the cached block and insert
118  * the key at the appropriate location. We call it a fastpath.
119  *
120  * Testing has revealed, though, that the fastpath can result in increased
121  * contention on the exclusive-lock on the rightmost leaf page. So we
122  * conditionally check if the lock is available. If it's not available
123  * then we simply abandon the fastpath and take the regular path. This
124  * makes sense because unavailability of the lock also signals that some
125  * other backend might be concurrently inserting into the page, thus
126  * reducing our chances to finding an insertion place in this page.
127  */
128 top:
129  fastpath = false;
 130  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
 131  {
132  Page page;
133  BTPageOpaque lpageop;
134 
135  /*
136  * Conditionally acquire exclusive lock on the buffer before doing any
137  * checks. If we don't get the lock, we simply follow slowpath. If we
138  * do get the lock, this ensures that the index state cannot change,
139  * as far as the rightmost part of the index is concerned.
140  */
141  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
142 
143  if (ConditionalLockBuffer(buf))
144  {
145  _bt_checkpage(rel, buf);
146 
147  page = BufferGetPage(buf);
148 
149  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
150 
151  /*
152  * Check if the page is still the rightmost leaf page, has enough
153  * free space to accommodate the new tuple, and the insertion scan
154  * key is strictly greater than the first key on the page.
155  */
156  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
157  !P_IGNORE(lpageop) &&
158  (PageGetFreeSpace(page) > insertstate.itemsz) &&
159  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
160  _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
161  {
162  /*
163  * The right-most block should never have an incomplete split.
164  * But be paranoid and check for it anyway.
165  */
166  Assert(!P_INCOMPLETE_SPLIT(lpageop));
167  fastpath = true;
168  }
169  else
170  {
171  _bt_relbuf(rel, buf);
172 
173  /*
174  * Something did not work out. Just forget about the cached
175  * block and follow the normal path. It might be set again if
176  * the conditions are favourable.
177  */
 178  RelationSetTargetBlock(rel, InvalidBlockNumber);
 179  }
180  }
181  else
182  {
183  ReleaseBuffer(buf);
184 
185  /*
186  * If someone's holding a lock, it's likely to change anyway, so
187  * don't try again until we get an updated rightmost leaf.
188  */
 189  RelationSetTargetBlock(rel, InvalidBlockNumber);
 190  }
191  }
192 
193  if (!fastpath)
194  {
195  /*
196  * Find the first page containing this key. Buffer returned by
197  * _bt_search() is locked in exclusive mode.
198  */
199  stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
200  }
201 
202  insertstate.buf = buf;
203  buf = InvalidBuffer; /* insertstate.buf now owns the buffer */
204 
205  /*
206  * If we're not allowing duplicates, make sure the key isn't already in
207  * the index.
208  *
209  * NOTE: obviously, _bt_check_unique can only detect keys that are already
210  * in the index; so it cannot defend against concurrent insertions of the
211  * same key. We protect against that by means of holding a write lock on
212  * the first page the value could be on, regardless of the value of its
213  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
214  * the same key must acquire a write lock on the same page, so only one
215  * would-be inserter can be making the check at one time. Furthermore,
216  * once we are past the check we hold write locks continuously until we
217  * have performed our insertion, so no later inserter can fail to see our
218  * insertion. (This requires some care in _bt_findinsertloc.)
219  *
220  * If we must wait for another xact, we release the lock while waiting,
221  * and then must start over completely.
222  *
223  * For a partial uniqueness check, we don't wait for the other xact. Just
224  * let the tuple in and return false for possibly non-unique, or true for
225  * definitely unique.
226  */
227  if (checkingunique)
228  {
229  TransactionId xwait;
230  uint32 speculativeToken;
231 
232  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
233  &is_unique, &speculativeToken);
234 
235  if (TransactionIdIsValid(xwait))
236  {
237  /* Have to wait for the other guy ... */
238  _bt_relbuf(rel, insertstate.buf);
239  insertstate.buf = InvalidBuffer;
240 
241  /*
242  * If it's a speculative insertion, wait for it to finish (ie. to
243  * go ahead with the insertion, or kill the tuple). Otherwise
244  * wait for the transaction to finish as usual.
245  */
246  if (speculativeToken)
247  SpeculativeInsertionWait(xwait, speculativeToken);
248  else
249  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
250 
251  /* start over... */
252  if (stack)
253  _bt_freestack(stack);
254  goto top;
255  }
256 
257  /* Uniqueness is established -- restore heap tid as scantid */
258  if (itup_key->heapkeyspace)
259  itup_key->scantid = &itup->t_tid;
260  }
261 
262  if (checkUnique != UNIQUE_CHECK_EXISTING)
263  {
264  OffsetNumber newitemoff;
265 
266  /*
267  * The only conflict predicate locking cares about for indexes is when
268  * an index tuple insert conflicts with an existing lock. We don't
269  * know the actual page we're going to insert to yet because scantid
270  * was not filled in initially, but it's okay to use the "first valid"
271  * page instead. This reasoning also applies to INCLUDE indexes,
272  * whose extra attributes are not considered part of the key space.
273  */
274  CheckForSerializableConflictIn(rel, NULL, insertstate.buf);
275 
276  /*
277  * Do the insertion. Note that insertstate contains cached binary
278  * search bounds established within _bt_check_unique when insertion is
279  * checkingunique.
280  */
281  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
282  stack, heapRel);
283  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
284  itup, newitemoff, false);
285  }
286  else
287  {
288  /* just release the buffer */
289  _bt_relbuf(rel, insertstate.buf);
290  }
291 
292  /* be tidy */
293  if (stack)
294  _bt_freestack(stack);
295  pfree(itup_key);
296 
297  return is_unique;
298 }
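
_bt_doinsert() is the entry point used by the access method's insert callback (see "Referenced by btinsert()" above). The fragment below sketches what such a caller has to do before handing control over: build an IndexTuple from the column datums, stamp it with the heap TID, and free it afterwards. It is a simplified illustration, not the actual btinsert() from nbtree.c; btree_insert_sketch() is a made-up name.

#include "postgres.h"
#include "access/genam.h"
#include "access/itup.h"
#include "access/nbtree.h"
#include "storage/itemptr.h"
#include "utils/rel.h"

/* Sketch of an AM-level caller of _bt_doinsert(); illustration only. */
static bool
btree_insert_sketch(Relation rel, Datum *values, bool *isnull,
					ItemPointer ht_ctid, Relation heapRel,
					IndexUniqueCheck checkUnique)
{
	IndexTuple	itup;
	bool		result;

	/* generate an index tuple from the caller's datums */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;		/* point the entry at the heap tuple */

	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);
	return result;
}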

◆ _bt_findinsertloc()

static OffsetNumber _bt_findinsertloc ( Relation  rel,
BTInsertState  insertstate,
bool  checkingunique,
BTStack  stack,
Relation  heapRel 
)
static

Definition at line 657 of file nbtinsert.c.

References _bt_binsrch_insert(), _bt_check_third_page(), _bt_compare(), _bt_stepright(), _bt_vacuum_one_page(), Assert, BTInsertStateData::bounds_valid, BTMaxItemSize, BTInsertStateData::buf, BufferGetPage, BTScanInsertData::heapkeyspace, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MAX_RANDOM_VALUE, P_HAS_GARBAGE, P_HIKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, random(), BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_doinsert().

662 {
663  BTScanInsert itup_key = insertstate->itup_key;
664  Page page = BufferGetPage(insertstate->buf);
665  BTPageOpaque lpageop;
666 
667  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
668 
669  /* Check 1/3 of a page restriction */
670  if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
671  _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
672  insertstate->itup);
673 
674  Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
675  Assert(!insertstate->bounds_valid || checkingunique);
676  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
677  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
678 
679  if (itup_key->heapkeyspace)
680  {
681  /*
682  * If we're inserting into a unique index, we may have to walk right
683  * through leaf pages to find the one leaf page that we must insert on
684  * to.
685  *
686  * This is needed for checkingunique callers because a scantid was not
687  * used when we called _bt_search(). scantid can only be set after
688  * _bt_check_unique() has checked for duplicates. The buffer
689  * initially stored in insertstate->buf has the page where the first
690  * duplicate key might be found, which isn't always the page that new
691  * tuple belongs on. The heap TID attribute for new tuple (scantid)
692  * could force us to insert on a sibling page, though that should be
693  * very rare in practice.
694  */
695  if (checkingunique)
696  {
697  for (;;)
698  {
699  /*
700  * Does the new tuple belong on this page?
701  *
702  * The earlier _bt_check_unique() call may well have
703  * established a strict upper bound on the offset for the new
704  * item. If it's not the last item of the page (i.e. if there
705  * is at least one tuple on the page that goes after the tuple
706  * we're inserting) then we know that the tuple belongs on
707  * this page. We can skip the high key check.
708  */
709  if (insertstate->bounds_valid &&
710  insertstate->low <= insertstate->stricthigh &&
711  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
712  break;
713 
714  /* Test '<=', not '!=', since scantid is set now */
715  if (P_RIGHTMOST(lpageop) ||
716  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
717  break;
718 
719  _bt_stepright(rel, insertstate, stack);
720  /* Update local state after stepping right */
721  page = BufferGetPage(insertstate->buf);
722  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
723  }
724  }
725 
726  /*
727  * If the target page is full, see if we can obtain enough space by
728  * erasing LP_DEAD items
729  */
730  if (PageGetFreeSpace(page) < insertstate->itemsz &&
731  P_HAS_GARBAGE(lpageop))
732  {
733  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
734  insertstate->bounds_valid = false;
735  }
736  }
737  else
738  {
739  /*----------
740  * This is a !heapkeyspace (version 2 or 3) index. The current page
741  * is the first page that we could insert the new tuple to, but there
742  * may be other pages to the right that we could opt to use instead.
743  *
744  * If the new key is equal to one or more existing keys, we can
745  * legitimately place it anywhere in the series of equal keys. In
746  * fact, if the new key is equal to the page's "high key" we can place
747  * it on the next page. If it is equal to the high key, and there's
748  * not room to insert the new tuple on the current page without
749  * splitting, then we move right hoping to find more free space and
750  * avoid a split.
751  *
752  * Keep scanning right until we
753  * (a) find a page with enough free space,
754  * (b) reach the last page where the tuple can legally go, or
755  * (c) get tired of searching.
756  * (c) is not flippant; it is important because if there are many
757  * pages' worth of equal keys, it's better to split one of the early
758  * pages than to scan all the way to the end of the run of equal keys
759  * on every insert. We implement "get tired" as a random choice,
760  * since stopping after scanning a fixed number of pages wouldn't work
761  * well (we'd never reach the right-hand side of previously split
762  * pages). The probability of moving right is set at 0.99, which may
763  * seem too high to change the behavior much, but it does an excellent
764  * job of preventing O(N^2) behavior with many equal keys.
765  *----------
766  */
767  while (PageGetFreeSpace(page) < insertstate->itemsz)
768  {
769  /*
770  * Before considering moving right, see if we can obtain enough
771  * space by erasing LP_DEAD items
772  */
773  if (P_HAS_GARBAGE(lpageop))
774  {
775  _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
776  insertstate->bounds_valid = false;
777 
778  if (PageGetFreeSpace(page) >= insertstate->itemsz)
779  break; /* OK, now we have enough space */
780  }
781 
782  /*
783  * Nope, so check conditions (b) and (c) enumerated above
784  *
785  * The earlier _bt_check_unique() call may well have established a
786  * strict upper bound on the offset for the new item. If it's not
787  * the last item of the page (i.e. if there is at least one tuple
788  * on the page that's greater than the tuple we're inserting to)
789  * then we know that the tuple belongs on this page. We can skip
790  * the high key check.
791  */
792  if (insertstate->bounds_valid &&
793  insertstate->low <= insertstate->stricthigh &&
794  insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
795  break;
796 
797  if (P_RIGHTMOST(lpageop) ||
798  _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
799  random() <= (MAX_RANDOM_VALUE / 100))
800  break;
801 
802  _bt_stepright(rel, insertstate, stack);
803  /* Update local state after stepping right */
804  page = BufferGetPage(insertstate->buf);
805  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
806  }
807  }
808 
809  /*
810  * We should now be on the correct page. Find the offset within the page
811  * for the new tuple. (Possibly reusing earlier search bounds.)
812  */
813  Assert(P_RIGHTMOST(lpageop) ||
814  _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
815 
816  return _bt_binsrch_insert(rel, insertstate);
817 }
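
For heapkeyspace (version 4) indexes the loop above reduces to one question per page: now that scantid is set, does the new tuple still belong on this page, or must we step right? The helper below restates that test in isolation; belongs_on_this_page() is an illustrative name, not a function in nbtinsert.c.

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Illustrative restatement of _bt_findinsertloc()'s stay-or-step-right test. */
static bool
belongs_on_this_page(Relation rel, BTInsertState insertstate,
					 Page page, BTPageOpaque lpageop)
{
	/* cached binary-search bounds prove a later item exists on this page */
	if (insertstate->bounds_valid &&
		insertstate->low <= insertstate->stricthigh &&
		insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
		return true;

	/* the rightmost page has no high key, so the tuple cannot go further */
	if (P_RIGHTMOST(lpageop))
		return true;

	/* scantid is set by now, so equality with the high key keeps us here too */
	return _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) <= 0;
}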

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 1789 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

1790 {
1791  Page lpage = BufferGetPage(lbuf);
1792  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1793  Buffer rbuf;
1794  Page rpage;
1795  BTPageOpaque rpageop;
1796  bool was_root;
1797  bool was_only;
1798 
1799  Assert(P_INCOMPLETE_SPLIT(lpageop));
1800 
1801  /* Lock right sibling, the one missing the downlink */
1802  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1803  rpage = BufferGetPage(rbuf);
1804  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1805 
1806  /* Could this be a root split? */
1807  if (!stack)
1808  {
1809  Buffer metabuf;
1810  Page metapg;
1811  BTMetaPageData *metad;
1812 
1813  /* acquire lock on the metapage */
1814  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1815  metapg = BufferGetPage(metabuf);
1816  metad = BTPageGetMeta(metapg);
1817 
1818  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1819 
1820  _bt_relbuf(rel, metabuf);
1821  }
1822  else
1823  was_root = false;
1824 
1825  /* Was this the only page on the level before split? */
1826  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1827 
1828  elog(DEBUG1, "finishing incomplete split of %u/%u",
1829  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1830 
1831  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1832 }
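
Callers of _bt_finish_split() (see the "Referenced by" list above) all follow the same pattern: after locking a page they check P_INCOMPLETE_SPLIT and, if it is set, finish the interrupted split before doing anything else. Because _bt_finish_split() releases the left buffer, the page then has to be re-acquired. The sketch below shows that pattern with a hypothetical helper name; it is not code from nbtinsert.c.

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/*
 * Hypothetical helper (illustration only): lock a page for writing,
 * finishing any incomplete split left behind on it first, as
 * _bt_getstackbuf() and _bt_moveright() do.
 */
static Buffer
lock_page_finishing_splits(Relation rel, BlockNumber blkno, BTStack stack)
{
	for (;;)
	{
		Buffer		buf = _bt_getbuf(rel, blkno, BT_WRITE);
		Page		page = BufferGetPage(buf);
		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		if (!P_INCOMPLETE_SPLIT(opaque))
			return buf;

		/* _bt_finish_split() releases the lock and pin on buf; retry */
		_bt_finish_split(rel, buf, stack);
	}
}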

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack 
)

Definition at line 1849 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_branch_parent().

1850 {
1851  BlockNumber blkno;
1852  OffsetNumber start;
1853 
1854  blkno = stack->bts_blkno;
1855  start = stack->bts_offset;
1856 
1857  for (;;)
1858  {
1859  Buffer buf;
1860  Page page;
1861  BTPageOpaque opaque;
1862 
1863  buf = _bt_getbuf(rel, blkno, BT_WRITE);
1864  page = BufferGetPage(buf);
1865  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1866 
1867  if (P_INCOMPLETE_SPLIT(opaque))
1868  {
1869  _bt_finish_split(rel, buf, stack->bts_parent);
1870  continue;
1871  }
1872 
1873  if (!P_IGNORE(opaque))
1874  {
1875  OffsetNumber offnum,
1876  minoff,
1877  maxoff;
1878  ItemId itemid;
1879  IndexTuple item;
1880 
1881  minoff = P_FIRSTDATAKEY(opaque);
1882  maxoff = PageGetMaxOffsetNumber(page);
1883 
1884  /*
1885  * start = InvalidOffsetNumber means "search the whole page". We
1886  * need this test anyway due to possibility that page has a high
1887  * key now when it didn't before.
1888  */
1889  if (start < minoff)
1890  start = minoff;
1891 
1892  /*
1893  * Need this check too, to guard against possibility that page
1894  * split since we visited it originally.
1895  */
1896  if (start > maxoff)
1897  start = OffsetNumberNext(maxoff);
1898 
1899  /*
1900  * These loops will check every item on the page --- but in an
1901  * order that's attuned to the probability of where it actually
1902  * is. Scan to the right first, then to the left.
1903  */
1904  for (offnum = start;
1905  offnum <= maxoff;
1906  offnum = OffsetNumberNext(offnum))
1907  {
1908  itemid = PageGetItemId(page, offnum);
1909  item = (IndexTuple) PageGetItem(page, itemid);
1910 
1911  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
1912  {
1913  /* Return accurate pointer to where link is now */
1914  stack->bts_blkno = blkno;
1915  stack->bts_offset = offnum;
1916  return buf;
1917  }
1918  }
1919 
1920  for (offnum = OffsetNumberPrev(start);
1921  offnum >= minoff;
1922  offnum = OffsetNumberPrev(offnum))
1923  {
1924  itemid = PageGetItemId(page, offnum);
1925  item = (IndexTuple) PageGetItem(page, itemid);
1926 
1927  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
1928  {
1929  /* Return accurate pointer to where link is now */
1930  stack->bts_blkno = blkno;
1931  stack->bts_offset = offnum;
1932  return buf;
1933  }
1934  }
1935  }
1936 
1937  /*
1938  * The item we're looking for moved right at least one page.
1939  */
1940  if (P_RIGHTMOST(opaque))
1941  {
1942  _bt_relbuf(rel, buf);
1943  return InvalidBuffer;
1944  }
1945  blkno = opaque->btpo_next;
1946  start = InvalidOffsetNumber;
1947  _bt_relbuf(rel, buf);
1948  }
1949 }

◆ _bt_insert_parent()

static void _bt_insert_parent ( Relation  rel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  is_root,
bool  is_only 
)
static

Definition at line 1678 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BTPageOpaqueData::btpo, BTreeInnerTupleSetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ERROR, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, BTPageOpaqueData::level, P_HIKEY, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), and RelationGetRelationName.

Referenced by _bt_finish_split(), and _bt_insertonpg().

1684 {
1685  /*
1686  * Here we have to do something Lehman and Yao don't talk about: deal with
1687  * a root split and construction of a new root. If our stack is empty
1688  * then we have just split a node on what had been the root level when we
1689  * descended the tree. If it was still the root then we perform a
1690  * new-root construction. If it *wasn't* the root anymore, search to find
1691  * the next higher level that someone constructed meanwhile, and find the
1692  * right place to insert as for the normal case.
1693  *
1694  * If we have to search for the parent level, we do so by re-descending
1695  * from the root. This is not super-efficient, but it's rare enough not
1696  * to matter.
1697  */
1698  if (is_root)
1699  {
1700  Buffer rootbuf;
1701 
1702  Assert(stack == NULL);
1703  Assert(is_only);
1704  /* create a new root node and update the metapage */
1705  rootbuf = _bt_newroot(rel, buf, rbuf);
1706  /* release the split buffers */
1707  _bt_relbuf(rel, rootbuf);
1708  _bt_relbuf(rel, rbuf);
1709  _bt_relbuf(rel, buf);
1710  }
1711  else
1712  {
1713  BlockNumber bknum = BufferGetBlockNumber(buf);
1714  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1715  Page page = BufferGetPage(buf);
1716  IndexTuple new_item;
1717  BTStackData fakestack;
1718  IndexTuple ritem;
1719  Buffer pbuf;
1720 
1721  if (stack == NULL)
1722  {
1723  BTPageOpaque lpageop;
1724 
1725  elog(DEBUG2, "concurrent ROOT page split");
1726  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1727  /* Find the leftmost page at the next level up */
1728  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1729  NULL);
1730  /* Set up a phony stack entry pointing there */
1731  stack = &fakestack;
1732  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1733  stack->bts_offset = InvalidOffsetNumber;
1734  stack->bts_btentry = InvalidBlockNumber;
1735  stack->bts_parent = NULL;
1736  _bt_relbuf(rel, pbuf);
1737  }
1738 
1739  /* get high key from left, a strict lower bound for new right page */
1740  ritem = (IndexTuple) PageGetItem(page,
1741  PageGetItemId(page, P_HIKEY));
1742 
1743  /* form an index tuple that points at the new right page */
1744  new_item = CopyIndexTuple(ritem);
1745  BTreeInnerTupleSetDownLink(new_item, rbknum);
1746 
1747  /*
1748  * Find the parent buffer and get the parent page.
1749  *
1750  * Oops - if we were moved right then we need to change stack item! We
1751  * want to find parent pointing to where we are, right ? - vadim
1752  * 05/27/97
1753  */
1754  stack->bts_btentry = bknum;
1755  pbuf = _bt_getstackbuf(rel, stack);
1756 
1757  /*
1758  * Now we can unlock the right child. The left child will be unlocked
1759  * by _bt_insertonpg().
1760  */
1761  _bt_relbuf(rel, rbuf);
1762 
1763  /* Check for error only after writing children */
1764  if (pbuf == InvalidBuffer)
1765  elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1766  RelationGetRelationName(rel), bknum, rbknum);
1767 
1768  /* Recursively update the parent */
1769  _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
1770  new_item, stack->bts_offset + 1,
1771  is_only);
1772 
1773  /* be tidy */
1774  pfree(new_item);
1775  }
1776 }

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
OffsetNumber  newitemoff,
bool  split_only_page 
)
static

Definition at line 909 of file nbtinsert.c.

References _bt_findsplitloc(), _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_pgaddtup(), _bt_relbuf(), _bt_split(), _bt_upgrademetapage(), Assert, BlockNumberIsValid, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, xl_btree_metadata::oldest_btpo_xact, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageGetFreeSpace(), PageGetSpecialPointer, PageSetLSN, PANIC, PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationGetTargetBlock, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().

917 {
918  Page page;
919  BTPageOpaque lpageop;
920  OffsetNumber firstright = InvalidOffsetNumber;
921  Size itemsz;
922 
923  page = BufferGetPage(buf);
924  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
925 
926  /* child buffer must be given iff inserting on an internal page */
927  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
928  /* tuple must have appropriate number of attributes */
929  Assert(!P_ISLEAF(lpageop) ||
930  BTreeTupleGetNAtts(itup, rel) ==
 931  IndexRelationGetNumberOfAttributes(rel));
 932  Assert(P_ISLEAF(lpageop) ||
 933  BTreeTupleGetNAtts(itup, rel) <=
 934  IndexRelationGetNumberOfKeyAttributes(rel));
 935 
936  /* The caller should've finished any incomplete splits already. */
937  if (P_INCOMPLETE_SPLIT(lpageop))
938  elog(ERROR, "cannot insert to incompletely split page %u",
 939  BufferGetBlockNumber(buf));
 940 
941  itemsz = IndexTupleSize(itup);
942  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
943  * need to be consistent */
944 
945  /*
946  * Do we need to split the page to fit the item on it?
947  *
948  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
949  * so this comparison is correct even though we appear to be accounting
950  * only for the item and not for its line pointer.
951  */
952  if (PageGetFreeSpace(page) < itemsz)
953  {
954  bool is_root = P_ISROOT(lpageop);
955  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
956  bool newitemonleft;
957  Buffer rbuf;
958 
959  /*
960  * If we're here then a pagesplit is needed. We should never reach
961  * here if we're using the fastpath since we should have checked for
962  * all the required conditions, including the fact that this page has
963  * enough freespace. Note that this routine can in theory deal with
964  * the situation where a NULL stack pointer is passed (that's what
965  * would happen if the fastpath is taken). But that path is much
966  * slower, defeating the very purpose of the optimization. The
967  * following assertion should protect us from any future code changes
968  * that invalidate those assumptions.
969  *
970  * Note that whenever we fail to take the fastpath, we clear the
971  * cached block. Checking for a valid cached block at this point is
972  * enough to decide whether we're in a fastpath or not.
973  */
974  Assert(!(P_ISLEAF(lpageop) &&
 975  BlockNumberIsValid(RelationGetTargetBlock(rel))));
 976 
977  /* Choose the split point */
978  firstright = _bt_findsplitloc(rel, page,
979  newitemoff, itemsz, itup,
980  &newitemonleft);
981 
982  /* split the buffer into left and right halves */
983  rbuf = _bt_split(rel, itup_key, buf, cbuf, firstright, newitemoff,
984  itemsz, itup, newitemonleft);
 985  PredicateLockPageSplit(rel,
 986  BufferGetBlockNumber(buf),
 987  BufferGetBlockNumber(rbuf));
988 
989  /*----------
990  * By here,
991  *
992  * + our target page has been split;
993  * + the original tuple has been inserted;
994  * + we have write locks on both the old (left half)
995  * and new (right half) buffers, after the split; and
996  * + we know the key we want to insert into the parent
997  * (it's the "high key" on the left child page).
998  *
999  * We're ready to do the parent insertion. We need to hold onto the
1000  * locks for the child pages until we locate the parent, but we can
1001  * at least release the lock on the right child before doing the
1002  * actual insertion. The lock on the left child will be released
1003  * last of all by parent insertion, where it is the 'cbuf' of parent
1004  * page.
1005  *----------
1006  */
1007  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
1008  }
1009  else
1010  {
1011  Buffer metabuf = InvalidBuffer;
1012  Page metapg = NULL;
1013  BTMetaPageData *metad = NULL;
1014  OffsetNumber itup_off;
1015  BlockNumber itup_blkno;
1016  BlockNumber cachedBlock = InvalidBlockNumber;
1017 
1018  itup_off = newitemoff;
1019  itup_blkno = BufferGetBlockNumber(buf);
1020 
1021  /*
1022  * If we are doing this insert because we split a page that was the
1023  * only one on its tree level, but was not the root, it may have been
1024  * the "fast root". We need to ensure that the fast root link points
1025  * at or above the current page. We can safely acquire a lock on the
1026  * metapage here --- see comments for _bt_newroot().
1027  */
1028  if (split_only_page)
1029  {
1030  Assert(!P_ISLEAF(lpageop));
1031 
1032  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1033  metapg = BufferGetPage(metabuf);
1034  metad = BTPageGetMeta(metapg);
1035 
1036  if (metad->btm_fastlevel >= lpageop->btpo.level)
1037  {
1038  /* no update wanted */
1039  _bt_relbuf(rel, metabuf);
1040  metabuf = InvalidBuffer;
1041  }
1042  }
1043 
1044  /*
1045  * Every internal page should have exactly one negative infinity item
1046  * at all times. Only _bt_split() and _bt_newroot() should add items
1047  * that become negative infinity items through truncation, since
1048  * they're the only routines that allocate new internal pages. Do not
1049  * allow a retail insertion of a new item at the negative infinity
1050  * offset.
1051  */
1052  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
1053  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
1054  itup_blkno, RelationGetRelationName(rel));
1055 
1056  /* Do the update. No ereport(ERROR) until changes are logged */
 1057  START_CRIT_SECTION();
 1058 
1059  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
1060  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1061  itup_blkno, RelationGetRelationName(rel));
1062 
 1063  MarkBufferDirty(buf);
 1064 
1065  if (BufferIsValid(metabuf))
1066  {
1067  /* upgrade meta-page if needed */
1068  if (metad->btm_version < BTREE_NOVAC_VERSION)
1069  _bt_upgrademetapage(metapg);
1070  metad->btm_fastroot = itup_blkno;
1071  metad->btm_fastlevel = lpageop->btpo.level;
1072  MarkBufferDirty(metabuf);
1073  }
1074 
1075  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
1076  if (BufferIsValid(cbuf))
1077  {
1078  Page cpage = BufferGetPage(cbuf);
1079  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1080 
1081  Assert(P_INCOMPLETE_SPLIT(cpageop));
1082  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1083  MarkBufferDirty(cbuf);
1084  }
1085 
1086  /*
1087  * Cache the block information if we just inserted into the rightmost
1088  * leaf page of the index and it's not the root page. For very small
1089  * index where root is also the leaf, there is no point trying for any
1090  * optimization.
1091  */
1092  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1093  cachedBlock = BufferGetBlockNumber(buf);
1094 
1095  /* XLOG stuff */
1096  if (RelationNeedsWAL(rel))
1097  {
1098  xl_btree_insert xlrec;
1099  xl_btree_metadata xlmeta;
1100  uint8 xlinfo;
1101  XLogRecPtr recptr;
1102 
1103  xlrec.offnum = itup_off;
1104 
1105  XLogBeginInsert();
1106  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1107 
1108  if (P_ISLEAF(lpageop))
1109  xlinfo = XLOG_BTREE_INSERT_LEAF;
1110  else
1111  {
1112  /*
1113  * Register the left child whose INCOMPLETE_SPLIT flag was
1114  * cleared.
1115  */
 1116  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
 1117 
1118  xlinfo = XLOG_BTREE_INSERT_UPPER;
1119  }
1120 
1121  if (BufferIsValid(metabuf))
1122  {
 1123  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
 1124  xlmeta.version = metad->btm_version;
1125  xlmeta.root = metad->btm_root;
1126  xlmeta.level = metad->btm_level;
1127  xlmeta.fastroot = metad->btm_fastroot;
1128  xlmeta.fastlevel = metad->btm_fastlevel;
1129  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
 1130  xlmeta.last_cleanup_num_heap_tuples =
 1131  metad->btm_last_cleanup_num_heap_tuples;
 1132 
 1133  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 1134  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1135 
1136  xlinfo = XLOG_BTREE_INSERT_META;
1137  }
1138 
 1139  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1140  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1141 
1142  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1143 
1144  if (BufferIsValid(metabuf))
1145  {
1146  PageSetLSN(metapg, recptr);
1147  }
1148  if (BufferIsValid(cbuf))
1149  {
1150  PageSetLSN(BufferGetPage(cbuf), recptr);
1151  }
1152 
1153  PageSetLSN(page, recptr);
1154  }
1155 
1156  END_CRIT_SECTION();
1157 
1158  /* release buffers */
1159  if (BufferIsValid(metabuf))
1160  _bt_relbuf(rel, metabuf);
1161  if (BufferIsValid(cbuf))
1162  _bt_relbuf(rel, cbuf);
1163  _bt_relbuf(rel, buf);
1164 
1165  /*
1166  * If we decided to cache the insertion target block, then set it now.
1167  * But before that, check for the height of the tree and don't go for
1168  * the optimization for small indexes. We defer that check to this
1169  * point to ensure that we don't call _bt_getrootheight while holding
1170  * lock on any other block.
1171  *
1172  * We do this after dropping locks on all buffers. So the information
1173  * about whether the insertion block is still the rightmost block or
1174  * not may have changed in between. But we will deal with that during
1175  * next insert operation. No special care is required while setting
1176  * it.
1177  */
1178  if (BlockNumberIsValid(cachedBlock) &&
 1179  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
 1180  RelationSetTargetBlock(rel, cachedBlock);
1181  }
1182 }
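
The non-split branch above hinges on a single comparison: the incoming tuple's size is MAXALIGN'd first, and PageGetFreeSpace()'s result already has one line pointer's worth of space subtracted, so itemsz can be compared against it directly. The following stand-alone sketch models just that decision; MY_MAXALIGN, tuple_fits and the byte counts are illustrative stand-ins rather than backend code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* illustrative stand-in for MAXALIGN with 8-byte maximum alignment */
#define MY_MAXALIGN(len) (((len) + 7) & ~((size_t) 7))

/*
 * Model of the test in _bt_insertonpg: round the tuple size up to the
 * platform alignment, then compare it against a free-space figure that is
 * assumed to already account for the new line pointer.
 */
static bool
tuple_fits(size_t free_space_minus_linp, size_t itemsz)
{
    return MY_MAXALIGN(itemsz) <= free_space_minus_linp;
}

int
main(void)
{
    printf("%d\n", (int) tuple_fits(39, 37));   /* 0: 37 rounds up to 40 -> page must be split */
    printf("%d\n", (int) tuple_fits(40, 37));   /* 1: simple insertion path */
    return 0;
}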

◆ _bt_isequal()

static bool _bt_isequal ( TupleDesc  itupdesc,
BTScanInsert  itup_key,
Page  page,
OffsetNumber  offnum 
)
static

Definition at line 2185 of file nbtinsert.c.

References Assert, DatumGetInt32, FunctionCall2Coll(), i, index_getattr, BTScanInsertData::keysz, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_check_unique().

2187 {
2188  IndexTuple itup;
2189  ScanKey scankey;
2190  int i;
2191 
2192  /* Better be comparing to a non-pivot item */
 2193  Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
 2194  Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page)));
 2195  Assert(itup_key->scantid == NULL);
2196 
2197  scankey = itup_key->scankeys;
2198  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2199 
2200  for (i = 1; i <= itup_key->keysz; i++)
2201  {
2202  AttrNumber attno;
2203  Datum datum;
2204  bool isNull;
2205  int32 result;
2206 
2207  attno = scankey->sk_attno;
2208  Assert(attno == i);
2209  datum = index_getattr(itup, attno, itupdesc, &isNull);
2210 
2211  /* NULLs are never equal to anything */
2212  if (isNull || (scankey->sk_flags & SK_ISNULL))
2213  return false;
2214 
2215  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
2216  scankey->sk_collation,
2217  datum,
2218  scankey->sk_argument));
2219 
2220  if (result != 0)
2221  return false;
2222 
2223  scankey++;
2224  }
2225 
2226  /* if we get here, the keys are equal */
2227  return true;
2228 }
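
The comparison loop above walks the scan keys in attribute order, treats a NULL on either side as unequal, and stops at the first mismatch. A simplified, self-contained model of that control flow is shown below; DemoScanKey, demo_isequal and the integer keys are hypothetical, and the real routine dispatches through the opclass comparison function via FunctionCall2Coll().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical scan key: a column number, a comparison value, and a null flag. */
typedef struct DemoScanKey
{
    int         attno;
    int         argument;
    bool        isnull;
} DemoScanKey;

/*
 * Simplified analogue of _bt_isequal's loop: compare each key column in turn,
 * treat any NULL as "not equal", and stop at the first mismatch.
 */
static bool
demo_isequal(const int *tuple_values, const bool *tuple_nulls,
             const DemoScanKey *keys, int nkeys)
{
    for (int i = 0; i < nkeys; i++)
    {
        int         datum = tuple_values[keys[i].attno - 1];
        bool        isnull = tuple_nulls[keys[i].attno - 1];

        if (isnull || keys[i].isnull)
            return false;       /* NULLs are never equal to anything */
        if (datum != keys[i].argument)
            return false;
    }
    return true;
}

int
main(void)
{
    int         values[2] = {42, 7};
    bool        nulls[2] = {false, false};
    DemoScanKey keys[2] = {{1, 42, false}, {2, 7, false}};

    printf("%d\n", (int) demo_isequal(values, nulls, keys, 2));  /* 1: all keys match */
    keys[1].argument = 8;
    printf("%d\n", (int) demo_isequal(values, nulls, keys, 2));  /* 0: second key differs */
    return 0;
}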

◆ _bt_newroot()

static Buffer _bt_newroot ( Relation  rel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 1970 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), Assert, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeInnerTupleSetDownLink, BTreeTupleGetNAtts, BTreeTupleSetNAtts, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, MarkBufferDirty(), xl_btree_metadata::oldest_btpo_xact, P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NEW, P_NONE, PageAddItem, PageGetItem, PageGetItemId, PageGetSpecialPointer, PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().

1971 {
1972  Buffer rootbuf;
1973  Page lpage,
1974  rootpage;
1975  BlockNumber lbkno,
1976  rbkno;
1977  BlockNumber rootblknum;
1978  BTPageOpaque rootopaque;
1979  BTPageOpaque lopaque;
1980  ItemId itemid;
1981  IndexTuple item;
1982  IndexTuple left_item;
1983  Size left_item_sz;
1984  IndexTuple right_item;
1985  Size right_item_sz;
1986  Buffer metabuf;
1987  Page metapg;
1988  BTMetaPageData *metad;
1989 
1990  lbkno = BufferGetBlockNumber(lbuf);
1991  rbkno = BufferGetBlockNumber(rbuf);
1992  lpage = BufferGetPage(lbuf);
1993  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
1994 
1995  /* get a new root page */
1996  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1997  rootpage = BufferGetPage(rootbuf);
1998  rootblknum = BufferGetBlockNumber(rootbuf);
1999 
2000  /* acquire lock on the metapage */
2001  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2002  metapg = BufferGetPage(metabuf);
2003  metad = BTPageGetMeta(metapg);
2004 
2005  /*
2006  * Create downlink item for left page (old root). Since this will be the
2007  * first item in a non-leaf page, it implicitly has minus-infinity key
2008  * value, so we need not store any actual key in it.
2009  */
2010  left_item_sz = sizeof(IndexTupleData);
2011  left_item = (IndexTuple) palloc(left_item_sz);
2012  left_item->t_info = left_item_sz;
2013  BTreeInnerTupleSetDownLink(left_item, lbkno);
2014  BTreeTupleSetNAtts(left_item, 0);
2015 
2016  /*
2017  * Create downlink item for right page. The key for it is obtained from
2018  * the "high key" position in the left page.
2019  */
2020  itemid = PageGetItemId(lpage, P_HIKEY);
2021  right_item_sz = ItemIdGetLength(itemid);
2022  item = (IndexTuple) PageGetItem(lpage, itemid);
2023  right_item = CopyIndexTuple(item);
2024  BTreeInnerTupleSetDownLink(right_item, rbkno);
2025 
2026  /* NO EREPORT(ERROR) from here till newroot op is logged */
 2027  START_CRIT_SECTION();
 2028 
2029  /* upgrade metapage if needed */
2030  if (metad->btm_version < BTREE_NOVAC_VERSION)
2031  _bt_upgrademetapage(metapg);
2032 
2033  /* set btree special data */
2034  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2035  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2036  rootopaque->btpo_flags = BTP_ROOT;
2037  rootopaque->btpo.level =
2038  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2039  rootopaque->btpo_cycleid = 0;
2040 
2041  /* update metapage data */
2042  metad->btm_root = rootblknum;
2043  metad->btm_level = rootopaque->btpo.level;
2044  metad->btm_fastroot = rootblknum;
2045  metad->btm_fastlevel = rootopaque->btpo.level;
2046 
2047  /*
2048  * Insert the left page pointer into the new root page. The root page is
2049  * the rightmost page on its level so there is no "high key" in it; the
2050  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2051  *
2052  * Note: we *must* insert the two items in item-number order, for the
2053  * benefit of _bt_restore_page().
2054  */
2055  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2056  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2057  false, false) == InvalidOffsetNumber)
2058  elog(PANIC, "failed to add leftkey to new root page"
2059  " while splitting block %u of index \"%s\"",
 2060  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
 2061 
2062  /*
2063  * insert the right page pointer into the new root page.
2064  */
2065  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
 2066  Assert(BTreeTupleGetNAtts(right_item, rel) <=
 2067  IndexRelationGetNumberOfKeyAttributes(rel));
 2068  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
 2069  false, false) == InvalidOffsetNumber)
 2070  elog(PANIC, "failed to add rightkey to new root page"
 2071  " while splitting block %u of index \"%s\"",
 2072  BufferGetBlockNumber(rbuf), RelationGetRelationName(rel));
 2073 
2074  /* Clear the incomplete-split flag in the left child */
2075  Assert(P_INCOMPLETE_SPLIT(lopaque));
2076  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2077  MarkBufferDirty(lbuf);
2078 
2079  MarkBufferDirty(rootbuf);
2080  MarkBufferDirty(metabuf);
2081 
2082  /* XLOG stuff */
2083  if (RelationNeedsWAL(rel))
2084  {
2085  xl_btree_newroot xlrec;
2086  XLogRecPtr recptr;
2087  xl_btree_metadata md;
2088 
2089  xlrec.rootblk = rootblknum;
2090  xlrec.level = metad->btm_level;
2091 
2092  XLogBeginInsert();
2093  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2094 
2095  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
 2096  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
 2097  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 2098 
 2099  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
 2100  md.version = metad->btm_version;
2101  md.root = rootblknum;
2102  md.level = metad->btm_level;
2103  md.fastroot = rootblknum;
2104  md.fastlevel = metad->btm_level;
 2105  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
 2106  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
 2107 
2108  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2109 
2110  /*
2111  * Direct access to page is not good but faster - we should implement
2112  * some new func in page API.
2113  */
 2114  XLogRegisterBufData(0,
 2115  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2116  ((PageHeader) rootpage)->pd_special -
2117  ((PageHeader) rootpage)->pd_upper);
2118 
2119  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2120 
2121  PageSetLSN(lpage, recptr);
2122  PageSetLSN(rootpage, recptr);
2123  PageSetLSN(metapg, recptr);
2124  }
2125 
2126  END_CRIT_SECTION();
2127 
2128  /* done with metapage */
2129  _bt_relbuf(rel, metabuf);
2130 
2131  pfree(left_item);
2132  pfree(right_item);
2133 
2134  return rootbuf;
2135 }
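
The new root built here carries exactly two downlinks: the first points to the old page and stores no key at all (an implicit minus-infinity item), and the second carries the old page's high key, which acts as the separator between the two children. The toy model below shows how a descent would use that layout; DemoRoot, demo_descend and the block numbers are invented for illustration and are not the actual descent logic in _bt_search().

#include <stdio.h>

/*
 * Toy model of the root produced by _bt_newroot: two downlinks, where the
 * first carries no key (implicit minus infinity) and the second carries the
 * old left page's high key as the separator.
 */
typedef struct DemoRoot
{
    int         left_child;     /* block for keys below the separator (minus-infinity downlink) */
    int         right_child;    /* block for keys at or above the separator */
    int         separator;      /* high key copied from the old left page */
} DemoRoot;

static int
demo_descend(const DemoRoot *root, int searchkey)
{
    /* the minus-infinity downlink matches anything below the separator */
    return (searchkey < root->separator) ? root->left_child : root->right_child;
}

int
main(void)
{
    DemoRoot    root = {.left_child = 1, .right_child = 2, .separator = 100};

    printf("%d\n", demo_descend(&root, 5));     /* 1: goes to the old (left) page */
    printf("%d\n", demo_descend(&root, 250));   /* 2: goes to the new right sibling */
    return 0;
}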

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off 
)
static

Definition at line 2153 of file nbtinsert.c.

References BTreeTupleSetNAtts, InvalidOffsetNumber, P_FIRSTDATAKEY, P_ISLEAF, PageAddItem, PageGetSpecialPointer, and IndexTupleData::t_info.

Referenced by _bt_insertonpg(), and _bt_split().

2157 {
 2158  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 2159  IndexTupleData trunctuple;
2160 
2161  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2162  {
2163  trunctuple = *itup;
2164  trunctuple.t_info = sizeof(IndexTupleData);
2165  /* Deliberately zero INDEX_ALT_TID_MASK bits */
2166  BTreeTupleSetNAtts(&trunctuple, 0);
2167  itup = &trunctuple;
2168  itemsize = sizeof(IndexTupleData);
2169  }
2170 
2171  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2172  false, false) == InvalidOffsetNumber)
2173  return false;
2174 
2175  return true;
2176 }
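
The special case at the top of this helper is what keeps the first data item of every internal page key-less: when an item is placed at the first data offset of a non-leaf page, only the tuple header is stored. A rough stand-alone illustration of that idea follows; DemoTuple and demo_prepare_item are made-up names, and the real code keeps IndexTupleData's header while zeroing the INDEX_ALT_TID_MASK bits.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical tuple: a small header field plus a key payload. */
typedef struct DemoTuple
{
    unsigned short t_info;      /* stores the tuple size, as in IndexTupleData */
    int         key;
} DemoTuple;

/*
 * Mirror of the special case in _bt_pgaddtup: when the item becomes the first
 * data item on a non-leaf page, only the header is kept, so the stored tuple
 * acts as a key-less "minus infinity" item.
 */
static DemoTuple
demo_prepare_item(DemoTuple itup, bool is_leaf, bool is_first_data_item)
{
    if (!is_leaf && is_first_data_item)
    {
        DemoTuple   trunc = itup;

        trunc.t_info = sizeof(unsigned short);  /* header-only size */
        trunc.key = 0;                          /* key bits deliberately ignored from here on */
        return trunc;
    }
    return itup;
}

int
main(void)
{
    DemoTuple   t = {.t_info = sizeof(DemoTuple), .key = 123};
    DemoTuple   first = demo_prepare_item(t, false, true);

    printf("stored size: %d, key kept: %d\n", (int) first.t_info, first.key);
    return 0;
}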

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  firstright,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool  newitemonleft 
)
static

Definition at line 1202 of file nbtinsert.c.

References _bt_getbuf(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_truncate(), _bt_vacuum_cycleid(), Assert, BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferGetPageSize, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_split::firstright, BTScanInsertData::heapkeyspace, i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, BTPageOpaqueData::level, xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_NEW, P_RIGHTMOST, PageAddItem, PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPage(), PageRestoreTempPage(), PageSetLSN, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_R, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().

1205 {
1206  Buffer rbuf;
1207  Page origpage;
1208  Page leftpage,
1209  rightpage;
1210  BlockNumber origpagenumber,
1211  rightpagenumber;
1212  BTPageOpaque ropaque,
1213  lopaque,
1214  oopaque;
1215  Buffer sbuf = InvalidBuffer;
1216  Page spage = NULL;
1217  BTPageOpaque sopaque = NULL;
1218  Size itemsz;
1219  ItemId itemid;
1220  IndexTuple item;
1221  OffsetNumber leftoff,
1222  rightoff;
1223  OffsetNumber maxoff;
1224  OffsetNumber i;
1225  bool isleaf;
1226  IndexTuple lefthikey;
1227  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1228  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1229 
1230  /* Acquire a new page to split into */
1231  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1232 
1233  /*
1234  * origpage is the original page to be split. leftpage is a temporary
1235  * buffer that receives the left-sibling data, which will be copied back
1236  * into origpage on success. rightpage is the new page that receives the
1237  * right-sibling data. If we fail before reaching the critical section,
1238  * origpage hasn't been modified and leftpage is only workspace. In
1239  * principle we shouldn't need to worry about rightpage either, because it
1240  * hasn't been linked into the btree page structure; but to avoid leaving
1241  * possibly-confusing junk behind, we are careful to rewrite rightpage as
1242  * zeroes before throwing any error.
1243  */
1244  origpage = BufferGetPage(buf);
1245  leftpage = PageGetTempPage(origpage);
1246  rightpage = BufferGetPage(rbuf);
1247 
1248  origpagenumber = BufferGetBlockNumber(buf);
1249  rightpagenumber = BufferGetBlockNumber(rbuf);
1250 
1251  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1252  /* rightpage was already initialized by _bt_getbuf */
1253 
1254  /*
1255  * Copy the original page's LSN into leftpage, which will become the
1256  * updated version of the page. We need this because XLogInsert will
1257  * examine the LSN and possibly dump it in a page image.
1258  */
1259  PageSetLSN(leftpage, PageGetLSN(origpage));
1260 
1261  /* init btree private data */
1262  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1263  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1264  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1265 
1266  isleaf = P_ISLEAF(oopaque);
1267 
1268  /* if we're splitting this page, it won't be the root when we're done */
1269  /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1270  lopaque->btpo_flags = oopaque->btpo_flags;
1271  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1272  ropaque->btpo_flags = lopaque->btpo_flags;
1273  /* set flag in left page indicating that the right page has no downlink */
1274  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1275  lopaque->btpo_prev = oopaque->btpo_prev;
1276  lopaque->btpo_next = rightpagenumber;
1277  ropaque->btpo_prev = origpagenumber;
1278  ropaque->btpo_next = oopaque->btpo_next;
1279  lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
1280  /* Since we already have write-lock on both pages, ok to read cycleid */
1281  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1282  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1283 
1284  /*
1285  * If the page we're splitting is not the rightmost page at its level in
1286  * the tree, then the first entry on the page is the high key for the
1287  * page. We need to copy that to the right half. Otherwise (meaning the
1288  * rightmost page case), all the items on the right half will be user
1289  * data.
1290  */
1291  rightoff = P_HIKEY;
1292 
1293  if (!P_RIGHTMOST(oopaque))
1294  {
1295  itemid = PageGetItemId(origpage, P_HIKEY);
1296  itemsz = ItemIdGetLength(itemid);
1297  item = (IndexTuple) PageGetItem(origpage, itemid);
1298  Assert(BTreeTupleGetNAtts(item, rel) > 0);
1299  Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
1300  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1301  false, false) == InvalidOffsetNumber)
1302  {
1303  memset(rightpage, 0, BufferGetPageSize(rbuf));
1304  elog(ERROR, "failed to add hikey to the right sibling"
1305  " while splitting block %u of index \"%s\"",
1306  origpagenumber, RelationGetRelationName(rel));
1307  }
1308  rightoff = OffsetNumberNext(rightoff);
1309  }
1310 
1311  /*
1312  * The "high key" for the new left page will be the first key that's going
1313  * to go into the new right page, or possibly a truncated version if this
1314  * is a leaf page split. This might be either the existing data item at
1315  * position firstright, or the incoming tuple.
1316  *
1317  * The high key for the left page is formed using the first item on the
1318  * right page, which may seem to be contrary to Lehman & Yao's approach of
1319  * using the left page's last item as its new high key when splitting on
1320  * the leaf level. It isn't, though: suffix truncation will leave the
1321  * left page's high key fully equal to the last item on the left page when
1322  * two tuples with equal key values (excluding heap TID) enclose the split
1323  * point. It isn't actually necessary for a new leaf high key to be equal
1324  * to the last item on the left for the L&Y "subtree" invariant to hold.
1325  * It's sufficient to make sure that the new leaf high key is strictly
1326  * less than the first item on the right leaf page, and greater than or
1327  * equal to (not necessarily equal to) the last item on the left leaf
1328  * page.
1329  *
1330  * In other words, when suffix truncation isn't possible, L&Y's exact
1331  * approach to leaf splits is taken. (Actually, even that is slightly
1332  * inaccurate. A tuple with all the keys from firstright but the heap TID
1333  * from lastleft will be used as the new high key, since the last left
1334  * tuple could be physically larger despite being opclass-equal in respect
1335  * of all attributes prior to the heap TID attribute.)
1336  */
1337  leftoff = P_HIKEY;
1338  if (!newitemonleft && newitemoff == firstright)
1339  {
1340  /* incoming tuple will become first on right page */
1341  itemsz = newitemsz;
1342  item = newitem;
1343  }
1344  else
1345  {
1346  /* existing item at firstright will become first on right page */
1347  itemid = PageGetItemId(origpage, firstright);
1348  itemsz = ItemIdGetLength(itemid);
1349  item = (IndexTuple) PageGetItem(origpage, itemid);
1350  }
1351 
1352  /*
1353  * Truncate unneeded key and non-key attributes of the high key item
1354  * before inserting it on the left page. This can only happen at the leaf
1355  * level, since in general all pivot tuple values originate from leaf
1356  * level high keys. A pivot tuple in a grandparent page must guide a
1357  * search not only to the correct parent page, but also to the correct
1358  * leaf page.
1359  */
1360  if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
1361  {
1362  IndexTuple lastleft;
1363 
1364  /*
1365  * Determine which tuple will become the last on the left page. This
1366  * is needed to decide how many attributes from the first item on the
1367  * right page must remain in new high key for left page.
1368  */
1369  if (newitemonleft && newitemoff == firstright)
1370  {
1371  /* incoming tuple will become last on left page */
1372  lastleft = newitem;
1373  }
1374  else
1375  {
1376  OffsetNumber lastleftoff;
1377 
1378  /* item just before firstright will become last on left page */
1379  lastleftoff = OffsetNumberPrev(firstright);
1380  Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1381  itemid = PageGetItemId(origpage, lastleftoff);
1382  lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1383  }
1384 
1385  Assert(lastleft != item);
1386  lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
1387  itemsz = IndexTupleSize(lefthikey);
1388  itemsz = MAXALIGN(itemsz);
1389  }
1390  else
1391  lefthikey = item;
1392 
1393  Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
1394  Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
1395  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1396  false, false) == InvalidOffsetNumber)
1397  {
1398  memset(rightpage, 0, BufferGetPageSize(rbuf));
1399  elog(ERROR, "failed to add hikey to the left sibling"
1400  " while splitting block %u of index \"%s\"",
1401  origpagenumber, RelationGetRelationName(rel));
1402  }
1403  leftoff = OffsetNumberNext(leftoff);
1404  /* be tidy */
1405  if (lefthikey != item)
1406  pfree(lefthikey);
1407 
1408  /*
1409  * Now transfer all the data items to the appropriate page.
1410  *
1411  * Note: we *must* insert at least the right page's items in item-number
1412  * order, for the benefit of _bt_restore_page().
1413  */
1414  maxoff = PageGetMaxOffsetNumber(origpage);
1415 
1416  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1417  {
1418  itemid = PageGetItemId(origpage, i);
1419  itemsz = ItemIdGetLength(itemid);
1420  item = (IndexTuple) PageGetItem(origpage, itemid);
1421 
1422  /* does new item belong before this one? */
1423  if (i == newitemoff)
1424  {
1425  if (newitemonleft)
1426  {
1427  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1428  {
1429  memset(rightpage, 0, BufferGetPageSize(rbuf));
1430  elog(ERROR, "failed to add new item to the left sibling"
1431  " while splitting block %u of index \"%s\"",
1432  origpagenumber, RelationGetRelationName(rel));
1433  }
1434  leftoff = OffsetNumberNext(leftoff);
1435  }
1436  else
1437  {
1438  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1439  {
1440  memset(rightpage, 0, BufferGetPageSize(rbuf));
1441  elog(ERROR, "failed to add new item to the right sibling"
1442  " while splitting block %u of index \"%s\"",
1443  origpagenumber, RelationGetRelationName(rel));
1444  }
1445  rightoff = OffsetNumberNext(rightoff);
1446  }
1447  }
1448 
1449  /* decide which page to put it on */
1450  if (i < firstright)
1451  {
1452  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1453  {
1454  memset(rightpage, 0, BufferGetPageSize(rbuf));
1455  elog(ERROR, "failed to add old item to the left sibling"
1456  " while splitting block %u of index \"%s\"",
1457  origpagenumber, RelationGetRelationName(rel));
1458  }
1459  leftoff = OffsetNumberNext(leftoff);
1460  }
1461  else
1462  {
1463  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1464  {
1465  memset(rightpage, 0, BufferGetPageSize(rbuf));
1466  elog(ERROR, "failed to add old item to the right sibling"
1467  " while splitting block %u of index \"%s\"",
1468  origpagenumber, RelationGetRelationName(rel));
1469  }
1470  rightoff = OffsetNumberNext(rightoff);
1471  }
1472  }
1473 
1474  /* cope with possibility that newitem goes at the end */
1475  if (i <= newitemoff)
1476  {
1477  /*
1478  * Can't have newitemonleft here; that would imply we were told to put
1479  * *everything* on the left page, which cannot fit (if it could, we'd
1480  * not be splitting the page).
1481  */
1482  Assert(!newitemonleft);
1483  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1484  {
1485  memset(rightpage, 0, BufferGetPageSize(rbuf));
1486  elog(ERROR, "failed to add new item to the right sibling"
1487  " while splitting block %u of index \"%s\"",
1488  origpagenumber, RelationGetRelationName(rel));
1489  }
1490  rightoff = OffsetNumberNext(rightoff);
1491  }
1492 
1493  /*
1494  * We have to grab the right sibling (if any) and fix the prev pointer
1495  * there. We are guaranteed that this is deadlock-free since no other
1496  * writer will be holding a lock on that page and trying to move left, and
1497  * all readers release locks on a page before trying to fetch its
1498  * neighbors.
1499  */
1500 
1501  if (!P_RIGHTMOST(oopaque))
1502  {
1503  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1504  spage = BufferGetPage(sbuf);
1505  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1506  if (sopaque->btpo_prev != origpagenumber)
1507  {
1508  memset(rightpage, 0, BufferGetPageSize(rbuf));
1509  elog(ERROR, "right sibling's left-link doesn't match: "
1510  "block %u links to %u instead of expected %u in index \"%s\"",
1511  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
 1512  RelationGetRelationName(rel));
 1513  }
1514 
1515  /*
1516  * Check to see if we can set the SPLIT_END flag in the right-hand
1517  * split page; this can save some I/O for vacuum since it need not
1518  * proceed to the right sibling. We can set the flag if the right
1519  * sibling has a different cycleid: that means it could not be part of
1520  * a group of pages that were all split off from the same ancestor
1521  * page. If you're confused, imagine that page A splits to A B and
1522  * then again, yielding A C B, while vacuum is in progress. Tuples
1523  * originally in A could now be in either B or C, hence vacuum must
1524  * examine both pages. But if D, our right sibling, has a different
1525  * cycleid then it could not contain any tuples that were in A when
1526  * the vacuum started.
1527  */
1528  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1529  ropaque->btpo_flags |= BTP_SPLIT_END;
1530  }
1531 
1532  /*
1533  * Right sibling is locked, new siblings are prepared, but original page
1534  * is not updated yet.
1535  *
1536  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1537  * not starting the critical section till here because we haven't been
1538  * scribbling on the original page yet; see comments above.
1539  */
 1540  START_CRIT_SECTION();
 1541 
1542  /*
1543  * By here, the original data page has been split into two new halves, and
1544  * these are correct. The algorithm requires that the left page never
1545  * move during a split, so we copy the new left page back on top of the
1546  * original. Note that this is not a waste of time, since we also require
1547  * (in the page management code) that the center of a page always be
1548  * clean, and the most efficient way to guarantee this is just to compact
1549  * the data by reinserting it into a new left page. (XXX the latter
1550  * comment is probably obsolete; but in any case it's good to not scribble
1551  * on the original page until we enter the critical section.)
1552  *
1553  * We need to do this before writing the WAL record, so that XLogInsert
1554  * can WAL log an image of the page if necessary.
1555  */
1556  PageRestoreTempPage(leftpage, origpage);
1557  /* leftpage, lopaque must not be used below here */
 1558 
 1559  MarkBufferDirty(buf);
1560  MarkBufferDirty(rbuf);
1561 
1562  if (!P_RIGHTMOST(ropaque))
1563  {
1564  sopaque->btpo_prev = rightpagenumber;
1565  MarkBufferDirty(sbuf);
1566  }
1567 
1568  /*
1569  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1570  * a split.
1571  */
1572  if (!isleaf)
1573  {
1574  Page cpage = BufferGetPage(cbuf);
1575  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1576 
1577  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1578  MarkBufferDirty(cbuf);
1579  }
1580 
1581  /* XLOG stuff */
1582  if (RelationNeedsWAL(rel))
1583  {
1584  xl_btree_split xlrec;
1585  uint8 xlinfo;
1586  XLogRecPtr recptr;
1587 
1588  xlrec.level = ropaque->btpo.level;
1589  xlrec.firstright = firstright;
1590  xlrec.newitemoff = newitemoff;
1591 
1592  XLogBeginInsert();
1593  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
 1594 
 1595  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1596  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
 1597  /* Log the right sibling, because we've changed its prev-pointer. */
 1598  if (!P_RIGHTMOST(ropaque))
 1599  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
 1600  if (BufferIsValid(cbuf))
 1601  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1602 
1603  /*
1604  * Log the new item, if it was inserted on the left page. (If it was
1605  * put on the right page, we don't need to explicitly WAL log it
1606  * because it's included with all the other items on the right page.)
1607  * Show the new item as belonging to the left page buffer, so that it
1608  * is not stored if XLogInsert decides it needs a full-page image of
1609  * the left page. We store the offset anyway, though, to support
1610  * archive compression of these records.
1611  */
1612  if (newitemonleft)
1613  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1614 
1615  /* Log the left page's new high key */
1616  itemid = PageGetItemId(origpage, P_HIKEY);
1617  item = (IndexTuple) PageGetItem(origpage, itemid);
1618  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1619 
1620  /*
1621  * Log the contents of the right page in the format understood by
1622  * _bt_restore_page(). The whole right page will be recreated.
1623  *
1624  * Direct access to page is not good but faster - we should implement
1625  * some new func in page API. Note we only store the tuples
1626  * themselves, knowing that they were inserted in item-number order
1627  * and so the item pointers can be reconstructed. See comments for
1628  * _bt_restore_page().
1629  */
 1630  XLogRegisterBufData(1,
 1631  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1632  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1633 
1634  xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
1635  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1636 
1637  PageSetLSN(origpage, recptr);
1638  PageSetLSN(rightpage, recptr);
1639  if (!P_RIGHTMOST(ropaque))
1640  {
1641  PageSetLSN(spage, recptr);
1642  }
1643  if (!isleaf)
1644  {
1645  PageSetLSN(BufferGetPage(cbuf), recptr);
1646  }
1647  }
1648 
1649  END_CRIT_SECTION();
1650 
1651  /* release the old right sibling */
1652  if (!P_RIGHTMOST(ropaque))
1653  _bt_relbuf(rel, sbuf);
1654 
1655  /* release the child */
1656  if (!isleaf)
1657  _bt_relbuf(rel, cbuf);
1658 
1659  /* split's done */
1660  return rbuf;
1661 }
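
The core of the routine is the distribution loop: existing items at offsets below firstright stay on the left page, everything from firstright onward goes to the right page, and the incoming tuple is slotted in according to newitemonleft. The sketch below models only that bookkeeping with plain integers; demo_distribute and the sample values are hypothetical, and high keys, suffix truncation and WAL logging are left out.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy version of _bt_split's distribution loop.  Offsets are 1-based like
 * OffsetNumber; items[off - 1] is the existing item at offset off.
 */
static void
demo_distribute(const int *items, int nitems, int firstright,
                int newitem, int newitemoff, bool newitemonleft)
{
    printf("left :");
    for (int off = 1; off <= nitems; off++)
    {
        if (off == newitemoff && newitemonleft)
            printf(" [%d]", newitem);           /* incoming tuple placed on the left page */
        if (off < firstright)
            printf(" %d", items[off - 1]);
    }
    printf("\nright:");
    for (int off = 1; off <= nitems; off++)
    {
        if (off == newitemoff && !newitemonleft)
            printf(" [%d]", newitem);           /* incoming tuple placed on the right page */
        if (off >= firstright)
            printf(" %d", items[off - 1]);
    }
    if (newitemoff > nitems)
        printf(" [%d]", newitem);               /* newitem goes at the very end (right page) */
    printf("\n");
}

int
main(void)
{
    int         items[6] = {10, 20, 30, 40, 50, 60};

    /* split before offset 4; incoming key 35 becomes the last item on the left */
    demo_distribute(items, 6, 4, 35, 4, true);
    return 0;
}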

◆ _bt_stepright()

static void _bt_stepright ( Relation  rel,
BTInsertState  insertstate,
BTStack  stack 
)
static

Definition at line 832 of file nbtinsert.c.

References _bt_finish_split(), _bt_relandgetbuf(), _bt_relbuf(), BTInsertStateData::bounds_valid, BT_WRITE, BTPageOpaqueData::btpo_next, BTInsertStateData::buf, BufferGetPage, elog, ERROR, InvalidBuffer, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_findinsertloc().

833 {
834  Page page;
835  BTPageOpaque lpageop;
836  Buffer rbuf;
837  BlockNumber rblkno;
838 
839  page = BufferGetPage(insertstate->buf);
840  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
841 
842  rbuf = InvalidBuffer;
843  rblkno = lpageop->btpo_next;
844  for (;;)
845  {
846  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
847  page = BufferGetPage(rbuf);
848  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
849 
850  /*
851  * If this page was incompletely split, finish the split now. We do
852  * this while holding a lock on the left sibling, which is not good
853  * because finishing the split could be a fairly lengthy operation.
854  * But this should happen very seldom.
855  */
856  if (P_INCOMPLETE_SPLIT(lpageop))
857  {
858  _bt_finish_split(rel, rbuf, stack);
859  rbuf = InvalidBuffer;
860  continue;
861  }
862 
863  if (!P_IGNORE(lpageop))
864  break;
865  if (P_RIGHTMOST(lpageop))
866  elog(ERROR, "fell off the end of index \"%s\"",
 867  RelationGetRelationName(rel));
 868 
869  rblkno = lpageop->btpo_next;
870  }
871  /* rbuf locked; unlock buf, update state for caller */
872  _bt_relbuf(rel, insertstate->buf);
873  insertstate->buf = rbuf;
874  insertstate->bounds_valid = false;
875 }
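
This is the usual nbtree move-right pattern in miniature: follow btpo_next, finish any incomplete split that is found, skip pages that are marked ignorable, and treat running off the rightmost page as an error. A simplified, self-contained rendering of the skip loop follows; DemoPage and demo_step_right are invented for illustration, and the incomplete-split handling and buffer locking are omitted.

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical sibling chain: next links rightward (-1 means rightmost), and
 * "ignore" marks half-dead or deleted pages that must be skipped, as in the
 * loop above.
 */
typedef struct DemoPage
{
    int         next;
    bool        ignore;
} DemoPage;

static int
demo_step_right(const DemoPage *pages, int blkno)
{
    int         rblkno = pages[blkno].next;

    for (;;)
    {
        if (rblkno < 0)
        {
            fprintf(stderr, "fell off the end of the level\n");
            return -1;
        }
        if (!pages[rblkno].ignore)
            return rblkno;      /* found a live right sibling */
        rblkno = pages[rblkno].next;
    }
}

int
main(void)
{
    /* block 0 -> block 1 (ignored) -> block 2 (live, rightmost) */
    DemoPage    pages[3] = {{1, false}, {2, true}, {-1, false}};

    printf("stepped to block %d\n", demo_step_right(pages, 0));    /* 2 */
    return 0;
}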

◆ _bt_vacuum_one_page()

static void _bt_vacuum_one_page ( Relation  rel,
Buffer  buffer,
Relation  heapRel 
)
static

Definition at line 2238 of file nbtinsert.c.

References _bt_delitems_delete(), Assert, BufferGetPage, ItemIdIsDead, MaxOffsetNumber, OffsetNumberNext, P_FIRSTDATAKEY, P_ISLEAF, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc().

2239 {
2240  OffsetNumber deletable[MaxOffsetNumber];
2241  int ndeletable = 0;
2242  OffsetNumber offnum,
2243  minoff,
2244  maxoff;
2245  Page page = BufferGetPage(buffer);
 2246  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 2247 
2248  Assert(P_ISLEAF(opaque));
2249 
2250  /*
2251  * Scan over all items to see which ones need to be deleted according to
2252  * LP_DEAD flags.
2253  */
2254  minoff = P_FIRSTDATAKEY(opaque);
2255  maxoff = PageGetMaxOffsetNumber(page);
2256  for (offnum = minoff;
2257  offnum <= maxoff;
2258  offnum = OffsetNumberNext(offnum))
2259  {
2260  ItemId itemId = PageGetItemId(page, offnum);
2261 
2262  if (ItemIdIsDead(itemId))
2263  deletable[ndeletable++] = offnum;
2264  }
2265 
2266  if (ndeletable > 0)
2267  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2268 
2269  /*
2270  * Note: if we didn't find any LP_DEAD items, then the page's
2271  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2272  * separate write to clear it, however. We will clear it when we split
2273  * the page.
2274  */
2275 }
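
The pattern here is a single pass that collects the offsets of all LP_DEAD line pointers and then hands the whole batch to _bt_delitems_delete(). A stand-alone sketch of that collect-then-delete pattern is shown below; DEMO_MAXOFF, the dead[] array and demo_batch_delete() are stand-ins for the real page and for _bt_delitems_delete().

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAXOFF 8

/* Stand-in for the batch delete that _bt_delitems_delete performs. */
static void
demo_batch_delete(const int *offsets, int n)
{
    printf("deleting %d item(s):", n);
    for (int i = 0; i < n; i++)
        printf(" %d", offsets[i]);
    printf("\n");
}

int
main(void)
{
    bool        dead[DEMO_MAXOFF + 1] = {false};    /* 1-based, like line pointers */
    int         deletable[DEMO_MAXOFF];
    int         ndeletable = 0;

    dead[3] = dead[6] = true;   /* pretend these line pointers are LP_DEAD */

    /* scan once, remembering every dead offset */
    for (int offnum = 1; offnum <= DEMO_MAXOFF; offnum++)
    {
        if (dead[offnum])
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
        demo_batch_delete(deletable, ndeletable);
    return 0;
}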