PostgreSQL Source Code  git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
#include "utils/tqual.h"
Include dependency graph for nbtinsert.c:

Go to the source code of this file.

Data Structures

struct  FindSplitData
 

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static Buffer _bt_newroot (Relation rel, Buffer lbuf, Buffer rbuf)
 
static TransactionId _bt_check_unique (Relation rel, IndexTuple itup, Relation heapRel, Buffer buf, OffsetNumber offset, ScanKey itup_scankey, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static void _bt_findinsertloc (Relation rel, Buffer *bufptr, OffsetNumber *offsetptr, int keysz, ScanKey scankey, IndexTuple newtup, BTStack stack, Relation heapRel)
 
static void _bt_insertonpg (Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft)
 
static void _bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
 
static OffsetNumber _bt_findsplitloc (Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, bool *newitemonleft)
 
static void _bt_checksplitloc (FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int dataitemstoleft, Size firstoldonrightsz)
 
static bool _bt_pgaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
 
static bool _bt_isequal (TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey)
 
static void _bt_vacuum_one_page (Relation rel, Buffer buffer, Relation heapRel)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, int access)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 30 of file nbtinsert.c.

Referenced by _bt_insertonpg().

Function Documentation

◆ _bt_check_unique()

static TransactionId _bt_check_unique ( Relation  rel,
IndexTuple  itup,
Relation  heapRel,
Buffer  buf,
OffsetNumber  offset,
ScanKey  itup_scankey,
IndexUniqueCheck  checkUnique,
bool is_unique,
uint32 speculativeToken 
)
static

Definition at line 341 of file nbtinsert.c.

References _bt_isequal(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BufferGetPage, BuildIndexValueDescription(), CheckForSerializableConflictIn(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), heap_hot_search(), index_deform_tuple(), INDEX_MAX_KEYS, IndexRelationGetNumberOfKeyAttributes, InitDirtySnapshot, InvalidBuffer, InvalidTransactionId, ItemIdIsDead, ItemIdMarkDead, ItemPointerCompare(), MarkBufferDirtyHint(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, RelationGetRelationName, SnapshotSelf, SnapshotData::speculativeToken, IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_PARTIAL, values, SnapshotData::xmax, and SnapshotData::xmin.

Referenced by _bt_doinsert().

345 {
346  TupleDesc itupdesc = RelationGetDescr(rel);
347  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
348  SnapshotData SnapshotDirty;
349  OffsetNumber maxoff;
350  Page page;
351  BTPageOpaque opaque;
352  Buffer nbuf = InvalidBuffer;
353  bool found = false;
354 
355  /* Assume unique until we find a duplicate */
356  *is_unique = true;
357 
358  InitDirtySnapshot(SnapshotDirty);
359 
360  page = BufferGetPage(buf);
361  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362  maxoff = PageGetMaxOffsetNumber(page);
363 
364  /*
365  * Scan over all equal tuples, looking for live conflicts.
366  */
367  for (;;)
368  {
369  ItemId curitemid;
370  IndexTuple curitup;
371  BlockNumber nblkno;
372 
373  /*
374  * make sure the offset points to an actual item before trying to
375  * examine it...
376  */
377  if (offset <= maxoff)
378  {
379  curitemid = PageGetItemId(page, offset);
380 
381  /*
382  * We can skip items that are marked killed.
383  *
384  * Formerly, we applied _bt_isequal() before checking the kill
385  * flag, so as to fall out of the item loop as soon as possible.
386  * However, in the presence of heavy update activity an index may
387  * contain many killed items with the same key; running
388  * _bt_isequal() on each killed item gets expensive. Furthermore
389  * it is likely that the non-killed version of each key appears
390  * first, so that we didn't actually get to exit any sooner
391  * anyway. So now we just advance over killed items as quickly as
392  * we can. We only apply _bt_isequal() when we get to a non-killed
393  * item or the end of the page.
394  */
395  if (!ItemIdIsDead(curitemid))
396  {
397  ItemPointerData htid;
398  bool all_dead;
399 
400  /*
401  * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
402  * how we handling NULLs - and so we must not use _bt_compare
403  * in real comparison, but only for ordering/finding items on
404  * pages. - vadim 03/24/97
405  */
406  if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
407  break; /* we're past all the equal tuples */
408 
409  /* okay, we gotta fetch the heap tuple ... */
410  curitup = (IndexTuple) PageGetItem(page, curitemid);
411  htid = curitup->t_tid;
412 
413  /*
414  * If we are doing a recheck, we expect to find the tuple we
415  * are rechecking. It's not a duplicate, but we have to keep
416  * scanning.
417  */
418  if (checkUnique == UNIQUE_CHECK_EXISTING &&
419  ItemPointerCompare(&htid, &itup->t_tid) == 0)
420  {
421  found = true;
422  }
423 
424  /*
425  * We check the whole HOT-chain to see if there is any tuple
426  * that satisfies SnapshotDirty. This is necessary because we
427  * have just a single index entry for the entire chain.
428  */
429  else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
430  &all_dead))
431  {
432  TransactionId xwait;
433 
434  /*
435  * It is a duplicate. If we are only doing a partial
436  * check, then don't bother checking if the tuple is being
437  * updated in another transaction. Just return the fact
438  * that it is a potential conflict and leave the full
439  * check till later.
440  */
441  if (checkUnique == UNIQUE_CHECK_PARTIAL)
442  {
443  if (nbuf != InvalidBuffer)
444  _bt_relbuf(rel, nbuf);
445  *is_unique = false;
446  return InvalidTransactionId;
447  }
448 
449  /*
450  * If this tuple is being updated by other transaction
451  * then we have to wait for its commit/abort.
452  */
453  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
454  SnapshotDirty.xmin : SnapshotDirty.xmax;
455 
456  if (TransactionIdIsValid(xwait))
457  {
458  if (nbuf != InvalidBuffer)
459  _bt_relbuf(rel, nbuf);
460  /* Tell _bt_doinsert to wait... */
461  *speculativeToken = SnapshotDirty.speculativeToken;
462  return xwait;
463  }
464 
465  /*
466  * Otherwise we have a definite conflict. But before
467  * complaining, look to see if the tuple we want to insert
468  * is itself now committed dead --- if so, don't complain.
469  * This is a waste of time in normal scenarios but we must
470  * do it to support CREATE INDEX CONCURRENTLY.
471  *
472  * We must follow HOT-chains here because during
473  * concurrent index build, we insert the root TID though
474  * the actual tuple may be somewhere in the HOT-chain.
475  * While following the chain we might not stop at the
476  * exact tuple which triggered the insert, but that's OK
477  * because if we find a live tuple anywhere in this chain,
478  * we have a unique key conflict. The other live tuple is
479  * not part of this chain because it had a different index
480  * entry.
481  */
482  htid = itup->t_tid;
483  if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
484  {
485  /* Normal case --- it's still live */
486  }
487  else
488  {
489  /*
490  * It's been deleted, so no error, and no need to
491  * continue searching
492  */
493  break;
494  }
495 
496  /*
497  * Check for a conflict-in as we would if we were going to
498  * write to this page. We aren't actually going to write,
499  * but we want a chance to report SSI conflicts that would
500  * otherwise be masked by this unique constraint
501  * violation.
502  */
504 
505  /*
506  * This is a definite conflict. Break the tuple down into
507  * datums and report the error. But first, make sure we
508  * release the buffer locks we're holding ---
509  * BuildIndexValueDescription could make catalog accesses,
510  * which in the worst case might touch this same index and
511  * cause deadlocks.
512  */
513  if (nbuf != InvalidBuffer)
514  _bt_relbuf(rel, nbuf);
515  _bt_relbuf(rel, buf);
516 
517  {
519  bool isnull[INDEX_MAX_KEYS];
520  char *key_desc;
521 
523  values, isnull);
524 
525  key_desc = BuildIndexValueDescription(rel, values,
526  isnull);
527 
528  ereport(ERROR,
529  (errcode(ERRCODE_UNIQUE_VIOLATION),
530  errmsg("duplicate key value violates unique constraint \"%s\"",
532  key_desc ? errdetail("Key %s already exists.",
533  key_desc) : 0,
534  errtableconstraint(heapRel,
535  RelationGetRelationName(rel))));
536  }
537  }
538  else if (all_dead)
539  {
540  /*
541  * The conflicting tuple (or whole HOT chain) is dead to
542  * everyone, so we may as well mark the index entry
543  * killed.
544  */
545  ItemIdMarkDead(curitemid);
546  opaque->btpo_flags |= BTP_HAS_GARBAGE;
547 
548  /*
549  * Mark buffer with a dirty hint, since state is not
550  * crucial. Be sure to mark the proper buffer dirty.
551  */
552  if (nbuf != InvalidBuffer)
553  MarkBufferDirtyHint(nbuf, true);
554  else
555  MarkBufferDirtyHint(buf, true);
556  }
557  }
558  }
559 
560  /*
561  * Advance to next tuple to continue checking.
562  */
563  if (offset < maxoff)
564  offset = OffsetNumberNext(offset);
565  else
566  {
567  /* If scankey == hikey we gotta check the next page too */
568  if (P_RIGHTMOST(opaque))
569  break;
570  if (!_bt_isequal(itupdesc, page, P_HIKEY,
571  indnkeyatts, itup_scankey))
572  break;
573  /* Advance to next non-dead page --- there must be one */
574  for (;;)
575  {
576  nblkno = opaque->btpo_next;
577  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
578  page = BufferGetPage(nbuf);
579  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
580  if (!P_IGNORE(opaque))
581  break;
582  if (P_RIGHTMOST(opaque))
583  elog(ERROR, "fell off the end of index \"%s\"",
585  }
586  maxoff = PageGetMaxOffsetNumber(page);
587  offset = P_FIRSTDATAKEY(opaque);
588  }
589  }
590 
591  /*
592  * If we are doing a recheck then we should have found the tuple we are
593  * checking. Otherwise there's something very wrong --- probably, the
594  * index is on a non-immutable expression.
595  */
596  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
597  ereport(ERROR,
598  (errcode(ERRCODE_INTERNAL_ERROR),
599  errmsg("failed to re-find tuple within index \"%s\"",
601  errhint("This may be because of a non-immutable index expression."),
602  errtableconstraint(heapRel,
603  RelationGetRelationName(rel))));
604 
605  if (nbuf != InvalidBuffer)
606  _bt_relbuf(rel, nbuf);
607 
608  return InvalidTransactionId;
609 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
BlockNumber btpo_next
Definition: nbtree.h:58
int errhint(const char *fmt,...)
Definition: elog.c:987
#define P_IGNORE(opaque)
Definition: nbtree.h:163
uint32 TransactionId
Definition: c.h:474
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:860
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3379
#define RelationGetDescr(relation)
Definition: rel.h:433
#define ItemIdMarkDead(itemId)
Definition: itemid.h:178
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey)
Definition: nbtinsert.c:2328
ItemPointerData t_tid
Definition: itup.h:37
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:575
uint32 BlockNumber
Definition: block.h:31
#define ItemIdIsDead(itemId)
Definition: itemid.h:112
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4280
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5246
#define BT_READ
Definition: nbtree.h:286
#define ERROR
Definition: elog.h:43
#define InitDirtySnapshot(snapshotdata)
Definition: tqual.h:103
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
int errdetail(const char *fmt,...)
Definition: elog.c:873
#define InvalidTransactionId
Definition: transam.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:441
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:423
TransactionId xmax
Definition: snapshot.h:69
TransactionId xmin
Definition: snapshot.h:68
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:426
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define SnapshotSelf
Definition: tqual.h:27
#define ereport(elevel, rest)
Definition: elog.h:122
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
uintptr_t Datum
Definition: postgres.h:365
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
uint32 speculativeToken
Definition: snapshot.h:104
#define INDEX_MAX_KEYS
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, bool *all_dead)
Definition: heapam.c:2183
#define P_HIKEY
Definition: nbtree.h:185
static Datum values[MAXATTR]
Definition: bootstrap.c:164
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define elog
Definition: elog.h:219
char * BuildIndexValueDescription(Relation indexRelation, Datum *values, bool *isnull)
Definition: genam.c:178
#define TransactionIdIsValid(xid)
Definition: transam.h:41
uint16 btpo_flags
Definition: nbtree.h:64
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74

◆ _bt_checksplitloc()

static void _bt_checksplitloc ( FindSplitData state,
OffsetNumber  firstoldonright,
bool  newitemonleft,
int  dataitemstoleft,
Size  firstoldonrightsz 
)
static

Definition at line 1727 of file nbtinsert.c.

References FindSplitData::best_delta, FindSplitData::fillfactor, FindSplitData::firstright, FindSplitData::have_split, FindSplitData::is_leaf, FindSplitData::is_rightmost, FindSplitData::leftspace, MAXALIGN, FindSplitData::newitemoff, FindSplitData::newitemonleft, FindSplitData::newitemsz, FindSplitData::olddataitemstotal, and FindSplitData::rightspace.

Referenced by _bt_findsplitloc().

1732 {
1733  int leftfree,
1734  rightfree;
1735  Size firstrightitemsz;
1736  bool newitemisfirstonright;
1737 
1738  /* Is the new item going to be the first item on the right page? */
1739  newitemisfirstonright = (firstoldonright == state->newitemoff
1740  && !newitemonleft);
1741 
1742  if (newitemisfirstonright)
1743  firstrightitemsz = state->newitemsz;
1744  else
1745  firstrightitemsz = firstoldonrightsz;
1746 
1747  /* Account for all the old tuples */
1748  leftfree = state->leftspace - olddataitemstoleft;
1749  rightfree = state->rightspace -
1750  (state->olddataitemstotal - olddataitemstoleft);
1751 
1752  /*
1753  * The first item on the right page becomes the high key of the left page;
1754  * therefore it counts against left space as well as right space. When
1755  * index has included attributes, then those attributes of left page high
1756  * key will be truncated leaving that page with slightly more free space.
1757  * However, that shouldn't affect our ability to find valid split
1758  * location, because anyway split location should exists even without high
1759  * key truncation.
1760  */
1761  leftfree -= firstrightitemsz;
1762 
1763  /* account for the new item */
1764  if (newitemonleft)
1765  leftfree -= (int) state->newitemsz;
1766  else
1767  rightfree -= (int) state->newitemsz;
1768 
1769  /*
1770  * If we are not on the leaf level, we will be able to discard the key
1771  * data from the first item that winds up on the right page.
1772  */
1773  if (!state->is_leaf)
1774  rightfree += (int) firstrightitemsz -
1775  (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
1776 
1777  /*
1778  * If feasible split point, remember best delta.
1779  */
1780  if (leftfree >= 0 && rightfree >= 0)
1781  {
1782  int delta;
1783 
1784  if (state->is_rightmost)
1785  {
1786  /*
1787  * If splitting a rightmost page, try to put (100-fillfactor)% of
1788  * free space on left page. See comments for _bt_findsplitloc.
1789  */
1790  delta = (state->fillfactor * leftfree)
1791  - ((100 - state->fillfactor) * rightfree);
1792  }
1793  else
1794  {
1795  /* Otherwise, aim for equal free space on both sides */
1796  delta = leftfree - rightfree;
1797  }
1798 
1799  if (delta < 0)
1800  delta = -delta;
1801  if (!state->have_split || delta < state->best_delta)
1802  {
1803  state->have_split = true;
1804  state->newitemonleft = newitemonleft;
1805  state->firstright = firstoldonright;
1806  state->best_delta = delta;
1807  }
1808  }
1809 }
bool newitemonleft
Definition: nbtinsert.c:47
int best_delta
Definition: nbtinsert.c:49
bool is_rightmost
Definition: nbtinsert.c:38
int fillfactor
Definition: nbtinsert.c:36
bool have_split
Definition: nbtinsert.c:44
int rightspace
Definition: nbtinsert.c:41
bool is_leaf
Definition: nbtinsert.c:37
Size newitemsz
Definition: nbtinsert.c:35
struct ItemIdData ItemIdData
int olddataitemstotal
Definition: nbtinsert.c:42
OffsetNumber newitemoff
Definition: nbtinsert.c:39
size_t Size
Definition: c.h:433
#define MAXALIGN(LEN)
Definition: c.h:652
OffsetNumber firstright
Definition: nbtinsert.c:48

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 110 of file nbtinsert.c.

References _bt_binsrch(), _bt_check_unique(), _bt_checkpage(), _bt_compare(), _bt_findinsertloc(), _bt_freeskey(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_moveright(), _bt_relbuf(), _bt_search(), Assert, BT_WRITE, buf, BUFFER_LOCK_UNLOCK, BufferGetPage, CheckForSerializableConflictIn(), ConditionalLockBuffer(), IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, LockBuffer(), MAXALIGN, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, ReleaseBuffer(), SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

112 {
113  bool is_unique = false;
114  int indnkeyatts;
115  ScanKey itup_scankey;
116  BTStack stack = NULL;
117  Buffer buf;
118  OffsetNumber offset;
119  bool fastpath;
120 
121  indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
122  Assert(indnkeyatts != 0);
123 
124  /* we need an insertion scan key to do our search, so build one */
125  itup_scankey = _bt_mkscankey(rel, itup);
126 
127  /*
128  * It's very common to have an index on an auto-incremented or
129  * monotonically increasing value. In such cases, every insertion happens
130  * towards the end of the index. We try to optimize that case by caching
131  * the right-most leaf of the index. If our cached block is still the
132  * rightmost leaf, has enough free space to accommodate a new entry and
133  * the insertion key is strictly greater than the first key in this page,
134  * then we can safely conclude that the new key will be inserted in the
135  * cached block. So we simply search within the cached block and insert the
136  * key at the appropriate location. We call it a fastpath.
137  *
138  * Testing has revealed, though, that the fastpath can result in increased
139  * contention on the exclusive-lock on the rightmost leaf page. So we
140  * conditionally check if the lock is available. If it's not available then
141  * we simply abandon the fastpath and take the regular path. This makes
142  * sense because unavailability of the lock also signals that some other
143  * backend might be concurrently inserting into the page, thus reducing our
144  * chances to finding an insertion place in this page.
145  */
146 top:
147  fastpath = false;
148  offset = InvalidOffsetNumber;
150  {
151  Size itemsz;
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change, as
159  * far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170  itemsz = IndexTupleSize(itup);
171  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
172  * but we need to be consistent */
173 
174  /*
175  * Check if the page is still the rightmost leaf page, has enough
176  * free space to accommodate the new tuple, and the insertion
177  * scan key is strictly greater than the first key on the page.
178  */
179  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
180  !P_IGNORE(lpageop) &&
181  (PageGetFreeSpace(page) > itemsz) &&
182  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
183  _bt_compare(rel, indnkeyatts, itup_scankey, page,
184  P_FIRSTDATAKEY(lpageop)) > 0)
185  {
186  /*
187  * The right-most block should never have an incomplete split.
188  * But be paranoid and check for it anyway.
189  */
190  Assert(!P_INCOMPLETE_SPLIT(lpageop));
191  fastpath = true;
192  }
193  else
194  {
195  _bt_relbuf(rel, buf);
196 
197  /*
198  * Something did not work out. Just forget about the cached
199  * block and follow the normal path. It might be set again if
200  * the conditions are favourable.
201  */
203  }
204  }
205  else
206  {
207  ReleaseBuffer(buf);
208 
209  /*
210  * If someone's holding a lock, it's likely to change anyway,
211  * so don't try again until we get an updated rightmost leaf.
212  */
214  }
215  }
216 
217  if (!fastpath)
218  {
219  /* find the first page containing this key */
220  stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE,
221  NULL);
222 
223  /* trade in our read lock for a write lock */
225  LockBuffer(buf, BT_WRITE);
226 
227  /*
228  * If the page was split between the time that we surrendered our read
229  * lock and acquired our write lock, then this page may no longer be
230  * the right place for the key we want to insert. In this case, we
231  * need to move right in the tree. See Lehman and Yao for an
232  * excruciatingly precise description.
233  */
234  buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false,
235  true, stack, BT_WRITE, NULL);
236  }
237 
238  /*
239  * If we're not allowing duplicates, make sure the key isn't already in
240  * the index.
241  *
242  * NOTE: obviously, _bt_check_unique can only detect keys that are already
243  * in the index; so it cannot defend against concurrent insertions of the
244  * same key. We protect against that by means of holding a write lock on
245  * the target page. Any other would-be inserter of the same key must
246  * acquire a write lock on the same target page, so only one would-be
247  * inserter can be making the check at one time. Furthermore, once we are
248  * past the check we hold write locks continuously until we have performed
249  * our insertion, so no later inserter can fail to see our insertion.
250  * (This requires some care in _bt_insertonpg.)
251  *
252  * If we must wait for another xact, we release the lock while waiting,
253  * and then must start over completely.
254  *
255  * For a partial uniqueness check, we don't wait for the other xact. Just
256  * let the tuple in and return false for possibly non-unique, or true for
257  * definitely unique.
258  */
259  if (checkUnique != UNIQUE_CHECK_NO)
260  {
261  TransactionId xwait;
262  uint32 speculativeToken;
263 
264  offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false);
265  xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
266  checkUnique, &is_unique, &speculativeToken);
267 
268  if (TransactionIdIsValid(xwait))
269  {
270  /* Have to wait for the other guy ... */
271  _bt_relbuf(rel, buf);
272 
273  /*
274  * If it's a speculative insertion, wait for it to finish (ie. to
275  * go ahead with the insertion, or kill the tuple). Otherwise
276  * wait for the transaction to finish as usual.
277  */
278  if (speculativeToken)
279  SpeculativeInsertionWait(xwait, speculativeToken);
280  else
281  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
282 
283  /* start over... */
284  if (stack)
285  _bt_freestack(stack);
286  goto top;
287  }
288  }
289 
290  if (checkUnique != UNIQUE_CHECK_EXISTING)
291  {
292  /*
293  * The only conflict predicate locking cares about for indexes is when
294  * an index tuple insert conflicts with an existing lock. Since the
295  * actual location of the insert is hard to predict because of the
296  * random search used to prevent O(N^2) performance when there are
297  * many duplicate entries, we can just use the "first valid" page.
298  * This reasoning also applies to INCLUDE indexes, whose extra
299  * attributes are not considered part of the key space.
300  */
301  CheckForSerializableConflictIn(rel, NULL, buf);
302  /* do the insertion */
303  _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup,
304  stack, heapRel);
305  _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
306  }
307  else
308  {
309  /* just release the buffer */
310  _bt_relbuf(rel, buf);
311  }
312 
313  /* be tidy */
314  if (stack)
315  _bt_freestack(stack);
316  _bt_freeskey(itup_scankey);
317 
318  return is_unique;
319 }
static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, Buffer buf, OffsetNumber offset, ScanKey itup_scankey, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:341
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
#define P_IGNORE(opaque)
Definition: nbtree.h:163
uint32 TransactionId
Definition: c.h:474
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
void _bt_freeskey(ScanKey skey)
Definition: nbtutils.c:166
ItemPointerData t_tid
Definition: itup.h:37
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3309
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
#define RelationGetTargetBlock(relation)
Definition: rel.h:493
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4280
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:578
uint16 OffsetNumber
Definition: off.h:24
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:662
Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:214
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:711
static char * buf
Definition: pg_test_fsync.c:67
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:323
unsigned int uint32
Definition: c.h:325
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:426
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
static void _bt_findinsertloc(Relation rel, Buffer *bufptr, OffsetNumber *offsetptr, int keysz, ScanKey scankey, IndexTuple newtup, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:642
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3572
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:97
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:554
#define Assert(condition)
Definition: c.h:699
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:594
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:500
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define InvalidBlockNumber
Definition: block.h:33
#define MAXALIGN(LEN)
Definition: c.h:652
static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:835
#define TransactionIdIsValid(xid)
Definition: transam.h:41
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:428
ScanKey _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:62
#define BT_WRITE
Definition: nbtree.h:287
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
Pointer Page
Definition: bufpage.h:74
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_findinsertloc()

static void _bt_findinsertloc ( Relation  rel,
Buffer bufptr,
OffsetNumber offsetptr,
int  keysz,
ScanKey  scankey,
IndexTuple  newtup,
BTStack  stack,
Relation  heapRel 
)
static

Definition at line 642 of file nbtinsert.c.

References _bt_binsrch(), _bt_compare(), _bt_finish_split(), _bt_relandgetbuf(), _bt_relbuf(), _bt_vacuum_one_page(), BT_WRITE, BTMaxItemSize, BTPageOpaqueData::btpo_next, buf, BufferGetPage, elog, ereport, errcode(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, MAX_RANDOM_VALUE, MAXALIGN, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetSpecialPointer, random(), and RelationGetRelationName.

Referenced by _bt_doinsert().

650 {
651  Buffer buf = *bufptr;
652  Page page = BufferGetPage(buf);
653  Size itemsz;
654  BTPageOpaque lpageop;
655  bool movedright,
656  vacuumed;
657  OffsetNumber newitemoff;
658  OffsetNumber firstlegaloff = *offsetptr;
659 
660  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
661 
662  itemsz = IndexTupleSize(newtup);
663  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
664  * need to be consistent */
665 
666  /*
667  * Check whether the item can fit on a btree page at all. (Eventually, we
668  * ought to try to apply TOAST methods if not.) We actually need to be
669  * able to fit three items on every page, so restrict any one item to 1/3
670  * the per-page available space. Note that at this point, itemsz doesn't
671  * include the ItemId.
672  *
673  * NOTE: if you change this, see also the similar code in _bt_buildadd().
674  */
675  if (itemsz > BTMaxItemSize(page))
676  ereport(ERROR,
677  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
678  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
679  itemsz, BTMaxItemSize(page),
681  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
682  "Consider a function index of an MD5 hash of the value, "
683  "or use full text indexing."),
684  errtableconstraint(heapRel,
685  RelationGetRelationName(rel))));
686 
687  /*----------
688  * If we will need to split the page to put the item on this page,
689  * check whether we can put the tuple somewhere to the right,
690  * instead. Keep scanning right until we
691  * (a) find a page with enough free space,
692  * (b) reach the last page where the tuple can legally go, or
693  * (c) get tired of searching.
694  * (c) is not flippant; it is important because if there are many
695  * pages' worth of equal keys, it's better to split one of the early
696  * pages than to scan all the way to the end of the run of equal keys
697  * on every insert. We implement "get tired" as a random choice,
698  * since stopping after scanning a fixed number of pages wouldn't work
699  * well (we'd never reach the right-hand side of previously split
700  * pages). Currently the probability of moving right is set at 0.99,
701  * which may seem too high to change the behavior much, but it does an
702  * excellent job of preventing O(N^2) behavior with many equal keys.
703  *----------
704  */
705  movedright = false;
706  vacuumed = false;
707  while (PageGetFreeSpace(page) < itemsz)
708  {
709  Buffer rbuf;
710  BlockNumber rblkno;
711 
712  /*
713  * before considering moving right, see if we can obtain enough space
714  * by erasing LP_DEAD items
715  */
716  if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
717  {
718  _bt_vacuum_one_page(rel, buf, heapRel);
719 
720  /*
721  * remember that we vacuumed this page, because that makes the
722  * hint supplied by the caller invalid
723  */
724  vacuumed = true;
725 
726  if (PageGetFreeSpace(page) >= itemsz)
727  break; /* OK, now we have enough space */
728  }
729 
730  /*
731  * nope, so check conditions (b) and (c) enumerated above
732  */
733  if (P_RIGHTMOST(lpageop) ||
734  _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
735  random() <= (MAX_RANDOM_VALUE / 100))
736  break;
737 
738  /*
739  * step right to next non-dead page
740  *
741  * must write-lock that page before releasing write lock on current
742  * page; else someone else's _bt_check_unique scan could fail to see
743  * our insertion. write locks on intermediate dead pages won't do
744  * because we don't know when they will get de-linked from the tree.
745  */
746  rbuf = InvalidBuffer;
747 
748  rblkno = lpageop->btpo_next;
749  for (;;)
750  {
751  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
752  page = BufferGetPage(rbuf);
753  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
754 
755  /*
756  * If this page was incompletely split, finish the split now. We
757  * do this while holding a lock on the left sibling, which is not
758  * good because finishing the split could be a fairly lengthy
759  * operation. But this should happen very seldom.
760  */
761  if (P_INCOMPLETE_SPLIT(lpageop))
762  {
763  _bt_finish_split(rel, rbuf, stack);
764  rbuf = InvalidBuffer;
765  continue;
766  }
767 
768  if (!P_IGNORE(lpageop))
769  break;
770  if (P_RIGHTMOST(lpageop))
771  elog(ERROR, "fell off the end of index \"%s\"",
773 
774  rblkno = lpageop->btpo_next;
775  }
776  _bt_relbuf(rel, buf);
777  buf = rbuf;
778  movedright = true;
779  vacuumed = false;
780  }
781 
782  /*
783  * Now we are on the right page, so find the insert position. If we moved
784  * right at all, we know we should insert at the start of the page. If we
785  * didn't move right, we can use the firstlegaloff hint if the caller
786  * supplied one, unless we vacuumed the page which might have moved tuples
787  * around making the hint invalid. If we didn't move right or can't use
788  * the hint, find the position by searching.
789  */
790  if (movedright)
791  newitemoff = P_FIRSTDATAKEY(lpageop);
792  else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
793  newitemoff = firstlegaloff;
794  else
795  newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
796 
797  *bufptr = buf;
798  *offsetptr = newitemoff;
799 }
BlockNumber btpo_next
Definition: nbtree.h:58
int errhint(const char *fmt,...)
Definition: elog.c:987
#define P_IGNORE(opaque)
Definition: nbtree.h:163
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:860
long random(void)
Definition: random.c:22
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
#define InvalidBuffer
Definition: buf.h:25
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:164
int errcode(int sqlerrcode)
Definition: elog.c:575
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:578
uint16 OffsetNumber
Definition: off.h:24
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5246
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1937
#define ERROR
Definition: elog.h:43
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
Definition: nbtinsert.c:2383
static char * buf
Definition: pg_test_fsync.c:67
#define MAX_RANDOM_VALUE
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:323
#define RelationGetRelationName(relation)
Definition: rel.h:441
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define ereport(elevel, rest)
Definition: elog.h:122
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define MAXALIGN(LEN)
Definition: c.h:652
#define BTMaxItemSize(page)
Definition: nbtree.h:126
#define P_HIKEY
Definition: nbtree.h:185
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define elog
Definition: elog.h:219
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:428
#define BT_WRITE
Definition: nbtree.h:287
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
Pointer Page
Definition: bufpage.h:74
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_findsplitloc()

static OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  page,
OffsetNumber  newitemoff,
Size  newitemsz,
bool newitemonleft 
)
static

Definition at line 1579 of file nbtinsert.c.

References _bt_checksplitloc(), FindSplitData::best_delta, BTREE_DEFAULT_FILLFACTOR, BTREE_NONLEAF_FILLFACTOR, elog, ERROR, FindSplitData::fillfactor, FindSplitData::firstright, FindSplitData::have_split, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, MAXALIGN, FindSplitData::newitemoff, FindSplitData::newitemonleft, FindSplitData::newitemsz, OffsetNumberNext, FindSplitData::olddataitemstotal, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, RelationGetFillFactor, RelationGetRelationName, FindSplitData::rightspace, and SizeOfPageHeaderData.

Referenced by _bt_insertonpg().

1584 {
1585  BTPageOpaque opaque;
1586  OffsetNumber offnum;
1587  OffsetNumber maxoff;
1588  ItemId itemid;
1590  int leftspace,
1591  rightspace,
1592  goodenough,
1593  olddataitemstotal,
1594  olddataitemstoleft;
1595  bool goodenoughfound;
1596 
1597  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1598 
1599  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1600  newitemsz += sizeof(ItemIdData);
1601 
1602  /* Total free space available on a btree page, after fixed overhead */
1603  leftspace = rightspace =
1605  MAXALIGN(sizeof(BTPageOpaqueData));
1606 
1607  /* The right page will have the same high key as the old page */
1608  if (!P_RIGHTMOST(opaque))
1609  {
1610  itemid = PageGetItemId(page, P_HIKEY);
1611  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1612  sizeof(ItemIdData));
1613  }
1614 
1615  /* Count up total space in data items without actually scanning 'em */
1616  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
1617 
1618  state.newitemsz = newitemsz;
1619  state.is_leaf = P_ISLEAF(opaque);
1620  state.is_rightmost = P_RIGHTMOST(opaque);
1621  state.have_split = false;
1622  if (state.is_leaf)
1623  state.fillfactor = RelationGetFillFactor(rel,
1625  else
1627  state.newitemonleft = false; /* these just to keep compiler quiet */
1628  state.firstright = 0;
1629  state.best_delta = 0;
1630  state.leftspace = leftspace;
1631  state.rightspace = rightspace;
1632  state.olddataitemstotal = olddataitemstotal;
1633  state.newitemoff = newitemoff;
1634 
1635  /*
1636  * Finding the best possible split would require checking all the possible
1637  * split points, because of the high-key and left-key special cases.
1638  * That's probably more work than it's worth; instead, stop as soon as we
1639  * find a "good-enough" split, where good-enough is defined as an
1640  * imbalance in free space of no more than pagesize/16 (arbitrary...) This
1641  * should let us stop near the middle on most pages, instead of plowing to
1642  * the end.
1643  */
1644  goodenough = leftspace / 16;
1645 
1646  /*
1647  * Scan through the data items and calculate space usage for a split at
1648  * each possible position.
1649  */
1650  olddataitemstoleft = 0;
1651  goodenoughfound = false;
1652  maxoff = PageGetMaxOffsetNumber(page);
1653 
1654  for (offnum = P_FIRSTDATAKEY(opaque);
1655  offnum <= maxoff;
1656  offnum = OffsetNumberNext(offnum))
1657  {
1658  Size itemsz;
1659 
1660  itemid = PageGetItemId(page, offnum);
1661  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1662 
1663  /*
1664  * Will the new item go to left or right of split?
1665  */
1666  if (offnum > newitemoff)
1667  _bt_checksplitloc(&state, offnum, true,
1668  olddataitemstoleft, itemsz);
1669 
1670  else if (offnum < newitemoff)
1671  _bt_checksplitloc(&state, offnum, false,
1672  olddataitemstoleft, itemsz);
1673  else
1674  {
1675  /* need to try it both ways! */
1676  _bt_checksplitloc(&state, offnum, true,
1677  olddataitemstoleft, itemsz);
1678 
1679  _bt_checksplitloc(&state, offnum, false,
1680  olddataitemstoleft, itemsz);
1681  }
1682 
1683  /* Abort scan once we find a good-enough choice */
1684  if (state.have_split && state.best_delta <= goodenough)
1685  {
1686  goodenoughfound = true;
1687  break;
1688  }
1689 
1690  olddataitemstoleft += itemsz;
1691  }
1692 
1693  /*
1694  * If the new item goes as the last item, check for splitting so that all
1695  * the old items go to the left page and the new item goes to the right
1696  * page.
1697  */
1698  if (newitemoff > maxoff && !goodenoughfound)
1699  _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
1700 
1701  /*
1702  * I believe it is not possible to fail to find a feasible split, but just
1703  * in case ...
1704  */
1705  if (!state.have_split)
1706  elog(ERROR, "could not find a feasible split point for index \"%s\"",
1708 
1709  *newitemonleft = state.newitemonleft;
1710  return state.firstright;
1711 }
bool newitemonleft
Definition: nbtinsert.c:47
int best_delta
Definition: nbtinsert.c:49
bool is_rightmost
Definition: nbtinsert.c:38
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
int fillfactor
Definition: nbtinsert.c:36
bool have_split
Definition: nbtinsert.c:44
#define SizeOfPageHeaderData
Definition: bufpage.h:212
static void _bt_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int dataitemstoleft, Size firstoldonrightsz)
Definition: nbtinsert.c:1727
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
int rightspace
Definition: nbtinsert.c:41
uint16 OffsetNumber
Definition: off.h:24
#define ItemIdGetLength(itemId)
Definition: itemid.h:58
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:140
#define ERROR
Definition: elog.h:43
bool is_leaf
Definition: nbtinsert.c:37
Size newitemsz
Definition: nbtinsert.c:35
#define PageGetPageSize(page)
Definition: bufpage.h:264
#define RelationGetRelationName(relation)
Definition: rel.h:441
struct ItemIdData ItemIdData
#define BTREE_DEFAULT_FILLFACTOR
Definition: nbtree.h:139
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
#define RelationGetFillFactor(relation, defaultff)
Definition: rel.h:283
int olddataitemstotal
Definition: nbtinsert.c:42
Definition: regguts.h:298
OffsetNumber newitemoff
Definition: nbtinsert.c:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define MAXALIGN(LEN)
Definition: c.h:652
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:629
#define P_HIKEY
Definition: nbtree.h:185
OffsetNumber firstright
Definition: nbtinsert.c:48
#define elog
Definition: elog.h:219
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 1937 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc(), _bt_getstackbuf(), and _bt_moveright().

1938 {
1939  Page lpage = BufferGetPage(lbuf);
1940  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1941  Buffer rbuf;
1942  Page rpage;
1943  BTPageOpaque rpageop;
1944  bool was_root;
1945  bool was_only;
1946 
1947  Assert(P_INCOMPLETE_SPLIT(lpageop));
1948 
1949  /* Lock right sibling, the one missing the downlink */
1950  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1951  rpage = BufferGetPage(rbuf);
1952  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1953 
1954  /* Could this be a root split? */
1955  if (!stack)
1956  {
1957  Buffer metabuf;
1958  Page metapg;
1959  BTMetaPageData *metad;
1960 
1961  /* acquire lock on the metapage */
1962  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1963  metapg = BufferGetPage(metabuf);
1964  metad = BTPageGetMeta(metapg);
1965 
1966  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1967 
1968  _bt_relbuf(rel, metabuf);
1969  }
1970  else
1971  was_root = false;
1972 
1973  /* Was this the only page on the level before split? */
1974  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1975 
1976  elog(DEBUG1, "finishing incomplete split of %u/%u",
1978 
1979  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1980 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:729
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1826
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define P_LEFTMOST(opaque)
Definition: nbtree.h:156
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define BTREE_METAPAGE
Definition: nbtree.h:115
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
#define Assert(condition)
Definition: c.h:699
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
#define elog
Definition: elog.h:219
#define BT_WRITE
Definition: nbtree.h:287
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
Pointer Page
Definition: bufpage.h:74

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
int  access 
)

Definition at line 1996 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_branch_parent().

1997 {
1998  BlockNumber blkno;
1999  OffsetNumber start;
2000 
2001  blkno = stack->bts_blkno;
2002  start = stack->bts_offset;
2003 
2004  for (;;)
2005  {
2006  Buffer buf;
2007  Page page;
2008  BTPageOpaque opaque;
2009 
2010  buf = _bt_getbuf(rel, blkno, access);
2011  page = BufferGetPage(buf);
2012  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2013 
2014  if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
2015  {
2016  _bt_finish_split(rel, buf, stack->bts_parent);
2017  continue;
2018  }
2019 
2020  if (!P_IGNORE(opaque))
2021  {
2022  OffsetNumber offnum,
2023  minoff,
2024  maxoff;
2025  ItemId itemid;
2026  IndexTuple item;
2027 
2028  minoff = P_FIRSTDATAKEY(opaque);
2029  maxoff = PageGetMaxOffsetNumber(page);
2030 
2031  /*
2032  * start = InvalidOffsetNumber means "search the whole page". We
2033  * need this test anyway due to possibility that page has a high
2034  * key now when it didn't before.
2035  */
2036  if (start < minoff)
2037  start = minoff;
2038 
2039  /*
2040  * Need this check too, to guard against possibility that page
2041  * split since we visited it originally.
2042  */
2043  if (start > maxoff)
2044  start = OffsetNumberNext(maxoff);
2045 
2046  /*
2047  * These loops will check every item on the page --- but in an
2048  * order that's attuned to the probability of where it actually
2049  * is. Scan to the right first, then to the left.
2050  */
2051  for (offnum = start;
2052  offnum <= maxoff;
2053  offnum = OffsetNumberNext(offnum))
2054  {
2055  itemid = PageGetItemId(page, offnum);
2056  item = (IndexTuple) PageGetItem(page, itemid);
2057 
2058  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2059  {
2060  /* Return accurate pointer to where link is now */
2061  stack->bts_blkno = blkno;
2062  stack->bts_offset = offnum;
2063  return buf;
2064  }
2065  }
2066 
2067  for (offnum = OffsetNumberPrev(start);
2068  offnum >= minoff;
2069  offnum = OffsetNumberPrev(offnum))
2070  {
2071  itemid = PageGetItemId(page, offnum);
2072  item = (IndexTuple) PageGetItem(page, itemid);
2073 
2074  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2075  {
2076  /* Return accurate pointer to where link is now */
2077  stack->bts_blkno = blkno;
2078  stack->bts_offset = offnum;
2079  return buf;
2080  }
2081  }
2082  }
2083 
2084  /*
2085  * The item we're looking for moved right at least one page.
2086  */
2087  if (P_RIGHTMOST(opaque))
2088  {
2089  _bt_relbuf(rel, buf);
2090  return InvalidBuffer;
2091  }
2092  blkno = opaque->btpo_next;
2093  start = InvalidOffsetNumber;
2094  _bt_relbuf(rel, buf);
2095  }
2096 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:163
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:224
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:729
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1937
OffsetNumber bts_offset
Definition: nbtree.h:302
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:301
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
BlockNumber bts_btentry
Definition: nbtree.h:303
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
struct BTStackData * bts_parent
Definition: nbtree.h:304
#define BT_WRITE
Definition: nbtree.h:287
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74

◆ _bt_insert_parent()

static void _bt_insert_parent ( Relation  rel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  is_root,
bool  is_only 
)
static

Definition at line 1826 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, BTreeInnerTupleSetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ERROR, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, BTPageOpaqueData::level, P_HIKEY, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), and RelationGetRelationName.

Referenced by _bt_finish_split(), and _bt_insertonpg().

1832 {
1833  /*
1834  * Here we have to do something Lehman and Yao don't talk about: deal with
1835  * a root split and construction of a new root. If our stack is empty
1836  * then we have just split a node on what had been the root level when we
1837  * descended the tree. If it was still the root then we perform a
1838  * new-root construction. If it *wasn't* the root anymore, search to find
1839  * the next higher level that someone constructed meanwhile, and find the
1840  * right place to insert as for the normal case.
1841  *
1842  * If we have to search for the parent level, we do so by re-descending
1843  * from the root. This is not super-efficient, but it's rare enough not
1844  * to matter.
1845  */
1846  if (is_root)
1847  {
1848  Buffer rootbuf;
1849 
1850  Assert(stack == NULL);
1851  Assert(is_only);
1852  /* create a new root node and update the metapage */
1853  rootbuf = _bt_newroot(rel, buf, rbuf);
1854  /* release the split buffers */
1855  _bt_relbuf(rel, rootbuf);
1856  _bt_relbuf(rel, rbuf);
1857  _bt_relbuf(rel, buf);
1858  }
1859  else
1860  {
1862  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1863  Page page = BufferGetPage(buf);
1864  IndexTuple new_item;
1865  BTStackData fakestack;
1866  IndexTuple ritem;
1867  Buffer pbuf;
1868 
1869  if (stack == NULL)
1870  {
1871  BTPageOpaque lpageop;
1872 
1873  elog(DEBUG2, "concurrent ROOT page split");
1874  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1875  /* Find the leftmost page at the next level up */
1876  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1877  NULL);
1878  /* Set up a phony stack entry pointing there */
1879  stack = &fakestack;
1880  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1883  stack->bts_parent = NULL;
1884  _bt_relbuf(rel, pbuf);
1885  }
1886 
1887  /* get high key from left page == lower bound for new right page */
1888  ritem = (IndexTuple) PageGetItem(page,
1889  PageGetItemId(page, P_HIKEY));
1890 
1891  /* form an index tuple that points at the new right page */
1892  new_item = CopyIndexTuple(ritem);
1893  BTreeInnerTupleSetDownLink(new_item, rbknum);
1894 
1895  /*
1896  * Find the parent buffer and get the parent page.
1897  *
1898  * Oops - if we were moved right then we need to change stack item! We
1899  * want to find parent pointing to where we are, right ? - vadim
1900  * 05/27/97
1901  */
1902  stack->bts_btentry = bknum;
1903  pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1904 
1905  /*
1906  * Now we can unlock the right child. The left child will be unlocked
1907  * by _bt_insertonpg().
1908  */
1909  _bt_relbuf(rel, rbuf);
1910 
1911  /* Check for error only after writing children */
1912  if (pbuf == InvalidBuffer)
1913  elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1914  RelationGetRelationName(rel), bknum, rbknum);
1915 
1916  /* Recursively update the parent */
1917  _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
1918  new_item, stack->bts_offset + 1,
1919  is_only);
1920 
1921  /* be tidy */
1922  pfree(new_item);
1923  }
1924 }
Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access)
Definition: nbtinsert.c:1996
union BTPageOpaqueData::@46 btpo
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2117
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
Definition: nbtsearch.c:1776
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ERROR
Definition: elog.h:43
#define DEBUG2
Definition: elog.h:24
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:441
OffsetNumber bts_offset
Definition: nbtree.h:302
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:441
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
uint32 level
Definition: nbtree.h:61
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:301
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
BlockNumber bts_btentry
Definition: nbtree.h:303
#define Assert(condition)
Definition: c.h:699
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define InvalidBlockNumber
Definition: block.h:33
struct BTStackData * bts_parent
Definition: nbtree.h:304
#define BTreeInnerTupleSetDownLink(itup, blkno)
Definition: nbtree.h:226
#define P_HIKEY
Definition: nbtree.h:185
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:835
#define elog
Definition: elog.h:219
#define BT_WRITE
Definition: nbtree.h:287
int Buffer
Definition: buf.h:23
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
OffsetNumber  newitemoff,
bool  split_only_page 
)
static

Definition at line 835 of file nbtinsert.c.

References _bt_findsplitloc(), _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_pgaddtup(), _bt_relbuf(), _bt_split(), _bt_upgrademetapage(), Assert, BlockNumberIsValid, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_VERSION, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, xl_btree_metadata::oldest_btpo_xact, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageGetFreeSpace(), PageGetSpecialPointer, PageSetLSN, PANIC, PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationGetTargetBlock, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().

842 {
843  Page page;
844  BTPageOpaque lpageop;
845  OffsetNumber firstright = InvalidOffsetNumber;
846  Size itemsz;
847 
848  page = BufferGetPage(buf);
849  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
850 
851  /* child buffer must be given iff inserting on an internal page */
852  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
853  /* tuple must have appropriate number of attributes */
854  Assert(!P_ISLEAF(lpageop) ||
855  BTreeTupleGetNAtts(itup, rel) ==
857  Assert(P_ISLEAF(lpageop) ||
858  BTreeTupleGetNAtts(itup, rel) ==
860 
861  /* The caller should've finished any incomplete splits already. */
862  if (P_INCOMPLETE_SPLIT(lpageop))
863  elog(ERROR, "cannot insert to incompletely split page %u",
865 
866  itemsz = IndexTupleSize(itup);
867  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
868  * need to be consistent */
869 
870  /*
871  * Do we need to split the page to fit the item on it?
872  *
873  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
874  * so this comparison is correct even though we appear to be accounting
875  * only for the item and not for its line pointer.
876  */
877  if (PageGetFreeSpace(page) < itemsz)
878  {
879  bool is_root = P_ISROOT(lpageop);
880  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
881  bool newitemonleft;
882  Buffer rbuf;
883 
884  /*
885  * If we're here then a pagesplit is needed. We should never reach here
886  * if we're using the fastpath since we should have checked for all the
887  * required conditions, including the fact that this page has enough
888  * freespace. Note that this routine can in theory deal with the
889  * situation where a NULL stack pointer is passed (that's what would
890  * happen if the fastpath is taken), like it does during crash
891  * recovery. But that path is much slower, defeating the very purpose
892  * of the optimization. The following assertion should protect us from
893  * any future code changes that invalidate those assumptions.
894  *
895  * Note that whenever we fail to take the fastpath, we clear the
896  * cached block. Checking for a valid cached block at this point is
897  * enough to decide whether we're in a fastpath or not.
898  */
899  Assert(!(P_ISLEAF(lpageop) &&
901 
902  /* Choose the split point */
903  firstright = _bt_findsplitloc(rel, page,
904  newitemoff, itemsz,
905  &newitemonleft);
906 
907  /* split the buffer into left and right halves */
908  rbuf = _bt_split(rel, buf, cbuf, firstright,
909  newitemoff, itemsz, itup, newitemonleft);
912  BufferGetBlockNumber(rbuf));
913 
914  /*----------
915  * By here,
916  *
917  * + our target page has been split;
918  * + the original tuple has been inserted;
919  * + we have write locks on both the old (left half)
920  * and new (right half) buffers, after the split; and
921  * + we know the key we want to insert into the parent
922  * (it's the "high key" on the left child page).
923  *
924  * We're ready to do the parent insertion. We need to hold onto the
925  * locks for the child pages until we locate the parent, but we can
926  * release them before doing the actual insertion (see Lehman and Yao
927  * for the reasoning).
928  *----------
929  */
930  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
931  }
932  else
933  {
934  Buffer metabuf = InvalidBuffer;
935  Page metapg = NULL;
936  BTMetaPageData *metad = NULL;
937  OffsetNumber itup_off;
938  BlockNumber itup_blkno;
939  BlockNumber cachedBlock = InvalidBlockNumber;
940 
941  itup_off = newitemoff;
942  itup_blkno = BufferGetBlockNumber(buf);
943 
944  /*
945  * If we are doing this insert because we split a page that was the
946  * only one on its tree level, but was not the root, it may have been
947  * the "fast root". We need to ensure that the fast root link points
948  * at or above the current page. We can safely acquire a lock on the
949  * metapage here --- see comments for _bt_newroot().
950  */
951  if (split_only_page)
952  {
953  Assert(!P_ISLEAF(lpageop));
954 
955  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
956  metapg = BufferGetPage(metabuf);
957  metad = BTPageGetMeta(metapg);
958 
959  if (metad->btm_fastlevel >= lpageop->btpo.level)
960  {
961  /* no update wanted */
962  _bt_relbuf(rel, metabuf);
963  metabuf = InvalidBuffer;
964  }
965  }
966 
967  /*
968  * Every internal page should have exactly one negative infinity item
969  * at all times. Only _bt_split() and _bt_newroot() should add items
970  * that become negative infinity items through truncation, since
971  * they're the only routines that allocate new internal pages. Do not
972  * allow a retail insertion of a new item at the negative infinity
973  * offset.
974  */
975  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
976  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
977  itup_blkno, RelationGetRelationName(rel));
978 
979  /* Do the update. No ereport(ERROR) until changes are logged */
981 
982  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
983  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
984  itup_blkno, RelationGetRelationName(rel));
985 
987 
988  if (BufferIsValid(metabuf))
989  {
990  /* upgrade meta-page if needed */
991  if (metad->btm_version < BTREE_VERSION)
992  _bt_upgrademetapage(metapg);
993  metad->btm_fastroot = itup_blkno;
994  metad->btm_fastlevel = lpageop->btpo.level;
995  MarkBufferDirty(metabuf);
996  }
997 
998  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
999  if (BufferIsValid(cbuf))
1000  {
1001  Page cpage = BufferGetPage(cbuf);
1002  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1003 
1004  Assert(P_INCOMPLETE_SPLIT(cpageop));
1005  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1006  MarkBufferDirty(cbuf);
1007  }
1008 
1009  /*
1010  * Cache the block information if we just inserted into the rightmost
1011  * leaf page of the index and it's not the root page. For very small
1012  * index where root is also the leaf, there is no point trying for any
1013  * optimization.
1014  */
1015  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1016  cachedBlock = BufferGetBlockNumber(buf);
1017 
1018  /* XLOG stuff */
1019  if (RelationNeedsWAL(rel))
1020  {
1021  xl_btree_insert xlrec;
1022  xl_btree_metadata xlmeta;
1023  uint8 xlinfo;
1024  XLogRecPtr recptr;
1025 
1026  xlrec.offnum = itup_off;
1027 
1028  XLogBeginInsert();
1029  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1030 
1031  if (P_ISLEAF(lpageop))
1032  xlinfo = XLOG_BTREE_INSERT_LEAF;
1033  else
1034  {
1035  /*
1036  * Register the left child whose INCOMPLETE_SPLIT flag was
1037  * cleared.
1038  */
1040 
1041  xlinfo = XLOG_BTREE_INSERT_UPPER;
1042  }
1043 
1044  if (BufferIsValid(metabuf))
1045  {
1046  xlmeta.root = metad->btm_root;
1047  xlmeta.level = metad->btm_level;
1048  xlmeta.fastroot = metad->btm_fastroot;
1049  xlmeta.fastlevel = metad->btm_fastlevel;
1050  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1053 
1055  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1056 
1057  xlinfo = XLOG_BTREE_INSERT_META;
1058  }
1059 
1061  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1062 
1063  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1064 
1065  if (BufferIsValid(metabuf))
1066  {
1067  PageSetLSN(metapg, recptr);
1068  }
1069  if (BufferIsValid(cbuf))
1070  {
1071  PageSetLSN(BufferGetPage(cbuf), recptr);
1072  }
1073 
1074  PageSetLSN(page, recptr);
1075  }
1076 
1077  END_CRIT_SECTION();
1078 
1079  /* release buffers */
1080  if (BufferIsValid(metabuf))
1081  _bt_relbuf(rel, metabuf);
1082  if (BufferIsValid(cbuf))
1083  _bt_relbuf(rel, cbuf);
1084  _bt_relbuf(rel, buf);
1085 
1086  /*
1087  * If we decided to cache the insertion target block, then set it now.
1088  * But before that, check for the height of the tree and don't go for
1089  * the optimization for small indexes. We defer that check to this
1090  * point to ensure that we don't call _bt_getrootheight while holding
1091  * lock on any other block.
1092  *
1093  * We do this after dropping locks on all buffers. So the information
1094  * about whether the insertion block is still the rightmost block or
1095  * not may have changed in between. But we will deal with that during
1096  * next insert operation. No special care is required while setting it.
1097  */
1098  if (BlockNumberIsValid(cachedBlock) &&
1100  RelationSetTargetBlock(rel, cachedBlock);
1101  }
1102 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:86
uint32 btm_version
Definition: nbtree.h:100
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:594
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:729
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1450
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define BTREE_VERSION
Definition: nbtree.h:117
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:133
BlockNumber root
Definition: nbtxlog.h:50
unsigned char uint8
Definition: c.h:323
#define InvalidBuffer
Definition: buf.h:25
static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, bool *newitemonleft)
Definition: nbtinsert.c:1579
#define REGBUF_WILL_INIT
Definition: xloginsert.h:32
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:28
#define START_CRIT_SECTION()
Definition: miscadmin.h:131
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:78
uint32 BlockNumber
Definition: block.h:31
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1826
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:233
#define PANIC
Definition: elog.h:53
#define RelationGetTargetBlock(relation)
Definition: rel.h:493
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:578
uint16 OffsetNumber
Definition: off.h:24
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define ERROR
Definition: elog.h:43
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:26
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
#define BTPageGetMeta(p)
Definition: nbtree.h:112
static char * buf
Definition: pg_test_fsync.c:67
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:419
#define RelationGetRelationName(relation)
Definition: rel.h:441
#define P_LEFTMOST(opaque)
Definition: nbtree.h:156
static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft)
Definition: nbtinsert.c:1120
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:426
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define BTREE_METAPAGE
Definition: nbtree.h:115
#define P_ISROOT(opaque)
Definition: nbtree.h:159
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:30
uint32 btm_fastlevel
Definition: nbtree.h:104
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
Definition: nbtinsert.c:2297
OffsetNumber offnum
Definition: nbtxlog.h:70
BlockNumber btm_root
Definition: nbtree.h:101
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:699
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:500
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define InvalidBlockNumber
Definition: block.h:33
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
#define MAXALIGN(LEN)
Definition: c.h:652
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define RelationNeedsWAL(relation)
Definition: rel.h:510
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:27
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:102
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
#define elog
Definition: elog.h:219
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
#define BT_WRITE
Definition: nbtree.h:287
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:364
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
#define SizeOfBtreeInsert
Definition: nbtxlog.h:73
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3065
Pointer Page
Definition: bufpage.h:74
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_isequal()

static bool _bt_isequal ( TupleDesc  itupdesc,
Page  page,
OffsetNumber  offnum,
int  keysz,
ScanKey  scankey 
)
static

Definition at line 2328 of file nbtinsert.c.

References Assert, DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, ScanKeyData::sk_argument, ScanKeyData::sk_attno, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_check_unique().

2330 {
2331  IndexTuple itup;
2332  int i;
2333 
2334  /* Better be comparing to a leaf item */
2336 
2337  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2338 
2339  /*
2340  * It's okay that we might perform a comparison against a truncated page
2341  * high key when caller needs to determine if _bt_check_unique scan must
2342  * continue on to the next page. Caller never asks us to compare non-key
2343  * attributes within an INCLUDE index.
2344  */
2345  for (i = 1; i <= keysz; i++)
2346  {
2347  AttrNumber attno;
2348  Datum datum;
2349  bool isNull;
2350  int32 result;
2351 
2352  attno = scankey->sk_attno;
2353  Assert(attno == i);
2354  datum = index_getattr(itup, attno, itupdesc, &isNull);
2355 
2356  /* NULLs are never equal to anything */
2357  if (isNull || (scankey->sk_flags & SK_ISNULL))
2358  return false;
2359 
2360  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
2361  scankey->sk_collation,
2362  datum,
2363  scankey->sk_argument));
2364 
2365  if (result != 0)
2366  return false;
2367 
2368  scankey++;
2369  }
2370 
2371  /* if we get here, the keys are equal */
2372  return true;
2373 }
#define DatumGetInt32(X)
Definition: postgres.h:455
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1132
signed int int32
Definition: c.h:313
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define SK_ISNULL
Definition: skey.h:115
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
uintptr_t Datum
Definition: postgres.h:365
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:699
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
int16 AttrNumber
Definition: attnum.h:21
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_newroot()

static Buffer _bt_newroot ( Relation  rel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 2117 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), Assert, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_VERSION, BTreeInnerTupleSetDownLink, BTreeTupleGetNAtts, BTreeTupleSetNAtts, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, MarkBufferDirty(), xl_btree_metadata::oldest_btpo_xact, P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NEW, P_NONE, PageAddItem, PageGetItem, PageGetItemId, PageGetSpecialPointer, PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().

2118 {
2119  Buffer rootbuf;
2120  Page lpage,
2121  rootpage;
2122  BlockNumber lbkno,
2123  rbkno;
2124  BlockNumber rootblknum;
2125  BTPageOpaque rootopaque;
2126  BTPageOpaque lopaque;
2127  ItemId itemid;
2128  IndexTuple item;
2129  IndexTuple left_item;
2130  Size left_item_sz;
2131  IndexTuple right_item;
2132  Size right_item_sz;
2133  Buffer metabuf;
2134  Page metapg;
2135  BTMetaPageData *metad;
2136 
2137  lbkno = BufferGetBlockNumber(lbuf);
2138  rbkno = BufferGetBlockNumber(rbuf);
2139  lpage = BufferGetPage(lbuf);
2140  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2141 
2142  /* get a new root page */
2143  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2144  rootpage = BufferGetPage(rootbuf);
2145  rootblknum = BufferGetBlockNumber(rootbuf);
2146 
2147  /* acquire lock on the metapage */
2148  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2149  metapg = BufferGetPage(metabuf);
2150  metad = BTPageGetMeta(metapg);
2151 
2152  /* upgrade metapage if needed */
2153  if (metad->btm_version < BTREE_VERSION)
2154  _bt_upgrademetapage(metapg);
2155 
2156  /*
2157  * Create downlink item for left page (old root). Since this will be the
2158  * first item in a non-leaf page, it implicitly has minus-infinity key
2159  * value, so we need not store any actual key in it.
2160  */
2161  left_item_sz = sizeof(IndexTupleData);
2162  left_item = (IndexTuple) palloc(left_item_sz);
2163  left_item->t_info = left_item_sz;
2164  BTreeInnerTupleSetDownLink(left_item, lbkno);
2165  BTreeTupleSetNAtts(left_item, 0);
2166 
2167  /*
2168  * Create downlink item for right page. The key for it is obtained from
2169  * the "high key" position in the left page.
2170  */
2171  itemid = PageGetItemId(lpage, P_HIKEY);
2172  right_item_sz = ItemIdGetLength(itemid);
2173  item = (IndexTuple) PageGetItem(lpage, itemid);
2174  right_item = CopyIndexTuple(item);
2175  BTreeInnerTupleSetDownLink(right_item, rbkno);
2176 
2177  /* NO EREPORT(ERROR) from here till newroot op is logged */
2179 
2180  /* set btree special data */
2181  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2182  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2183  rootopaque->btpo_flags = BTP_ROOT;
2184  rootopaque->btpo.level =
2185  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2186  rootopaque->btpo_cycleid = 0;
2187 
2188  /* update metapage data */
2189  metad->btm_root = rootblknum;
2190  metad->btm_level = rootopaque->btpo.level;
2191  metad->btm_fastroot = rootblknum;
2192  metad->btm_fastlevel = rootopaque->btpo.level;
2193 
2194  /*
2195  * Insert the left page pointer into the new root page. The root page is
2196  * the rightmost page on its level so there is no "high key" in it; the
2197  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2198  *
2199  * Note: we *must* insert the two items in item-number order, for the
2200  * benefit of _bt_restore_page().
2201  */
2202  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2203  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2204  false, false) == InvalidOffsetNumber)
2205  elog(PANIC, "failed to add leftkey to new root page"
2206  " while splitting block %u of index \"%s\"",
2208 
2209  /*
2210  * insert the right page pointer into the new root page.
2211  */
2212  Assert(BTreeTupleGetNAtts(right_item, rel) ==
2214  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2215  false, false) == InvalidOffsetNumber)
2216  elog(PANIC, "failed to add rightkey to new root page"
2217  " while splitting block %u of index \"%s\"",
2219 
2220  /* Clear the incomplete-split flag in the left child */
2221  Assert(P_INCOMPLETE_SPLIT(lopaque));
2222  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2223  MarkBufferDirty(lbuf);
2224 
2225  MarkBufferDirty(rootbuf);
2226  MarkBufferDirty(metabuf);
2227 
2228  /* XLOG stuff */
2229  if (RelationNeedsWAL(rel))
2230  {
2231  xl_btree_newroot xlrec;
2232  XLogRecPtr recptr;
2233  xl_btree_metadata md;
2234 
2235  xlrec.rootblk = rootblknum;
2236  xlrec.level = metad->btm_level;
2237 
2238  XLogBeginInsert();
2239  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2240 
2241  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2244 
2245  md.root = rootblknum;
2246  md.level = metad->btm_level;
2247  md.fastroot = rootblknum;
2248  md.fastlevel = metad->btm_level;
2251 
2252  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2253 
2254  /*
2255  * Direct access to page is not good but faster - we should implement
2256  * some new func in page API.
2257  */
2259  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2260  ((PageHeader) rootpage)->pd_special -
2261  ((PageHeader) rootpage)->pd_upper);
2262 
2263  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2264 
2265  PageSetLSN(lpage, recptr);
2266  PageSetLSN(rootpage, recptr);
2267  PageSetLSN(metapg, recptr);
2268  }
2269 
2270  END_CRIT_SECTION();
2271 
2272  /* done with metapage */
2273  _bt_relbuf(rel, metabuf);
2274 
2275  pfree(left_item);
2276  pfree(right_item);
2277 
2278  return rootbuf;
2279 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
BlockNumber rootblk
Definition: nbtxlog.h:245
#define BTP_ROOT
Definition: nbtree.h:72
BlockNumber btpo_next
Definition: nbtree.h:58
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:86
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:729
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1450
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define BTREE_VERSION
Definition: nbtree.h:117
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:133
BlockNumber root
Definition: nbtxlog.h:50
Pointer Item
Definition: item.h:17
#define P_NONE
Definition: nbtree.h:150
#define REGBUF_WILL_INIT
Definition: xloginsert.h:32
#define START_CRIT_SECTION()
Definition: miscadmin.h:131
uint32 level
Definition: nbtxlog.h:246
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:78
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:412
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:82
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:165
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:233
#define PANIC
Definition: elog.h:53
#define BTreeTupleSetNAtts(itup, n)
Definition: nbtree.h:243
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:36
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:58
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
BTCycleId btpo_cycleid
Definition: nbtree.h:65
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
#define BTPageGetMeta(p)
Definition: nbtree.h:112
BlockNumber btpo_prev
Definition: nbtree.h:57
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:441
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define P_FIRSTKEY
Definition: nbtree.h:186
#define RelationGetRelationName(relation)
Definition: rel.h:441
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:426
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define BTREE_METAPAGE
Definition: nbtree.h:115
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:249
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
uint32 btm_fastlevel
Definition: nbtree.h:104
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
struct IndexTupleData IndexTupleData
BlockNumber btm_root
Definition: nbtree.h:101
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:699
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
#define RelationNeedsWAL(relation)
Definition: rel.h:510
#define BTreeInnerTupleSetDownLink(itup, blkno)
Definition: nbtree.h:226
#define P_HIKEY
Definition: nbtree.h:185
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
void * palloc(Size size)
Definition: mcxt.c:924
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:102
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
#define elog
Definition: elog.h:219
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
unsigned short t_info
Definition: itup.h:49
#define BT_WRITE
Definition: nbtree.h:287
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:364
int Buffer
Definition: buf.h:23
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off 
)
static

Definition at line 2297 of file nbtinsert.c.

References BTreeTupleSetNAtts, InvalidOffsetNumber, P_FIRSTDATAKEY, P_ISLEAF, PageAddItem, PageGetSpecialPointer, and IndexTupleData::t_info.

Referenced by _bt_insertonpg(), and _bt_split().

2301 {
2303  IndexTupleData trunctuple;
2304 
2305  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2306  {
2307  trunctuple = *itup;
2308  trunctuple.t_info = sizeof(IndexTupleData);
2309  BTreeTupleSetNAtts(&trunctuple, 0);
2310  itup = &trunctuple;
2311  itemsize = sizeof(IndexTupleData);
2312  }
2313 
2314  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2315  false, false) == InvalidOffsetNumber)
2316  return false;
2317 
2318  return true;
2319 }
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
Pointer Item
Definition: item.h:17
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:412
#define BTreeTupleSetNAtts(itup, n)
Definition: nbtree.h:243
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
struct IndexTupleData IndexTupleData
#define InvalidOffsetNumber
Definition: off.h:26
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
unsigned short t_info
Definition: itup.h:49
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  firstright,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool  newitemonleft 
)
static

Definition at line 1120 of file nbtinsert.c.

References _bt_getbuf(), _bt_nonkey_truncate(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_vacuum_cycleid(), Assert, BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferGetPageSize, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_split::firstright, i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, BTPageOpaqueData::level, xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_NEW, P_RIGHTMOST, PageAddItem, PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPage(), PageRestoreTempPage(), PageSetLSN, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_L_HIGHKEY, XLOG_BTREE_SPLIT_R, XLOG_BTREE_SPLIT_R_HIGHKEY, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().

1123 {
1124  Buffer rbuf;
1125  Page origpage;
1126  Page leftpage,
1127  rightpage;
1128  BlockNumber origpagenumber,
1129  rightpagenumber;
1130  BTPageOpaque ropaque,
1131  lopaque,
1132  oopaque;
1133  Buffer sbuf = InvalidBuffer;
1134  Page spage = NULL;
1135  BTPageOpaque sopaque = NULL;
1136  Size itemsz;
1137  ItemId itemid;
1138  IndexTuple item;
1139  OffsetNumber leftoff,
1140  rightoff;
1141  OffsetNumber maxoff;
1142  OffsetNumber i;
1143  bool isleaf;
1144  IndexTuple lefthikey;
1145  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1146  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1147 
1148  /* Acquire a new page to split into */
1149  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1150 
1151  /*
1152  * origpage is the original page to be split. leftpage is a temporary
1153  * buffer that receives the left-sibling data, which will be copied back
1154  * into origpage on success. rightpage is the new page that receives the
1155  * right-sibling data. If we fail before reaching the critical section,
1156  * origpage hasn't been modified and leftpage is only workspace. In
1157  * principle we shouldn't need to worry about rightpage either, because it
1158  * hasn't been linked into the btree page structure; but to avoid leaving
1159  * possibly-confusing junk behind, we are careful to rewrite rightpage as
1160  * zeroes before throwing any error.
1161  */
1162  origpage = BufferGetPage(buf);
1163  leftpage = PageGetTempPage(origpage);
1164  rightpage = BufferGetPage(rbuf);
1165 
1166  origpagenumber = BufferGetBlockNumber(buf);
1167  rightpagenumber = BufferGetBlockNumber(rbuf);
1168 
1169  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1170  /* rightpage was already initialized by _bt_getbuf */
1171 
1172  /*
1173  * Copy the original page's LSN into leftpage, which will become the
1174  * updated version of the page. We need this because XLogInsert will
1175  * examine the LSN and possibly dump it in a page image.
1176  */
1177  PageSetLSN(leftpage, PageGetLSN(origpage));
1178 
1179  /* init btree private data */
1180  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1181  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1182  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1183 
1184  isleaf = P_ISLEAF(oopaque);
1185 
1186  /* if we're splitting this page, it won't be the root when we're done */
1187  /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1188  lopaque->btpo_flags = oopaque->btpo_flags;
1189  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1190  ropaque->btpo_flags = lopaque->btpo_flags;
1191  /* set flag in left page indicating that the right page has no downlink */
1192  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1193  lopaque->btpo_prev = oopaque->btpo_prev;
1194  lopaque->btpo_next = rightpagenumber;
1195  ropaque->btpo_prev = origpagenumber;
1196  ropaque->btpo_next = oopaque->btpo_next;
1197  lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
1198  /* Since we already have write-lock on both pages, ok to read cycleid */
1199  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1200  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1201 
1202  /*
1203  * If the page we're splitting is not the rightmost page at its level in
1204  * the tree, then the first entry on the page is the high key for the
1205  * page. We need to copy that to the right half. Otherwise (meaning the
1206  * rightmost page case), all the items on the right half will be user
1207  * data.
1208  */
1209  rightoff = P_HIKEY;
1210 
1211  if (!P_RIGHTMOST(oopaque))
1212  {
1213  itemid = PageGetItemId(origpage, P_HIKEY);
1214  itemsz = ItemIdGetLength(itemid);
1215  item = (IndexTuple) PageGetItem(origpage, itemid);
1216  Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
1217  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1218  false, false) == InvalidOffsetNumber)
1219  {
1220  memset(rightpage, 0, BufferGetPageSize(rbuf));
1221  elog(ERROR, "failed to add hikey to the right sibling"
1222  " while splitting block %u of index \"%s\"",
1223  origpagenumber, RelationGetRelationName(rel));
1224  }
1225  rightoff = OffsetNumberNext(rightoff);
1226  }
1227 
1228  /*
1229  * The "high key" for the new left page will be the first key that's going
1230  * to go into the new right page. This might be either the existing data
1231  * item at position firstright, or the incoming tuple.
1232  */
1233  leftoff = P_HIKEY;
1234  if (!newitemonleft && newitemoff == firstright)
1235  {
1236  /* incoming tuple will become first on right page */
1237  itemsz = newitemsz;
1238  item = newitem;
1239  }
1240  else
1241  {
1242  /* existing item at firstright will become first on right page */
1243  itemid = PageGetItemId(origpage, firstright);
1244  itemsz = ItemIdGetLength(itemid);
1245  item = (IndexTuple) PageGetItem(origpage, itemid);
1246  }
1247 
1248  /*
1249  * Truncate non-key (INCLUDE) attributes of the high key item before
1250  * inserting it on the left page. This only needs to happen at the leaf
1251  * level, since in general all pivot tuple values originate from leaf
1252  * level high keys. This isn't just about avoiding unnecessary work,
1253  * though; truncating unneeded key attributes (more aggressive suffix
1254  * truncation) can only be performed at the leaf level anyway. This is
1255  * because a pivot tuple in a grandparent page must guide a search not
1256  * only to the correct parent page, but also to the correct leaf page.
1257  */
1258  if (indnatts != indnkeyatts && isleaf)
1259  {
1260  lefthikey = _bt_nonkey_truncate(rel, item);
1261  itemsz = IndexTupleSize(lefthikey);
1262  itemsz = MAXALIGN(itemsz);
1263  }
1264  else
1265  lefthikey = item;
1266 
1267  Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
1268  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1269  false, false) == InvalidOffsetNumber)
1270  {
1271  memset(rightpage, 0, BufferGetPageSize(rbuf));
1272  elog(ERROR, "failed to add hikey to the left sibling"
1273  " while splitting block %u of index \"%s\"",
1274  origpagenumber, RelationGetRelationName(rel));
1275  }
1276  leftoff = OffsetNumberNext(leftoff);
1277  /* be tidy */
1278  if (lefthikey != item)
1279  pfree(lefthikey);
1280 
1281  /*
1282  * Now transfer all the data items to the appropriate page.
1283  *
1284  * Note: we *must* insert at least the right page's items in item-number
1285  * order, for the benefit of _bt_restore_page().
1286  */
1287  maxoff = PageGetMaxOffsetNumber(origpage);
1288 
1289  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1290  {
1291  itemid = PageGetItemId(origpage, i);
1292  itemsz = ItemIdGetLength(itemid);
1293  item = (IndexTuple) PageGetItem(origpage, itemid);
1294 
1295  /* does new item belong before this one? */
1296  if (i == newitemoff)
1297  {
1298  if (newitemonleft)
1299  {
1300  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1301  {
1302  memset(rightpage, 0, BufferGetPageSize(rbuf));
1303  elog(ERROR, "failed to add new item to the left sibling"
1304  " while splitting block %u of index \"%s\"",
1305  origpagenumber, RelationGetRelationName(rel));
1306  }
1307  leftoff = OffsetNumberNext(leftoff);
1308  }
1309  else
1310  {
1311  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1312  {
1313  memset(rightpage, 0, BufferGetPageSize(rbuf));
1314  elog(ERROR, "failed to add new item to the right sibling"
1315  " while splitting block %u of index \"%s\"",
1316  origpagenumber, RelationGetRelationName(rel));
1317  }
1318  rightoff = OffsetNumberNext(rightoff);
1319  }
1320  }
1321 
1322  /* decide which page to put it on */
1323  if (i < firstright)
1324  {
1325  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1326  {
1327  memset(rightpage, 0, BufferGetPageSize(rbuf));
1328  elog(ERROR, "failed to add old item to the left sibling"
1329  " while splitting block %u of index \"%s\"",
1330  origpagenumber, RelationGetRelationName(rel));
1331  }
1332  leftoff = OffsetNumberNext(leftoff);
1333  }
1334  else
1335  {
1336  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1337  {
1338  memset(rightpage, 0, BufferGetPageSize(rbuf));
1339  elog(ERROR, "failed to add old item to the right sibling"
1340  " while splitting block %u of index \"%s\"",
1341  origpagenumber, RelationGetRelationName(rel));
1342  }
1343  rightoff = OffsetNumberNext(rightoff);
1344  }
1345  }
1346 
1347  /* cope with possibility that newitem goes at the end */
1348  if (i <= newitemoff)
1349  {
1350  /*
1351  * Can't have newitemonleft here; that would imply we were told to put
1352  * *everything* on the left page, which cannot fit (if it could, we'd
1353  * not be splitting the page).
1354  */
1355  Assert(!newitemonleft);
1356  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1357  {
1358  memset(rightpage, 0, BufferGetPageSize(rbuf));
1359  elog(ERROR, "failed to add new item to the right sibling"
1360  " while splitting block %u of index \"%s\"",
1361  origpagenumber, RelationGetRelationName(rel));
1362  }
1363  rightoff = OffsetNumberNext(rightoff);
1364  }
1365 
1366  /*
1367  * We have to grab the right sibling (if any) and fix the prev pointer
1368  * there. We are guaranteed that this is deadlock-free since no other
1369  * writer will be holding a lock on that page and trying to move left, and
1370  * all readers release locks on a page before trying to fetch its
1371  * neighbors.
1372  */
1373 
1374  if (!P_RIGHTMOST(oopaque))
1375  {
1376  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1377  spage = BufferGetPage(sbuf);
1378  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1379  if (sopaque->btpo_prev != origpagenumber)
1380  {
1381  memset(rightpage, 0, BufferGetPageSize(rbuf));
1382  elog(ERROR, "right sibling's left-link doesn't match: "
1383  "block %u links to %u instead of expected %u in index \"%s\"",
1384  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1386  }
1387 
1388  /*
1389  * Check to see if we can set the SPLIT_END flag in the right-hand
1390  * split page; this can save some I/O for vacuum since it need not
1391  * proceed to the right sibling. We can set the flag if the right
1392  * sibling has a different cycleid: that means it could not be part of
1393  * a group of pages that were all split off from the same ancestor
1394  * page. If you're confused, imagine that page A splits to A B and
1395  * then again, yielding A C B, while vacuum is in progress. Tuples
1396  * originally in A could now be in either B or C, hence vacuum must
1397  * examine both pages. But if D, our right sibling, has a different
1398  * cycleid then it could not contain any tuples that were in A when
1399  * the vacuum started.
1400  */
1401  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1402  ropaque->btpo_flags |= BTP_SPLIT_END;
1403  }
1404 
1405  /*
1406  * Right sibling is locked, new siblings are prepared, but original page
1407  * is not updated yet.
1408  *
1409  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1410  * not starting the critical section till here because we haven't been
1411  * scribbling on the original page yet; see comments above.
1412  */
1414 
1415  /*
1416  * By here, the original data page has been split into two new halves, and
1417  * these are correct. The algorithm requires that the left page never
1418  * move during a split, so we copy the new left page back on top of the
1419  * original. Note that this is not a waste of time, since we also require
1420  * (in the page management code) that the center of a page always be
1421  * clean, and the most efficient way to guarantee this is just to compact
1422  * the data by reinserting it into a new left page. (XXX the latter
1423  * comment is probably obsolete; but in any case it's good to not scribble
1424  * on the original page until we enter the critical section.)
1425  *
1426  * We need to do this before writing the WAL record, so that XLogInsert
1427  * can WAL log an image of the page if necessary.
1428  */
1429  PageRestoreTempPage(leftpage, origpage);
1430  /* leftpage, lopaque must not be used below here */
1431 
1433  MarkBufferDirty(rbuf);
1434 
1435  if (!P_RIGHTMOST(ropaque))
1436  {
1437  sopaque->btpo_prev = rightpagenumber;
1438  MarkBufferDirty(sbuf);
1439  }
1440 
1441  /*
1442  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1443  * a split.
1444  */
1445  if (!isleaf)
1446  {
1447  Page cpage = BufferGetPage(cbuf);
1448  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1449 
1450  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1451  MarkBufferDirty(cbuf);
1452  }
1453 
1454  /* XLOG stuff */
1455  if (RelationNeedsWAL(rel))
1456  {
1457  xl_btree_split xlrec;
1458  uint8 xlinfo;
1459  XLogRecPtr recptr;
1460  bool loglhikey = false;
1461 
1462  xlrec.level = ropaque->btpo.level;
1463  xlrec.firstright = firstright;
1464  xlrec.newitemoff = newitemoff;
1465 
1466  XLogBeginInsert();
1467  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1468 
1471  /* Log the right sibling, because we've changed its prev-pointer. */
1472  if (!P_RIGHTMOST(ropaque))
1474  if (BufferIsValid(cbuf))
1476 
1477  /*
1478  * Log the new item, if it was inserted on the left page. (If it was
1479  * put on the right page, we don't need to explicitly WAL log it
1480  * because it's included with all the other items on the right page.)
1481  * Show the new item as belonging to the left page buffer, so that it
1482  * is not stored if XLogInsert decides it needs a full-page image of
1483  * the left page. We store the offset anyway, though, to support
1484  * archive compression of these records.
1485  */
1486  if (newitemonleft)
1487  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1488 
1489  /* Log left page */
1490  if (!isleaf || indnatts != indnkeyatts)
1491  {
1492  /*
1493  * We must also log the left page's high key. There are two
1494  * reasons for that: right page's leftmost key is suppressed on
1495  * non-leaf levels and in covering indexes included columns are
1496  * truncated from high keys. Show it as belonging to the left
1497  * page buffer, so that it is not stored if XLogInsert decides it
1498  * needs a full-page image of the left page.
1499  */
1500  itemid = PageGetItemId(origpage, P_HIKEY);
1501  item = (IndexTuple) PageGetItem(origpage, itemid);
1502  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1503  loglhikey = true;
1504  }
1505 
1506  /*
1507  * Log the contents of the right page in the format understood by
1508  * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1509  * because we're going to recreate the whole page anyway, so it should
1510  * never be stored by XLogInsert.
1511  *
1512  * Direct access to page is not good but faster - we should implement
1513  * some new func in page API. Note we only store the tuples
1514  * themselves, knowing that they were inserted in item-number order
1515  * and so the item pointers can be reconstructed. See comments for
1516  * _bt_restore_page().
1517  */
1519  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1520  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1521 
1522  xlinfo = newitemonleft ?
1525  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1526 
1527  PageSetLSN(origpage, recptr);
1528  PageSetLSN(rightpage, recptr);
1529  if (!P_RIGHTMOST(ropaque))
1530  {
1531  PageSetLSN(spage, recptr);
1532  }
1533  if (!isleaf)
1534  {
1535  PageSetLSN(BufferGetPage(cbuf), recptr);
1536  }
1537  }
1538 
1539  END_CRIT_SECTION();
1540 
1541  /* release the old right sibling */
1542  if (!P_RIGHTMOST(ropaque))
1543  _bt_relbuf(rel, sbuf);
1544 
1545  /* release the child */
1546  if (!isleaf)
1547  _bt_relbuf(rel, cbuf);
1548 
1549  /* split's done */
1550  return rbuf;
1551 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
#define BTP_ROOT
Definition: nbtree.h:72
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:1885
#define BTP_SPLIT_END
Definition: nbtree.h:76
BlockNumber btpo_next
Definition: nbtree.h:58
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:407
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:729
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1450
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
#define SizeOfBtreeSplit
Definition: nbtxlog.h:115
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:133
unsigned char uint8
Definition: c.h:323
Pointer Item
Definition: item.h:17
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:32
IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup)
Definition: nbtutils.c:2102
#define START_CRIT_SECTION()
Definition: miscadmin.h:131
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:78
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:412
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:82
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:233
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:58
#define ERROR
Definition: elog.h:43
OffsetNumber newitemoff
Definition: nbtxlog.h:112
BTCycleId btpo_cycleid
Definition: nbtree.h:65
#define XLOG_BTREE_SPLIT_L_HIGHKEY
Definition: nbtxlog.h:31
BlockNumber btpo_prev
Definition: nbtree.h:57
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:419
#define RelationGetRelationName(relation)
Definition: rel.h:441
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:426
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
Page PageGetTempPage(Page page)
Definition: bufpage.c:348
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
Definition: nbtinsert.c:2297
uint32 level
Definition: nbtxlog.h:110
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:147
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:879
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:30
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:699
OffsetNumber firstright
Definition: nbtxlog.h:111
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:433
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
#define XLOG_BTREE_SPLIT_R_HIGHKEY
Definition: nbtxlog.h:32
#define MAXALIGN(LEN)
Definition: c.h:652
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define RelationNeedsWAL(relation)
Definition: rel.h:510
#define PageGetLSN(page)
Definition: bufpage.h:362
#define P_HIKEY
Definition: nbtree.h:185
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
int i
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:891
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:29
#define elog
Definition: elog.h:219
#define BT_WRITE
Definition: nbtree.h:287
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:364
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:157
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
#define PageGetItem(page, itemId)
Definition: bufpage.h:336
Pointer Page
Definition: bufpage.h:74
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:158

◆ _bt_vacuum_one_page()

static void _bt_vacuum_one_page ( Relation  rel,
Buffer  buffer,
Relation  heapRel 
)
static

Definition at line 2383 of file nbtinsert.c.

References _bt_delitems_delete(), BufferGetPage, ItemIdIsDead, MaxOffsetNumber, OffsetNumberNext, P_FIRSTDATAKEY, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc().

2384 {
2385  OffsetNumber deletable[MaxOffsetNumber];
2386  int ndeletable = 0;
2387  OffsetNumber offnum,
2388  minoff,
2389  maxoff;
2390  Page page = BufferGetPage(buffer);
2392 
2393  /*
2394  * Scan over all items to see which ones need to be deleted according to
2395  * LP_DEAD flags.
2396  */
2397  minoff = P_FIRSTDATAKEY(opaque);
2398  maxoff = PageGetMaxOffsetNumber(page);
2399  for (offnum = minoff;
2400  offnum <= maxoff;
2401  offnum = OffsetNumberNext(offnum))
2402  {
2403  ItemId itemId = PageGetItemId(page, offnum);
2404 
2405  if (ItemIdIsDead(itemId))
2406  deletable[ndeletable++] = offnum;
2407  }
2408 
2409  if (ndeletable > 0)
2410  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2411 
2412  /*
2413  * Note: if we didn't find any LP_DEAD items, then the page's
2414  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2415  * separate write to clear it, however. We will clear it when we split
2416  * the page.
2417  */
2418 }
#define MaxOffsetNumber
Definition: off.h:28
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:187
#define ItemIdIsDead(itemId)
Definition: itemid.h:112
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:353
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:231
WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]
Definition: walsender.c:215
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:322
void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
Definition: nbtpage.c:1021
Pointer Page
Definition: bufpage.h:74