PostgreSQL Source Code  git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
#include "utils/tqual.h"

Go to the source code of this file.

Data Structures

struct  FindSplitData
 

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static Buffer _bt_newroot (Relation rel, Buffer lbuf, Buffer rbuf)
 
static TransactionId _bt_check_unique (Relation rel, IndexTuple itup, Relation heapRel, Buffer buf, OffsetNumber offset, ScanKey itup_scankey, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static void _bt_findinsertloc (Relation rel, Buffer *bufptr, OffsetNumber *offsetptr, int keysz, ScanKey scankey, IndexTuple newtup, BTStack stack, Relation heapRel)
 
static void _bt_insertonpg (Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft)
 
static void _bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
 
static OffsetNumber _bt_findsplitloc (Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, bool *newitemonleft)
 
static void _bt_checksplitloc (FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int olddataitemstoleft, Size firstoldonrightsz)
 
static bool _bt_pgaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off)
 
static bool _bt_isequal (TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey)
 
static void _bt_vacuum_one_page (Relation rel, Buffer buffer, Relation heapRel)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, int access)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 30 of file nbtinsert.c.

Referenced by _bt_insertonpg().

Function Documentation

◆ _bt_check_unique()

static TransactionId _bt_check_unique ( Relation  rel,
IndexTuple  itup,
Relation  heapRel,
Buffer  buf,
OffsetNumber  offset,
ScanKey  itup_scankey,
IndexUniqueCheck  checkUnique,
bool *  is_unique,
uint32 *  speculativeToken 
)
static

Definition at line 330 of file nbtinsert.c.

References _bt_isequal(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BufferGetPage, BuildIndexValueDescription(), CheckForSerializableConflictIn(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), heap_hot_search(), index_deform_tuple(), INDEX_MAX_KEYS, IndexRelationGetNumberOfKeyAttributes, InitDirtySnapshot, InvalidBuffer, InvalidTransactionId, ItemIdIsDead, ItemIdMarkDead, ItemPointerCompare(), MarkBufferDirtyHint(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, RelationGetRelationName, SnapshotSelf, SnapshotData::speculativeToken, IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_PARTIAL, values, SnapshotData::xmax, and SnapshotData::xmin.

Referenced by _bt_doinsert().

334 {
335  TupleDesc itupdesc = RelationGetDescr(rel);
336  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
337  SnapshotData SnapshotDirty;
338  OffsetNumber maxoff;
339  Page page;
340  BTPageOpaque opaque;
341  Buffer nbuf = InvalidBuffer;
342  bool found = false;
343 
344  /* Assume unique until we find a duplicate */
345  *is_unique = true;
346 
347  InitDirtySnapshot(SnapshotDirty);
348 
349  page = BufferGetPage(buf);
350  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
351  maxoff = PageGetMaxOffsetNumber(page);
352 
353  /*
354  * Scan over all equal tuples, looking for live conflicts.
355  */
356  for (;;)
357  {
358  ItemId curitemid;
359  IndexTuple curitup;
360  BlockNumber nblkno;
361 
362  /*
363  * make sure the offset points to an actual item before trying to
364  * examine it...
365  */
366  if (offset <= maxoff)
367  {
368  curitemid = PageGetItemId(page, offset);
369 
370  /*
371  * We can skip items that are marked killed.
372  *
373  * Formerly, we applied _bt_isequal() before checking the kill
374  * flag, so as to fall out of the item loop as soon as possible.
375  * However, in the presence of heavy update activity an index may
376  * contain many killed items with the same key; running
377  * _bt_isequal() on each killed item gets expensive. Furthermore
378  * it is likely that the non-killed version of each key appears
379  * first, so that we didn't actually get to exit any sooner
380  * anyway. So now we just advance over killed items as quickly as
381  * we can. We only apply _bt_isequal() when we get to a non-killed
382  * item or the end of the page.
383  */
384  if (!ItemIdIsDead(curitemid))
385  {
386  ItemPointerData htid;
387  bool all_dead;
388 
389  /*
390  * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
391  * how we handling NULLs - and so we must not use _bt_compare
392  * in real comparison, but only for ordering/finding items on
393  * pages. - vadim 03/24/97
394  */
395  if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
396  break; /* we're past all the equal tuples */
397 
398  /* okay, we gotta fetch the heap tuple ... */
399  curitup = (IndexTuple) PageGetItem(page, curitemid);
400  htid = curitup->t_tid;
401 
402  /*
403  * If we are doing a recheck, we expect to find the tuple we
404  * are rechecking. It's not a duplicate, but we have to keep
405  * scanning.
406  */
407  if (checkUnique == UNIQUE_CHECK_EXISTING &&
408  ItemPointerCompare(&htid, &itup->t_tid) == 0)
409  {
410  found = true;
411  }
412 
413  /*
414  * We check the whole HOT-chain to see if there is any tuple
415  * that satisfies SnapshotDirty. This is necessary because we
416  * have just a single index entry for the entire chain.
417  */
418  else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
419  &all_dead))
420  {
421  TransactionId xwait;
422 
423  /*
424  * It is a duplicate. If we are only doing a partial
425  * check, then don't bother checking if the tuple is being
426  * updated in another transaction. Just return the fact
427  * that it is a potential conflict and leave the full
428  * check till later.
429  */
430  if (checkUnique == UNIQUE_CHECK_PARTIAL)
431  {
432  if (nbuf != InvalidBuffer)
433  _bt_relbuf(rel, nbuf);
434  *is_unique = false;
435  return InvalidTransactionId;
436  }
437 
438  /*
439  * If this tuple is being updated by other transaction
440  * then we have to wait for its commit/abort.
441  */
442  xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
443  SnapshotDirty.xmin : SnapshotDirty.xmax;
444 
445  if (TransactionIdIsValid(xwait))
446  {
447  if (nbuf != InvalidBuffer)
448  _bt_relbuf(rel, nbuf);
449  /* Tell _bt_doinsert to wait... */
450  *speculativeToken = SnapshotDirty.speculativeToken;
451  return xwait;
452  }
453 
454  /*
455  * Otherwise we have a definite conflict. But before
456  * complaining, look to see if the tuple we want to insert
457  * is itself now committed dead --- if so, don't complain.
458  * This is a waste of time in normal scenarios but we must
459  * do it to support CREATE INDEX CONCURRENTLY.
460  *
461  * We must follow HOT-chains here because during
462  * concurrent index build, we insert the root TID though
463  * the actual tuple may be somewhere in the HOT-chain.
464  * While following the chain we might not stop at the
465  * exact tuple which triggered the insert, but that's OK
466  * because if we find a live tuple anywhere in this chain,
467  * we have a unique key conflict. The other live tuple is
468  * not part of this chain because it had a different index
469  * entry.
470  */
471  htid = itup->t_tid;
472  if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
473  {
474  /* Normal case --- it's still live */
475  }
476  else
477  {
478  /*
479  * It's been deleted, so no error, and no need to
480  * continue searching
481  */
482  break;
483  }
484 
485  /*
486  * Check for a conflict-in as we would if we were going to
487  * write to this page. We aren't actually going to write,
488  * but we want a chance to report SSI conflicts that would
489  * otherwise be masked by this unique constraint
490  * violation.
491  */
492  CheckForSerializableConflictIn(rel, NULL, buf);
493 
494  /*
495  * This is a definite conflict. Break the tuple down into
496  * datums and report the error. But first, make sure we
497  * release the buffer locks we're holding ---
498  * BuildIndexValueDescription could make catalog accesses,
499  * which in the worst case might touch this same index and
500  * cause deadlocks.
501  */
502  if (nbuf != InvalidBuffer)
503  _bt_relbuf(rel, nbuf);
504  _bt_relbuf(rel, buf);
505 
506  {
507  Datum values[INDEX_MAX_KEYS];
508  bool isnull[INDEX_MAX_KEYS];
509  char *key_desc;
510 
511  index_deform_tuple(itup, RelationGetDescr(rel),
512  values, isnull);
513 
514  key_desc = BuildIndexValueDescription(rel, values,
515  isnull);
516 
517  ereport(ERROR,
518  (errcode(ERRCODE_UNIQUE_VIOLATION),
519  errmsg("duplicate key value violates unique constraint \"%s\"",
520  RelationGetRelationName(rel)),
521  key_desc ? errdetail("Key %s already exists.",
522  key_desc) : 0,
523  errtableconstraint(heapRel,
524  RelationGetRelationName(rel))));
525  }
526  }
527  else if (all_dead)
528  {
529  /*
530  * The conflicting tuple (or whole HOT chain) is dead to
531  * everyone, so we may as well mark the index entry
532  * killed.
533  */
534  ItemIdMarkDead(curitemid);
535  opaque->btpo_flags |= BTP_HAS_GARBAGE;
536 
537  /*
538  * Mark buffer with a dirty hint, since state is not
539  * crucial. Be sure to mark the proper buffer dirty.
540  */
541  if (nbuf != InvalidBuffer)
542  MarkBufferDirtyHint(nbuf, true);
543  else
544  MarkBufferDirtyHint(buf, true);
545  }
546  }
547  }
548 
549  /*
550  * Advance to next tuple to continue checking.
551  */
552  if (offset < maxoff)
553  offset = OffsetNumberNext(offset);
554  else
555  {
556  /* If scankey == hikey we gotta check the next page too */
557  if (P_RIGHTMOST(opaque))
558  break;
559  if (!_bt_isequal(itupdesc, page, P_HIKEY,
560  indnkeyatts, itup_scankey))
561  break;
562  /* Advance to next non-dead page --- there must be one */
563  for (;;)
564  {
565  nblkno = opaque->btpo_next;
566  nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
567  page = BufferGetPage(nbuf);
568  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
569  if (!P_IGNORE(opaque))
570  break;
571  if (P_RIGHTMOST(opaque))
572  elog(ERROR, "fell off the end of index \"%s\"",
573  RelationGetRelationName(rel));
574  }
575  maxoff = PageGetMaxOffsetNumber(page);
576  offset = P_FIRSTDATAKEY(opaque);
577  }
578  }
579 
580  /*
581  * If we are doing a recheck then we should have found the tuple we are
582  * checking. Otherwise there's something very wrong --- probably, the
583  * index is on a non-immutable expression.
584  */
585  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
586  ereport(ERROR,
587  (errcode(ERRCODE_INTERNAL_ERROR),
588  errmsg("failed to re-find tuple within index \"%s\"",
589  RelationGetRelationName(rel)),
590  errhint("This may be because of a non-immutable index expression."),
591  errtableconstraint(heapRel,
592  RelationGetRelationName(rel))));
593 
594  if (nbuf != InvalidBuffer)
595  _bt_relbuf(rel, nbuf);
596 
597  return InvalidTransactionId;
598 }
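The wait decision above (source lines 442-452) reduces to: if the dirty snapshot shows the conflicting tuple's inserter still in progress, wait on xmin; if a deleter is in progress, wait on xmax; if neither is set, the conflict is definite. A minimal, self-contained sketch of that selection, using illustrative types rather than PostgreSQL's SnapshotData:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t TxId;
#define INVALID_TXID 0u

struct dirty_snapshot
{
    TxId xmin;   /* set if the conflicting tuple's inserter is still running */
    TxId xmax;   /* set if a deleter of that tuple is still running */
};

static TxId xact_to_wait_for(const struct dirty_snapshot *snap)
{
    /* Prefer the in-progress inserter; otherwise the in-progress deleter. */
    return snap->xmin != INVALID_TXID ? snap->xmin : snap->xmax;
}

int main(void)
{
    struct dirty_snapshot inserting = { 1234u, 0u };
    struct dirty_snapshot deleting  = { 0u, 5678u };
    struct dirty_snapshot settled   = { 0u, 0u };  /* nothing to wait for: definite conflict */

    printf("%u %u %u\n",
           (unsigned) xact_to_wait_for(&inserting),
           (unsigned) xact_to_wait_for(&deleting),
           (unsigned) xact_to_wait_for(&settled));
    return 0;
}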

◆ _bt_checksplitloc()

static void _bt_checksplitloc ( FindSplitData *  state,
OffsetNumber  firstoldonright,
bool  newitemonleft,
int  olddataitemstoleft,
Size  firstoldonrightsz 
)
static

Definition at line 1714 of file nbtinsert.c.

References FindSplitData::best_delta, FindSplitData::fillfactor, FindSplitData::firstright, FindSplitData::have_split, FindSplitData::is_leaf, FindSplitData::is_rightmost, FindSplitData::leftspace, MAXALIGN, FindSplitData::newitemoff, FindSplitData::newitemonleft, FindSplitData::newitemsz, FindSplitData::olddataitemstotal, and FindSplitData::rightspace.

Referenced by _bt_findsplitloc().

1719 {
1720  int leftfree,
1721  rightfree;
1722  Size firstrightitemsz;
1723  bool newitemisfirstonright;
1724 
1725  /* Is the new item going to be the first item on the right page? */
1726  newitemisfirstonright = (firstoldonright == state->newitemoff
1727  && !newitemonleft);
1728 
1729  if (newitemisfirstonright)
1730  firstrightitemsz = state->newitemsz;
1731  else
1732  firstrightitemsz = firstoldonrightsz;
1733 
1734  /* Account for all the old tuples */
1735  leftfree = state->leftspace - olddataitemstoleft;
1736  rightfree = state->rightspace -
1737  (state->olddataitemstotal - olddataitemstoleft);
1738 
1739  /*
1740  * The first item on the right page becomes the high key of the left page;
1741  * therefore it counts against left space as well as right space. When
1742  * index has included attributes, then those attributes of left page high
1743  * key will be truncated leaving that page with slightly more free space.
1744  * However, that shouldn't affect our ability to find valid split
1745  * location, because anyway split location should exists even without high
1746  * key truncation.
1747  */
1748  leftfree -= firstrightitemsz;
1749 
1750  /* account for the new item */
1751  if (newitemonleft)
1752  leftfree -= (int) state->newitemsz;
1753  else
1754  rightfree -= (int) state->newitemsz;
1755 
1756  /*
1757  * If we are not on the leaf level, we will be able to discard the key
1758  * data from the first item that winds up on the right page.
1759  */
1760  if (!state->is_leaf)
1761  rightfree += (int) firstrightitemsz -
1762  (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
1763 
1764  /*
1765  * If feasible split point, remember best delta.
1766  */
1767  if (leftfree >= 0 && rightfree >= 0)
1768  {
1769  int delta;
1770 
1771  if (state->is_rightmost)
1772  {
1773  /*
1774  * If splitting a rightmost page, try to put (100-fillfactor)% of
1775  * free space on left page. See comments for _bt_findsplitloc.
1776  */
1777  delta = (state->fillfactor * leftfree)
1778  - ((100 - state->fillfactor) * rightfree);
1779  }
1780  else
1781  {
1782  /* Otherwise, aim for equal free space on both sides */
1783  delta = leftfree - rightfree;
1784  }
1785 
1786  if (delta < 0)
1787  delta = -delta;
1788  if (!state->have_split || delta < state->best_delta)
1789  {
1790  state->have_split = true;
1791  state->newitemonleft = newitemonleft;
1792  state->firstright = firstoldonright;
1793  state->best_delta = delta;
1794  }
1795  }
1796 }
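The scoring above reduces to a small arithmetic rule: for a rightmost page the delta is minimized when only (100 - fillfactor)% of the free space stays on the left page, so the left page ends up about fillfactor% full; for any other page free space is simply balanced. A self-contained sketch with illustrative names and values (not the PostgreSQL definitions):

#include <stdio.h>
#include <stdlib.h>

/* Score a candidate split: lower is better. */
static int split_delta(int leftfree, int rightfree, int fillfactor, int is_rightmost)
{
    int delta;

    if (is_rightmost)
        delta = fillfactor * leftfree - (100 - fillfactor) * rightfree;
    else
        delta = leftfree - rightfree;
    return abs(delta);
}

int main(void)
{
    /* With a 90% leaf fillfactor, leaving 300/2700 free bytes scores better than an even split. */
    printf("rightmost, left nearly full: %d\n", split_delta(300, 2700, 90, 1));
    printf("rightmost, even split:       %d\n", split_delta(1500, 1500, 90, 1));
    printf("interior,  even split:       %d\n", split_delta(1500, 1500, 70, 0));
    return 0;
}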

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 110 of file nbtinsert.c.

References _bt_binsrch(), _bt_check_unique(), _bt_checkpage(), _bt_compare(), _bt_findinsertloc(), _bt_freeskey(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), Assert, BT_WRITE, buf, BufferGetPage, CheckForSerializableConflictIn(), ConditionalLockBuffer(), IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, MAXALIGN, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, ReleaseBuffer(), SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

112 {
113  bool is_unique = false;
114  int indnkeyatts;
115  ScanKey itup_scankey;
116  BTStack stack = NULL;
117  Buffer buf;
118  OffsetNumber offset;
119  bool fastpath;
120 
121  indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
122  Assert(indnkeyatts != 0);
123 
124  /* we need an insertion scan key to do our search, so build one */
125  itup_scankey = _bt_mkscankey(rel, itup);
126 
127  /*
128  * It's very common to have an index on an auto-incremented or
129  * monotonically increasing value. In such cases, every insertion happens
130  * towards the end of the index. We try to optimize that case by caching
131  * the right-most leaf of the index. If our cached block is still the
132  * rightmost leaf, has enough free space to accommodate a new entry and
133  * the insertion key is strictly greater than the first key in this page,
134  * then we can safely conclude that the new key will be inserted in the
135  * cached block. So we simply search within the cached block and insert
136  * the key at the appropriate location. We call it a fastpath.
137  *
138  * Testing has revealed, though, that the fastpath can result in increased
139  * contention on the exclusive-lock on the rightmost leaf page. So we
140  * conditionally check if the lock is available. If it's not available
141  * then we simply abandon the fastpath and take the regular path. This
142  * makes sense because unavailability of the lock also signals that some
143  * other backend might be concurrently inserting into the page, thus
144  * reducing our chances to finding an insertion place in this page.
145  */
146 top:
147  fastpath = false;
148  offset = InvalidOffsetNumber;
149  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
150  {
151  Size itemsz;
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change,
159  * as far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170  itemsz = IndexTupleSize(itup);
171  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
172  * but we need to be consistent */
173 
174  /*
175  * Check if the page is still the rightmost leaf page, has enough
176  * free space to accommodate the new tuple, and the insertion scan
177  * key is strictly greater than the first key on the page.
178  */
179  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
180  !P_IGNORE(lpageop) &&
181  (PageGetFreeSpace(page) > itemsz) &&
182  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
183  _bt_compare(rel, indnkeyatts, itup_scankey, page,
184  P_FIRSTDATAKEY(lpageop)) > 0)
185  {
186  /*
187  * The right-most block should never have an incomplete split.
188  * But be paranoid and check for it anyway.
189  */
190  Assert(!P_INCOMPLETE_SPLIT(lpageop));
191  fastpath = true;
192  }
193  else
194  {
195  _bt_relbuf(rel, buf);
196 
197  /*
198  * Something did not work out. Just forget about the cached
199  * block and follow the normal path. It might be set again if
200  * the conditions are favourable.
201  */
202  RelationSetTargetBlock(rel, InvalidBlockNumber);
203  }
204  }
205  else
206  {
207  ReleaseBuffer(buf);
208 
209  /*
210  * If someone's holding a lock, it's likely to change anyway, so
211  * don't try again until we get an updated rightmost leaf.
212  */
213  RelationSetTargetBlock(rel, InvalidBlockNumber);
214  }
215  }
216 
217  if (!fastpath)
218  {
219  /*
220  * Find the first page containing this key. Buffer returned by
221  * _bt_search() is locked in exclusive mode.
222  */
223  stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE,
224  NULL);
225  }
226 
227  /*
228  * If we're not allowing duplicates, make sure the key isn't already in
229  * the index.
230  *
231  * NOTE: obviously, _bt_check_unique can only detect keys that are already
232  * in the index; so it cannot defend against concurrent insertions of the
233  * same key. We protect against that by means of holding a write lock on
234  * the target page. Any other would-be inserter of the same key must
235  * acquire a write lock on the same target page, so only one would-be
236  * inserter can be making the check at one time. Furthermore, once we are
237  * past the check we hold write locks continuously until we have performed
238  * our insertion, so no later inserter can fail to see our insertion.
239  * (This requires some care in _bt_findinsertloc.)
240  *
241  * If we must wait for another xact, we release the lock while waiting,
242  * and then must start over completely.
243  *
244  * For a partial uniqueness check, we don't wait for the other xact. Just
245  * let the tuple in and return false for possibly non-unique, or true for
246  * definitely unique.
247  */
248  if (checkUnique != UNIQUE_CHECK_NO)
249  {
250  TransactionId xwait;
251  uint32 speculativeToken;
252 
253  offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false);
254  xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
255  checkUnique, &is_unique, &speculativeToken);
256 
257  if (TransactionIdIsValid(xwait))
258  {
259  /* Have to wait for the other guy ... */
260  _bt_relbuf(rel, buf);
261 
262  /*
263  * If it's a speculative insertion, wait for it to finish (ie. to
264  * go ahead with the insertion, or kill the tuple). Otherwise
265  * wait for the transaction to finish as usual.
266  */
267  if (speculativeToken)
268  SpeculativeInsertionWait(xwait, speculativeToken);
269  else
270  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
271 
272  /* start over... */
273  if (stack)
274  _bt_freestack(stack);
275  goto top;
276  }
277  }
278 
279  if (checkUnique != UNIQUE_CHECK_EXISTING)
280  {
281  /*
282  * The only conflict predicate locking cares about for indexes is when
283  * an index tuple insert conflicts with an existing lock. Since the
284  * actual location of the insert is hard to predict because of the
285  * random search used to prevent O(N^2) performance when there are
286  * many duplicate entries, we can just use the "first valid" page.
287  * This reasoning also applies to INCLUDE indexes, whose extra
288  * attributes are not considered part of the key space.
289  */
290  CheckForSerializableConflictIn(rel, NULL, buf);
291  /* do the insertion */
292  _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup,
293  stack, heapRel);
294  _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
295  }
296  else
297  {
298  /* just release the buffer */
299  _bt_relbuf(rel, buf);
300  }
301 
302  /* be tidy */
303  if (stack)
304  _bt_freestack(stack);
305  _bt_freeskey(itup_scankey);
306 
307  return is_unique;
308 }
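The fastpath test above (source lines 179-184) is a conjunction of page-level conditions: the cached block must still be a live, rightmost leaf with room for the new tuple, and the new key must sort strictly after the page's first key. A stand-alone sketch of that predicate, with illustrative field names in place of the PostgreSQL page macros:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cached_leaf
{
    bool   is_leaf;
    bool   is_rightmost;
    bool   is_ignored;      /* half-dead or deleted page */
    size_t free_space;
    bool   has_data_items;
    int    cmp_first_key;   /* result of comparing the new key vs. the page's first key */
};

static bool can_use_fastpath(const struct cached_leaf *p, size_t newitemsz)
{
    return p->is_leaf && p->is_rightmost && !p->is_ignored &&
           p->free_space > newitemsz &&
           p->has_data_items &&
           p->cmp_first_key > 0;    /* strictly greater than the first key */
}

int main(void)
{
    struct cached_leaf leaf = { true, true, false, 4096, true, 1 };

    printf("fastpath ok: %d\n", can_use_fastpath(&leaf, 32));
    return 0;
}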

◆ _bt_findinsertloc()

static void _bt_findinsertloc ( Relation  rel,
Buffer *  bufptr,
OffsetNumber *  offsetptr,
int  keysz,
ScanKey  scankey,
IndexTuple  newtup,
BTStack  stack,
Relation  heapRel 
)
static

Definition at line 631 of file nbtinsert.c.

References _bt_binsrch(), _bt_compare(), _bt_finish_split(), _bt_relandgetbuf(), _bt_relbuf(), _bt_vacuum_one_page(), BT_WRITE, BTMaxItemSize, BTPageOpaqueData::btpo_next, buf, BufferGetPage, elog, ereport, errcode(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, MAX_RANDOM_VALUE, MAXALIGN, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetSpecialPointer, random(), and RelationGetRelationName.

Referenced by _bt_doinsert().

639 {
640  Buffer buf = *bufptr;
641  Page page = BufferGetPage(buf);
642  Size itemsz;
643  BTPageOpaque lpageop;
644  bool movedright,
645  vacuumed;
646  OffsetNumber newitemoff;
647  OffsetNumber firstlegaloff = *offsetptr;
648 
649  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
650 
651  itemsz = IndexTupleSize(newtup);
652  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
653  * need to be consistent */
654 
655  /*
656  * Check whether the item can fit on a btree page at all. (Eventually, we
657  * ought to try to apply TOAST methods if not.) We actually need to be
658  * able to fit three items on every page, so restrict any one item to 1/3
659  * the per-page available space. Note that at this point, itemsz doesn't
660  * include the ItemId.
661  *
662  * NOTE: if you change this, see also the similar code in _bt_buildadd().
663  */
664  if (itemsz > BTMaxItemSize(page))
665  ereport(ERROR,
666  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
667  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
668  itemsz, BTMaxItemSize(page),
669  RelationGetRelationName(rel)),
670  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
671  "Consider a function index of an MD5 hash of the value, "
672  "or use full text indexing."),
673  errtableconstraint(heapRel,
674  RelationGetRelationName(rel))));
675 
676  /*----------
677  * If we will need to split the page to put the item on this page,
678  * check whether we can put the tuple somewhere to the right,
679  * instead. Keep scanning right until we
680  * (a) find a page with enough free space,
681  * (b) reach the last page where the tuple can legally go, or
682  * (c) get tired of searching.
683  * (c) is not flippant; it is important because if there are many
684  * pages' worth of equal keys, it's better to split one of the early
685  * pages than to scan all the way to the end of the run of equal keys
686  * on every insert. We implement "get tired" as a random choice,
687  * since stopping after scanning a fixed number of pages wouldn't work
688  * well (we'd never reach the right-hand side of previously split
689  * pages). Currently the probability of moving right is set at 0.99,
690  * which may seem too high to change the behavior much, but it does an
691  * excellent job of preventing O(N^2) behavior with many equal keys.
692  *----------
693  */
694  movedright = false;
695  vacuumed = false;
696  while (PageGetFreeSpace(page) < itemsz)
697  {
698  Buffer rbuf;
699  BlockNumber rblkno;
700 
701  /*
702  * before considering moving right, see if we can obtain enough space
703  * by erasing LP_DEAD items
704  */
705  if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
706  {
707  _bt_vacuum_one_page(rel, buf, heapRel);
708 
709  /*
710  * remember that we vacuumed this page, because that makes the
711  * hint supplied by the caller invalid
712  */
713  vacuumed = true;
714 
715  if (PageGetFreeSpace(page) >= itemsz)
716  break; /* OK, now we have enough space */
717  }
718 
719  /*
720  * nope, so check conditions (b) and (c) enumerated above
721  */
722  if (P_RIGHTMOST(lpageop) ||
723  _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
724  random() <= (MAX_RANDOM_VALUE / 100))
725  break;
726 
727  /*
728  * step right to next non-dead page
729  *
730  * must write-lock that page before releasing write lock on current
731  * page; else someone else's _bt_check_unique scan could fail to see
732  * our insertion. write locks on intermediate dead pages won't do
733  * because we don't know when they will get de-linked from the tree.
734  */
735  rbuf = InvalidBuffer;
736 
737  rblkno = lpageop->btpo_next;
738  for (;;)
739  {
740  rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
741  page = BufferGetPage(rbuf);
742  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
743 
744  /*
745  * If this page was incompletely split, finish the split now. We
746  * do this while holding a lock on the left sibling, which is not
747  * good because finishing the split could be a fairly lengthy
748  * operation. But this should happen very seldom.
749  */
750  if (P_INCOMPLETE_SPLIT(lpageop))
751  {
752  _bt_finish_split(rel, rbuf, stack);
753  rbuf = InvalidBuffer;
754  continue;
755  }
756 
757  if (!P_IGNORE(lpageop))
758  break;
759  if (P_RIGHTMOST(lpageop))
760  elog(ERROR, "fell off the end of index \"%s\"",
761  RelationGetRelationName(rel));
762 
763  rblkno = lpageop->btpo_next;
764  }
765  _bt_relbuf(rel, buf);
766  buf = rbuf;
767  movedright = true;
768  vacuumed = false;
769  }
770 
771  /*
772  * Now we are on the right page, so find the insert position. If we moved
773  * right at all, we know we should insert at the start of the page. If we
774  * didn't move right, we can use the firstlegaloff hint if the caller
775  * supplied one, unless we vacuumed the page which might have moved tuples
776  * around making the hint invalid. If we didn't move right or can't use
777  * the hint, find the position by searching.
778  */
779  if (movedright)
780  newitemoff = P_FIRSTDATAKEY(lpageop);
781  else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
782  newitemoff = firstlegaloff;
783  else
784  newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
785 
786  *bufptr = buf;
787  *offsetptr = newitemoff;
788 }
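The "get tired" rule above stops the rightward scan with roughly 1% probability per page. The real code tests random() against MAX_RANDOM_VALUE / 100; the following self-contained sketch uses the portable rand()/RAND_MAX pair instead to show the same idea:

#include <stdio.h>
#include <stdlib.h>

/* Return true roughly 1 time in 100, mirroring the "get tired" test. */
static int get_tired(void)
{
    return rand() <= RAND_MAX / 100;
}

int main(void)
{
    int trials = 1000000, tired = 0;

    srand(42);
    for (int i = 0; i < trials; i++)
        if (get_tired())
            tired++;
    printf("gave up and split early on %.2f%% of pages\n", 100.0 * tired / trials);
    return 0;
}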

◆ _bt_findsplitloc()

static OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  page,
OffsetNumber  newitemoff,
Size  newitemsz,
bool *  newitemonleft 
)
static

Definition at line 1566 of file nbtinsert.c.

References _bt_checksplitloc(), FindSplitData::best_delta, BTREE_DEFAULT_FILLFACTOR, BTREE_NONLEAF_FILLFACTOR, elog, ERROR, FindSplitData::fillfactor, FindSplitData::firstright, FindSplitData::have_split, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, MAXALIGN, FindSplitData::newitemoff, FindSplitData::newitemonleft, FindSplitData::newitemsz, OffsetNumberNext, FindSplitData::olddataitemstotal, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, RelationGetFillFactor, RelationGetRelationName, FindSplitData::rightspace, and SizeOfPageHeaderData.

Referenced by _bt_insertonpg().

1571 {
1572  BTPageOpaque opaque;
1573  OffsetNumber offnum;
1574  OffsetNumber maxoff;
1575  ItemId itemid;
1576  FindSplitData state;
1577  int leftspace,
1578  rightspace,
1579  goodenough,
1580  olddataitemstotal,
1581  olddataitemstoleft;
1582  bool goodenoughfound;
1583 
1584  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1585 
1586  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1587  newitemsz += sizeof(ItemIdData);
1588 
1589  /* Total free space available on a btree page, after fixed overhead */
1590  leftspace = rightspace =
1591  PageGetPageSize(page) - SizeOfPageHeaderData -
1592  MAXALIGN(sizeof(BTPageOpaqueData));
1593 
1594  /* The right page will have the same high key as the old page */
1595  if (!P_RIGHTMOST(opaque))
1596  {
1597  itemid = PageGetItemId(page, P_HIKEY);
1598  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1599  sizeof(ItemIdData));
1600  }
1601 
1602  /* Count up total space in data items without actually scanning 'em */
1603  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
1604 
1605  state.newitemsz = newitemsz;
1606  state.is_leaf = P_ISLEAF(opaque);
1607  state.is_rightmost = P_RIGHTMOST(opaque);
1608  state.have_split = false;
1609  if (state.is_leaf)
1610  state.fillfactor = RelationGetFillFactor(rel,
1611  BTREE_DEFAULT_FILLFACTOR);
1612  else
1613  state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
1614  state.newitemonleft = false; /* these just to keep compiler quiet */
1615  state.firstright = 0;
1616  state.best_delta = 0;
1617  state.leftspace = leftspace;
1618  state.rightspace = rightspace;
1619  state.olddataitemstotal = olddataitemstotal;
1620  state.newitemoff = newitemoff;
1621 
1622  /*
1623  * Finding the best possible split would require checking all the possible
1624  * split points, because of the high-key and left-key special cases.
1625  * That's probably more work than it's worth; instead, stop as soon as we
1626  * find a "good-enough" split, where good-enough is defined as an
1627  * imbalance in free space of no more than pagesize/16 (arbitrary...) This
1628  * should let us stop near the middle on most pages, instead of plowing to
1629  * the end.
1630  */
1631  goodenough = leftspace / 16;
1632 
1633  /*
1634  * Scan through the data items and calculate space usage for a split at
1635  * each possible position.
1636  */
1637  olddataitemstoleft = 0;
1638  goodenoughfound = false;
1639  maxoff = PageGetMaxOffsetNumber(page);
1640 
1641  for (offnum = P_FIRSTDATAKEY(opaque);
1642  offnum <= maxoff;
1643  offnum = OffsetNumberNext(offnum))
1644  {
1645  Size itemsz;
1646 
1647  itemid = PageGetItemId(page, offnum);
1648  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1649 
1650  /*
1651  * Will the new item go to left or right of split?
1652  */
1653  if (offnum > newitemoff)
1654  _bt_checksplitloc(&state, offnum, true,
1655  olddataitemstoleft, itemsz);
1656 
1657  else if (offnum < newitemoff)
1658  _bt_checksplitloc(&state, offnum, false,
1659  olddataitemstoleft, itemsz);
1660  else
1661  {
1662  /* need to try it both ways! */
1663  _bt_checksplitloc(&state, offnum, true,
1664  olddataitemstoleft, itemsz);
1665 
1666  _bt_checksplitloc(&state, offnum, false,
1667  olddataitemstoleft, itemsz);
1668  }
1669 
1670  /* Abort scan once we find a good-enough choice */
1671  if (state.have_split && state.best_delta <= goodenough)
1672  {
1673  goodenoughfound = true;
1674  break;
1675  }
1676 
1677  olddataitemstoleft += itemsz;
1678  }
1679 
1680  /*
1681  * If the new item goes as the last item, check for splitting so that all
1682  * the old items go to the left page and the new item goes to the right
1683  * page.
1684  */
1685  if (newitemoff > maxoff && !goodenoughfound)
1686  _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
1687 
1688  /*
1689  * I believe it is not possible to fail to find a feasible split, but just
1690  * in case ...
1691  */
1692  if (!state.have_split)
1693  elog(ERROR, "could not find a feasible split point for index \"%s\"",
1694  RelationGetRelationName(rel));
1695 
1696  *newitemonleft = state.newitemonleft;
1697  return state.firstright;
1698 }
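For orientation, the "good enough" threshold above is about 1/16 of the usable page space. A rough, stand-alone calculation, assuming an 8 kB page, a 24-byte page header and a 16-byte MAXALIGNed btree special area (the exact sizes depend on the build):

#include <stdio.h>

int main(void)
{
    int pagesize      = 8192;
    int page_header   = 24;   /* assumed SizeOfPageHeaderData */
    int btree_special = 16;   /* assumed MAXALIGN(sizeof(BTPageOpaqueData)) */

    int leftspace  = pagesize - page_header - btree_special;
    int goodenough = leftspace / 16;

    printf("leftspace = %d bytes, good-enough imbalance <= %d bytes\n",
           leftspace, goodenough);
    return 0;
}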

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 1924 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc(), _bt_getstackbuf(), and _bt_moveright().

1925 {
1926  Page lpage = BufferGetPage(lbuf);
1927  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1928  Buffer rbuf;
1929  Page rpage;
1930  BTPageOpaque rpageop;
1931  bool was_root;
1932  bool was_only;
1933 
1934  Assert(P_INCOMPLETE_SPLIT(lpageop));
1935 
1936  /* Lock right sibling, the one missing the downlink */
1937  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1938  rpage = BufferGetPage(rbuf);
1939  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1940 
1941  /* Could this be a root split? */
1942  if (!stack)
1943  {
1944  Buffer metabuf;
1945  Page metapg;
1946  BTMetaPageData *metad;
1947 
1948  /* acquire lock on the metapage */
1949  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1950  metapg = BufferGetPage(metabuf);
1951  metad = BTPageGetMeta(metapg);
1952 
1953  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1954 
1955  _bt_relbuf(rel, metabuf);
1956  }
1957  else
1958  was_root = false;
1959 
1960  /* Was this the only page on the level before split? */
1961  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1962 
1963  elog(DEBUG1, "finishing incomplete split of %u/%u",
1964  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1965 
1966  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1967 }

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
int  access 
)

Definition at line 1983 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_branch_parent().

1984 {
1985  BlockNumber blkno;
1986  OffsetNumber start;
1987 
1988  blkno = stack->bts_blkno;
1989  start = stack->bts_offset;
1990 
1991  for (;;)
1992  {
1993  Buffer buf;
1994  Page page;
1995  BTPageOpaque opaque;
1996 
1997  buf = _bt_getbuf(rel, blkno, access);
1998  page = BufferGetPage(buf);
1999  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2000 
2001  if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
2002  {
2003  _bt_finish_split(rel, buf, stack->bts_parent);
2004  continue;
2005  }
2006 
2007  if (!P_IGNORE(opaque))
2008  {
2009  OffsetNumber offnum,
2010  minoff,
2011  maxoff;
2012  ItemId itemid;
2013  IndexTuple item;
2014 
2015  minoff = P_FIRSTDATAKEY(opaque);
2016  maxoff = PageGetMaxOffsetNumber(page);
2017 
2018  /*
2019  * start = InvalidOffsetNumber means "search the whole page". We
2020  * need this test anyway due to possibility that page has a high
2021  * key now when it didn't before.
2022  */
2023  if (start < minoff)
2024  start = minoff;
2025 
2026  /*
2027  * Need this check too, to guard against possibility that page
2028  * split since we visited it originally.
2029  */
2030  if (start > maxoff)
2031  start = OffsetNumberNext(maxoff);
2032 
2033  /*
2034  * These loops will check every item on the page --- but in an
2035  * order that's attuned to the probability of where it actually
2036  * is. Scan to the right first, then to the left.
2037  */
2038  for (offnum = start;
2039  offnum <= maxoff;
2040  offnum = OffsetNumberNext(offnum))
2041  {
2042  itemid = PageGetItemId(page, offnum);
2043  item = (IndexTuple) PageGetItem(page, itemid);
2044 
2045  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2046  {
2047  /* Return accurate pointer to where link is now */
2048  stack->bts_blkno = blkno;
2049  stack->bts_offset = offnum;
2050  return buf;
2051  }
2052  }
2053 
2054  for (offnum = OffsetNumberPrev(start);
2055  offnum >= minoff;
2056  offnum = OffsetNumberPrev(offnum))
2057  {
2058  itemid = PageGetItemId(page, offnum);
2059  item = (IndexTuple) PageGetItem(page, itemid);
2060 
2061  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
2062  {
2063  /* Return accurate pointer to where link is now */
2064  stack->bts_blkno = blkno;
2065  stack->bts_offset = offnum;
2066  return buf;
2067  }
2068  }
2069  }
2070 
2071  /*
2072  * The item we're looking for moved right at least one page.
2073  */
2074  if (P_RIGHTMOST(opaque))
2075  {
2076  _bt_relbuf(rel, buf);
2077  return InvalidBuffer;
2078  }
2079  blkno = opaque->btpo_next;
2080  start = InvalidOffsetNumber;
2081  _bt_relbuf(rel, buf);
2082  }
2083 }
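The two loops above scan rightward from the remembered offset first, then leftward, because the downlink usually has not moved, or has moved only slightly to the right, since the descent. A minimal sketch of that search order over a plain array (not PostgreSQL code):

#include <stdio.h>

static int find_from_hint(const unsigned *links, int n, int start, unsigned target)
{
    for (int i = start; i < n; i++)          /* rightward first */
        if (links[i] == target)
            return i;
    for (int i = start - 1; i >= 0; i--)     /* then leftward */
        if (links[i] == target)
            return i;
    return -1;                               /* caller must move on to the next page */
}

int main(void)
{
    unsigned links[] = { 10, 11, 12, 13, 14 };

    printf("found at offset %d\n", find_from_hint(links, 5, 2, 14));
    return 0;
}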

◆ _bt_insert_parent()

static void _bt_insert_parent ( Relation  rel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  is_root,
bool  is_only 
)
static

Definition at line 1813 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, BTreeInnerTupleSetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ERROR, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, BTPageOpaqueData::level, P_HIKEY, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), and RelationGetRelationName.

Referenced by _bt_finish_split(), and _bt_insertonpg().

1819 {
1820  /*
1821  * Here we have to do something Lehman and Yao don't talk about: deal with
1822  * a root split and construction of a new root. If our stack is empty
1823  * then we have just split a node on what had been the root level when we
1824  * descended the tree. If it was still the root then we perform a
1825  * new-root construction. If it *wasn't* the root anymore, search to find
1826  * the next higher level that someone constructed meanwhile, and find the
1827  * right place to insert as for the normal case.
1828  *
1829  * If we have to search for the parent level, we do so by re-descending
1830  * from the root. This is not super-efficient, but it's rare enough not
1831  * to matter.
1832  */
1833  if (is_root)
1834  {
1835  Buffer rootbuf;
1836 
1837  Assert(stack == NULL);
1838  Assert(is_only);
1839  /* create a new root node and update the metapage */
1840  rootbuf = _bt_newroot(rel, buf, rbuf);
1841  /* release the split buffers */
1842  _bt_relbuf(rel, rootbuf);
1843  _bt_relbuf(rel, rbuf);
1844  _bt_relbuf(rel, buf);
1845  }
1846  else
1847  {
1848  BlockNumber bknum = BufferGetBlockNumber(buf);
1849  BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1850  Page page = BufferGetPage(buf);
1851  IndexTuple new_item;
1852  BTStackData fakestack;
1853  IndexTuple ritem;
1854  Buffer pbuf;
1855 
1856  if (stack == NULL)
1857  {
1858  BTPageOpaque lpageop;
1859 
1860  elog(DEBUG2, "concurrent ROOT page split");
1861  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1862  /* Find the leftmost page at the next level up */
1863  pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
1864  NULL);
1865  /* Set up a phony stack entry pointing there */
1866  stack = &fakestack;
1867  stack->bts_blkno = BufferGetBlockNumber(pbuf);
1868  stack->bts_offset = InvalidOffsetNumber;
1869  stack->bts_btentry = InvalidBlockNumber;
1870  stack->bts_parent = NULL;
1871  _bt_relbuf(rel, pbuf);
1872  }
1873 
1874  /* get high key from left page == lower bound for new right page */
1875  ritem = (IndexTuple) PageGetItem(page,
1876  PageGetItemId(page, P_HIKEY));
1877 
1878  /* form an index tuple that points at the new right page */
1879  new_item = CopyIndexTuple(ritem);
1880  BTreeInnerTupleSetDownLink(new_item, rbknum);
1881 
1882  /*
1883  * Find the parent buffer and get the parent page.
1884  *
1885  * Oops - if we were moved right then we need to change stack item! We
1886  * want to find parent pointing to where we are, right ? - vadim
1887  * 05/27/97
1888  */
1889  stack->bts_btentry = bknum;
1890  pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1891 
1892  /*
1893  * Now we can unlock the right child. The left child will be unlocked
1894  * by _bt_insertonpg().
1895  */
1896  _bt_relbuf(rel, rbuf);
1897 
1898  /* Check for error only after writing children */
1899  if (pbuf == InvalidBuffer)
1900  elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1901  RelationGetRelationName(rel), bknum, rbknum);
1902 
1903  /* Recursively update the parent */
1904  _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
1905  new_item, stack->bts_offset + 1,
1906  is_only);
1907 
1908  /* be tidy */
1909  pfree(new_item);
1910  }
1911 }
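
The recursion above consults the stack of parent positions remembered during the initial descent; when that stack is missing because of a concurrent root split, a phony entry pointing at the leftmost page one level up is substituted before re-locating the parent. The following standalone sketch models only the stack-walking idea with hypothetical Toy* names and made-up block numbers; it is not the PostgreSQL API.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int ToyBlockNumber;

typedef struct ToyStack
{
    ToyBlockNumber   blkno;    /* parent page remembered during descent */
    unsigned short   offset;   /* offset of the downlink we came through */
    struct ToyStack *parent;   /* next level up, NULL at the (old) root */
} ToyStack;

static ToyStack *
push(ToyStack *parent, ToyBlockNumber blkno, unsigned short offset)
{
    ToyStack *s = malloc(sizeof(ToyStack));

    s->blkno = blkno;
    s->offset = offset;
    s->parent = parent;
    return s;
}

int
main(void)
{
    /* pretend the descent was root(7) -> internal(4) -> leaf, and the leaf just split */
    ToyStack *stack = push(push(NULL, 7, 1), 4, 3);

    /* propagate upward: insert the new downlink one position after the remembered one */
    for (ToyStack *s = stack; s != NULL; s = s->parent)
        printf("insert downlink into block %u at offset %d\n",
               s->blkno, s->offset + 1);

    /* an empty stack would mean the split page had been the root: build a new root */
    return 0;
}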

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
OffsetNumber  newitemoff,
bool  split_only_page 
)
static

Definition at line 821 of file nbtinsert.c.

References _bt_findsplitloc(), _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_pgaddtup(), _bt_relbuf(), _bt_split(), _bt_upgrademetapage(), Assert, BlockNumberIsValid, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_VERSION, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, xl_btree_metadata::oldest_btpo_xact, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageGetFreeSpace(), PageGetSpecialPointer, PageSetLSN, PANIC, PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationGetTargetBlock, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().

828 {
829  Page page;
830  BTPageOpaque lpageop;
831  OffsetNumber firstright = InvalidOffsetNumber;
832  Size itemsz;
833 
834  page = BufferGetPage(buf);
835  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
836 
837  /* child buffer must be given iff inserting on an internal page */
838  Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
839  /* tuple must have appropriate number of attributes */
840  Assert(!P_ISLEAF(lpageop) ||
841  BTreeTupleGetNAtts(itup, rel) ==
842  IndexRelationGetNumberOfAttributes(rel));
843  Assert(P_ISLEAF(lpageop) ||
844  BTreeTupleGetNAtts(itup, rel) ==
845  IndexRelationGetNumberOfKeyAttributes(rel));
846 
847  /* The caller should've finished any incomplete splits already. */
848  if (P_INCOMPLETE_SPLIT(lpageop))
849  elog(ERROR, "cannot insert to incompletely split page %u",
850  BufferGetBlockNumber(buf));
851 
852  itemsz = IndexTupleSize(itup);
853  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
854  * need to be consistent */
855 
856  /*
857  * Do we need to split the page to fit the item on it?
858  *
859  * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
860  * so this comparison is correct even though we appear to be accounting
861  * only for the item and not for its line pointer.
862  */
863  if (PageGetFreeSpace(page) < itemsz)
864  {
865  bool is_root = P_ISROOT(lpageop);
866  bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
867  bool newitemonleft;
868  Buffer rbuf;
869 
870  /*
871  * If we're here then a pagesplit is needed. We should never reach
872  * here if we're using the fastpath since we should have checked for
873  * all the required conditions, including the fact that this page has
874  * enough freespace. Note that this routine can in theory deal with
875  * the situation where a NULL stack pointer is passed (that's what
876  * would happen if the fastpath is taken). But that path is much
877  * slower, defeating the very purpose of the optimization. The
878  * following assertion should protect us from any future code changes
879  * that invalidate those assumptions.
880  *
881  * Note that whenever we fail to take the fastpath, we clear the
882  * cached block. Checking for a valid cached block at this point is
883  * enough to decide whether we're in a fastpath or not.
884  */
885  Assert(!(P_ISLEAF(lpageop) &&
886  BlockNumberIsValid(RelationGetTargetBlock(rel))));
887 
888  /* Choose the split point */
889  firstright = _bt_findsplitloc(rel, page,
890  newitemoff, itemsz,
891  &newitemonleft);
892 
893  /* split the buffer into left and right halves */
894  rbuf = _bt_split(rel, buf, cbuf, firstright,
895  newitemoff, itemsz, itup, newitemonleft);
896  PredicateLockPageSplit(rel,
897  BufferGetBlockNumber(buf),
898  BufferGetBlockNumber(rbuf));
899 
900  /*----------
901  * By here,
902  *
903  * + our target page has been split;
904  * + the original tuple has been inserted;
905  * + we have write locks on both the old (left half)
906  * and new (right half) buffers, after the split; and
907  * + we know the key we want to insert into the parent
908  * (it's the "high key" on the left child page).
909  *
910  * We're ready to do the parent insertion. We need to hold onto the
911  * locks for the child pages until we locate the parent, but we can
912  * release them before doing the actual insertion (see Lehman and Yao
913  * for the reasoning).
914  *----------
915  */
916  _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
917  }
918  else
919  {
920  Buffer metabuf = InvalidBuffer;
921  Page metapg = NULL;
922  BTMetaPageData *metad = NULL;
923  OffsetNumber itup_off;
924  BlockNumber itup_blkno;
925  BlockNumber cachedBlock = InvalidBlockNumber;
926 
927  itup_off = newitemoff;
928  itup_blkno = BufferGetBlockNumber(buf);
929 
930  /*
931  * If we are doing this insert because we split a page that was the
932  * only one on its tree level, but was not the root, it may have been
933  * the "fast root". We need to ensure that the fast root link points
934  * at or above the current page. We can safely acquire a lock on the
935  * metapage here --- see comments for _bt_newroot().
936  */
937  if (split_only_page)
938  {
939  Assert(!P_ISLEAF(lpageop));
940 
941  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
942  metapg = BufferGetPage(metabuf);
943  metad = BTPageGetMeta(metapg);
944 
945  if (metad->btm_fastlevel >= lpageop->btpo.level)
946  {
947  /* no update wanted */
948  _bt_relbuf(rel, metabuf);
949  metabuf = InvalidBuffer;
950  }
951  }
952 
953  /*
954  * Every internal page should have exactly one negative infinity item
955  * at all times. Only _bt_split() and _bt_newroot() should add items
956  * that become negative infinity items through truncation, since
957  * they're the only routines that allocate new internal pages. Do not
958  * allow a retail insertion of a new item at the negative infinity
959  * offset.
960  */
961  if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
962  elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
963  itup_blkno, RelationGetRelationName(rel));
964 
965  /* Do the update. No ereport(ERROR) until changes are logged */
966  START_CRIT_SECTION();
967 
968  if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
969  elog(PANIC, "failed to add new item to block %u in index \"%s\"",
970  itup_blkno, RelationGetRelationName(rel));
971 
972  MarkBufferDirty(buf);
973 
974  if (BufferIsValid(metabuf))
975  {
976  /* upgrade meta-page if needed */
977  if (metad->btm_version < BTREE_VERSION)
978  _bt_upgrademetapage(metapg);
979  metad->btm_fastroot = itup_blkno;
980  metad->btm_fastlevel = lpageop->btpo.level;
981  MarkBufferDirty(metabuf);
982  }
983 
984  /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
985  if (BufferIsValid(cbuf))
986  {
987  Page cpage = BufferGetPage(cbuf);
988  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
989 
990  Assert(P_INCOMPLETE_SPLIT(cpageop));
991  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
992  MarkBufferDirty(cbuf);
993  }
994 
995  /*
996  * Cache the block information if we just inserted into the rightmost
997  * leaf page of the index and it's not the root page. For very small
998  * index where root is also the leaf, there is no point trying for any
999  * optimization.
1000  */
1001  if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
1002  cachedBlock = BufferGetBlockNumber(buf);
1003 
1004  /* XLOG stuff */
1005  if (RelationNeedsWAL(rel))
1006  {
1007  xl_btree_insert xlrec;
1008  xl_btree_metadata xlmeta;
1009  uint8 xlinfo;
1010  XLogRecPtr recptr;
1011 
1012  xlrec.offnum = itup_off;
1013 
1014  XLogBeginInsert();
1015  XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1016 
1017  if (P_ISLEAF(lpageop))
1018  xlinfo = XLOG_BTREE_INSERT_LEAF;
1019  else
1020  {
1021  /*
1022  * Register the left child whose INCOMPLETE_SPLIT flag was
1023  * cleared.
1024  */
1025  XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1026 
1027  xlinfo = XLOG_BTREE_INSERT_UPPER;
1028  }
1029 
1030  if (BufferIsValid(metabuf))
1031  {
1032  xlmeta.root = metad->btm_root;
1033  xlmeta.level = metad->btm_level;
1034  xlmeta.fastroot = metad->btm_fastroot;
1035  xlmeta.fastlevel = metad->btm_fastlevel;
1036  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
1037  xlmeta.last_cleanup_num_heap_tuples =
1038  metad->btm_last_cleanup_num_heap_tuples;
1039 
1040  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
1041  XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
1042 
1043  xlinfo = XLOG_BTREE_INSERT_META;
1044  }
1045 
1046  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1047  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1048 
1049  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1050 
1051  if (BufferIsValid(metabuf))
1052  {
1053  PageSetLSN(metapg, recptr);
1054  }
1055  if (BufferIsValid(cbuf))
1056  {
1057  PageSetLSN(BufferGetPage(cbuf), recptr);
1058  }
1059 
1060  PageSetLSN(page, recptr);
1061  }
1062 
1063  END_CRIT_SECTION();
1064 
1065  /* release buffers */
1066  if (BufferIsValid(metabuf))
1067  _bt_relbuf(rel, metabuf);
1068  if (BufferIsValid(cbuf))
1069  _bt_relbuf(rel, cbuf);
1070  _bt_relbuf(rel, buf);
1071 
1072  /*
1073  * If we decided to cache the insertion target block, then set it now.
1074  * But before that, check for the height of the tree and don't go for
1075  * the optimization for small indexes. We defer that check to this
1076  * point to ensure that we don't call _bt_getrootheight while holding
1077  * lock on any other block.
1078  *
1079  * We do this after dropping locks on all buffers. So the information
1080  * about whether the insertion block is still the rightmost block or
1081  * not may have changed in between. But we will deal with that during
1082  * next insert operation. No special care is required while setting
1083  * it.
1084  */
1085  if (BlockNumberIsValid(cachedBlock) &&
1086  _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1087  RelationSetTargetBlock(rel, cachedBlock);
1088  }
1089 }
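
The split decision above compares PageGetFreeSpace() against the MAXALIGN'ed tuple size, and the comment notes that the free-space figure already reserves room for one line pointer. A minimal standalone sketch of that accounting, assuming a hypothetical 8-byte alignment and a 4-byte line pointer slot (the TOY_* names are stand-ins, not PostgreSQL's macros):

#include <stdio.h>
#include <stddef.h>

#define TOY_ALIGNOF      8                        /* assume 8-byte MAXALIGN */
#define TOY_MAXALIGN(x)  (((x) + TOY_ALIGNOF - 1) & ~(size_t) (TOY_ALIGNOF - 1))
#define TOY_LINE_POINTER 4                        /* one ItemIdData-like slot */

/* free space left once one line pointer slot has been reserved */
static size_t
toy_free_space(size_t pd_upper, size_t pd_lower)
{
    size_t gap = pd_upper - pd_lower;

    return (gap >= TOY_LINE_POINTER) ? gap - TOY_LINE_POINTER : 0;
}

int
main(void)
{
    size_t itemsz = TOY_MAXALIGN(21);              /* a 21-byte tuple rounds up to 24 */
    size_t freesp = toy_free_space(120, 100);      /* 20-byte hole minus the line pointer */

    if (freesp < itemsz)
        printf("page full (%zu < %zu): would split\n", freesp, itemsz);
    else
        printf("fits (%zu >= %zu): plain insert\n", freesp, itemsz);
    return 0;
}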

◆ _bt_isequal()

static bool _bt_isequal ( TupleDesc  itupdesc,
Page  page,
OffsetNumber  offnum,
int  keysz,
ScanKey  scankey 
)
static

Definition at line 2315 of file nbtinsert.c.

References Assert, DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, ScanKeyData::sk_argument, ScanKeyData::sk_attno, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_check_unique().

2317 {
2318  IndexTuple itup;
2319  int i;
2320 
2321  /* Better be comparing to a leaf item */
2322  Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
2323 
2324  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2325 
2326  /*
2327  * It's okay that we might perform a comparison against a truncated page
2328  * high key when caller needs to determine if _bt_check_unique scan must
2329  * continue on to the next page. Caller never asks us to compare non-key
2330  * attributes within an INCLUDE index.
2331  */
2332  for (i = 1; i <= keysz; i++)
2333  {
2334  AttrNumber attno;
2335  Datum datum;
2336  bool isNull;
2337  int32 result;
2338 
2339  attno = scankey->sk_attno;
2340  Assert(attno == i);
2341  datum = index_getattr(itup, attno, itupdesc, &isNull);
2342 
2343  /* NULLs are never equal to anything */
2344  if (isNull || (scankey->sk_flags & SK_ISNULL))
2345  return false;
2346 
2347  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
2348  scankey->sk_collation,
2349  datum,
2350  scankey->sk_argument));
2351 
2352  if (result != 0)
2353  return false;
2354 
2355  scankey++;
2356  }
2357 
2358  /* if we get here, the keys are equal */
2359  return true;
2360 }
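
The loop compares attributes in scan-key order and bails out at the first mismatch, treating any NULL as unequal. A standalone sketch of the same rule over toy datums (ToyDatum and toy_isequal are hypothetical names, not the ScanKey machinery):

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    int  value;
    bool isnull;
} ToyDatum;

static bool
toy_isequal(const ToyDatum *tuple, const ToyDatum *key, int nkeys)
{
    for (int i = 0; i < nkeys; i++)
    {
        /* NULLs are never equal to anything, including other NULLs */
        if (tuple[i].isnull || key[i].isnull)
            return false;
        if (tuple[i].value != key[i].value)
            return false;
    }
    return true;
}

int
main(void)
{
    ToyDatum tuple[]   = {{42, false}, {7, false}};
    ToyDatum key[]     = {{42, false}, {7, false}};
    ToyDatum keynull[] = {{42, false}, {0, true}};

    printf("all keys match: %d\n", toy_isequal(tuple, key, 2));      /* prints 1 */
    printf("NULL in key:    %d\n", toy_isequal(tuple, keynull, 2));  /* prints 0 */
    return 0;
}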

◆ _bt_newroot()

static Buffer _bt_newroot ( Relation  rel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 2104 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), Assert, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_VERSION, BTreeInnerTupleSetDownLink, BTreeTupleGetNAtts, BTreeTupleSetNAtts, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, MarkBufferDirty(), xl_btree_metadata::oldest_btpo_xact, P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NEW, P_NONE, PageAddItem, PageGetItem, PageGetItemId, PageGetSpecialPointer, PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().

2105 {
2106  Buffer rootbuf;
2107  Page lpage,
2108  rootpage;
2109  BlockNumber lbkno,
2110  rbkno;
2111  BlockNumber rootblknum;
2112  BTPageOpaque rootopaque;
2113  BTPageOpaque lopaque;
2114  ItemId itemid;
2115  IndexTuple item;
2116  IndexTuple left_item;
2117  Size left_item_sz;
2118  IndexTuple right_item;
2119  Size right_item_sz;
2120  Buffer metabuf;
2121  Page metapg;
2122  BTMetaPageData *metad;
2123 
2124  lbkno = BufferGetBlockNumber(lbuf);
2125  rbkno = BufferGetBlockNumber(rbuf);
2126  lpage = BufferGetPage(lbuf);
2127  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
2128 
2129  /* get a new root page */
2130  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
2131  rootpage = BufferGetPage(rootbuf);
2132  rootblknum = BufferGetBlockNumber(rootbuf);
2133 
2134  /* acquire lock on the metapage */
2135  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2136  metapg = BufferGetPage(metabuf);
2137  metad = BTPageGetMeta(metapg);
2138 
2139  /*
2140  * Create downlink item for left page (old root). Since this will be the
2141  * first item in a non-leaf page, it implicitly has minus-infinity key
2142  * value, so we need not store any actual key in it.
2143  */
2144  left_item_sz = sizeof(IndexTupleData);
2145  left_item = (IndexTuple) palloc(left_item_sz);
2146  left_item->t_info = left_item_sz;
2147  BTreeInnerTupleSetDownLink(left_item, lbkno);
2148  BTreeTupleSetNAtts(left_item, 0);
2149 
2150  /*
2151  * Create downlink item for right page. The key for it is obtained from
2152  * the "high key" position in the left page.
2153  */
2154  itemid = PageGetItemId(lpage, P_HIKEY);
2155  right_item_sz = ItemIdGetLength(itemid);
2156  item = (IndexTuple) PageGetItem(lpage, itemid);
2157  right_item = CopyIndexTuple(item);
2158  BTreeInnerTupleSetDownLink(right_item, rbkno);
2159 
2160  /* NO EREPORT(ERROR) from here till newroot op is logged */
2161  START_CRIT_SECTION();
2162 
2163  /* upgrade metapage if needed */
2164  if (metad->btm_version < BTREE_VERSION)
2165  _bt_upgrademetapage(metapg);
2166 
2167  /* set btree special data */
2168  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
2169  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2170  rootopaque->btpo_flags = BTP_ROOT;
2171  rootopaque->btpo.level =
2172  ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
2173  rootopaque->btpo_cycleid = 0;
2174 
2175  /* update metapage data */
2176  metad->btm_root = rootblknum;
2177  metad->btm_level = rootopaque->btpo.level;
2178  metad->btm_fastroot = rootblknum;
2179  metad->btm_fastlevel = rootopaque->btpo.level;
2180 
2181  /*
2182  * Insert the left page pointer into the new root page. The root page is
2183  * the rightmost page on its level so there is no "high key" in it; the
2184  * two items will go into positions P_HIKEY and P_FIRSTKEY.
2185  *
2186  * Note: we *must* insert the two items in item-number order, for the
2187  * benefit of _bt_restore_page().
2188  */
2189  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2190  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2191  false, false) == InvalidOffsetNumber)
2192  elog(PANIC, "failed to add leftkey to new root page"
2193  " while splitting block %u of index \"%s\"",
2194  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2195 
2196  /*
2197  * insert the right page pointer into the new root page.
2198  */
2199  Assert(BTreeTupleGetNAtts(right_item, rel) ==
2200  IndexRelationGetNumberOfKeyAttributes(rel));
2201  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2202  false, false) == InvalidOffsetNumber)
2203  elog(PANIC, "failed to add rightkey to new root page"
2204  " while splitting block %u of index \"%s\"",
2205  BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2206 
2207  /* Clear the incomplete-split flag in the left child */
2208  Assert(P_INCOMPLETE_SPLIT(lopaque));
2209  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2210  MarkBufferDirty(lbuf);
2211 
2212  MarkBufferDirty(rootbuf);
2213  MarkBufferDirty(metabuf);
2214 
2215  /* XLOG stuff */
2216  if (RelationNeedsWAL(rel))
2217  {
2218  xl_btree_newroot xlrec;
2219  XLogRecPtr recptr;
2220  xl_btree_metadata md;
2221 
2222  xlrec.rootblk = rootblknum;
2223  xlrec.level = metad->btm_level;
2224 
2225  XLogBeginInsert();
2226  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2227 
2228  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2229  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2230  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2231 
2232  md.root = rootblknum;
2233  md.level = metad->btm_level;
2234  md.fastroot = rootblknum;
2235  md.fastlevel = metad->btm_level;
2236  md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2237  md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2238 
2239  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2240 
2241  /*
2242  * Direct access to page is not good but faster - we should implement
2243  * some new func in page API.
2244  */
2245  XLogRegisterBufData(0,
2246  (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2247  ((PageHeader) rootpage)->pd_special -
2248  ((PageHeader) rootpage)->pd_upper);
2249 
2250  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2251 
2252  PageSetLSN(lpage, recptr);
2253  PageSetLSN(rootpage, recptr);
2254  PageSetLSN(metapg, recptr);
2255  }
2256 
2257  END_CRIT_SECTION();
2258 
2259  /* done with metapage */
2260  _bt_relbuf(rel, metabuf);
2261 
2262  pfree(left_item);
2263  pfree(right_item);
2264 
2265  return rootbuf;
2266 }
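
The new root ends up with exactly two items: a minus-infinity downlink to the old root (the left page) and a downlink carrying the left page's high key, which bounds the right page from below. A standalone sketch of that shape with toy structures (ToyDownlink is a hypothetical stand-in, not the on-page tuple format):

#include <stdio.h>

typedef struct
{
    unsigned int child;      /* block number of the child page */
    int          key;        /* separator key; ignored for the first item */
    int          minus_inf;  /* the first item on an internal page carries no key */
} ToyDownlink;

int
main(void)
{
    unsigned int leftblk = 3, rightblk = 9;
    int          left_highkey = 100;              /* high key taken from the left page */

    ToyDownlink root[2] = {
        {leftblk, 0, 1},                          /* implicit minus-infinity key */
        {rightblk, left_highkey, 0},              /* lower bound of the right page */
    };

    for (int i = 0; i < 2; i++)
        printf("item %d -> block %u (%s)\n", i + 1, root[i].child,
               root[i].minus_inf ? "-inf" : "keyed");
    return 0;
}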

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off 
)
static

Definition at line 2284 of file nbtinsert.c.

References BTreeTupleSetNAtts, InvalidOffsetNumber, P_FIRSTDATAKEY, P_ISLEAF, PageAddItem, PageGetSpecialPointer, and IndexTupleData::t_info.

Referenced by _bt_insertonpg(), and _bt_split().

2288 {
2289  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2290  IndexTupleData trunctuple;
2291 
2292  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
2293  {
2294  trunctuple = *itup;
2295  trunctuple.t_info = sizeof(IndexTupleData);
2296  BTreeTupleSetNAtts(&trunctuple, 0);
2297  itup = &trunctuple;
2298  itemsize = sizeof(IndexTupleData);
2299  }
2300 
2301  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
2302  false, false) == InvalidOffsetNumber)
2303  return false;
2304 
2305  return true;
2306 }
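
The special case above stores the first data item of an internal page with zero attributes, so it acts as an implicit minus-infinity key. A standalone sketch of that truncation decision, assuming toy types (ToyItem and toy_prepare_item are hypothetical, not the IndexTuple format):

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    int natts;   /* number of attributes kept in the stored tuple */
    int key;     /* toy key value (meaningless when natts == 0) */
} ToyItem;

static ToyItem
toy_prepare_item(ToyItem item, bool is_leaf, int itup_off, int first_data_off)
{
    if (!is_leaf && itup_off == first_data_off)
    {
        /* truncate: keep the downlink only, no key attributes */
        item.natts = 0;
        item.key = 0;
    }
    return item;
}

int
main(void)
{
    ToyItem incoming = {2, 55};
    ToyItem stored = toy_prepare_item(incoming, false, 2, 2);

    printf("stored with %d attribute(s)\n", stored.natts);   /* 0 on an internal page */
    return 0;
}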

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  firstright,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool  newitemonleft 
)
static

Definition at line 1107 of file nbtinsert.c.

References _bt_getbuf(), _bt_nonkey_truncate(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_vacuum_cycleid(), Assert, BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BufferGetBlockNumber(), BufferGetPage, BufferGetPageSize, BufferIsValid, elog, END_CRIT_SECTION, ERROR, xl_btree_split::firstright, i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, BTPageOpaqueData::level, xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_NEW, P_RIGHTMOST, PageAddItem, PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPage(), PageRestoreTempPage(), PageSetLSN, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_L_HIGHKEY, XLOG_BTREE_SPLIT_R, XLOG_BTREE_SPLIT_R_HIGHKEY, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().

1110 {
1111  Buffer rbuf;
1112  Page origpage;
1113  Page leftpage,
1114  rightpage;
1115  BlockNumber origpagenumber,
1116  rightpagenumber;
1117  BTPageOpaque ropaque,
1118  lopaque,
1119  oopaque;
1120  Buffer sbuf = InvalidBuffer;
1121  Page spage = NULL;
1122  BTPageOpaque sopaque = NULL;
1123  Size itemsz;
1124  ItemId itemid;
1125  IndexTuple item;
1126  OffsetNumber leftoff,
1127  rightoff;
1128  OffsetNumber maxoff;
1129  OffsetNumber i;
1130  bool isleaf;
1131  IndexTuple lefthikey;
1132  int indnatts = IndexRelationGetNumberOfAttributes(rel);
1133  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1134 
1135  /* Acquire a new page to split into */
1136  rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1137 
1138  /*
1139  * origpage is the original page to be split. leftpage is a temporary
1140  * buffer that receives the left-sibling data, which will be copied back
1141  * into origpage on success. rightpage is the new page that receives the
1142  * right-sibling data. If we fail before reaching the critical section,
1143  * origpage hasn't been modified and leftpage is only workspace. In
1144  * principle we shouldn't need to worry about rightpage either, because it
1145  * hasn't been linked into the btree page structure; but to avoid leaving
1146  * possibly-confusing junk behind, we are careful to rewrite rightpage as
1147  * zeroes before throwing any error.
1148  */
1149  origpage = BufferGetPage(buf);
1150  leftpage = PageGetTempPage(origpage);
1151  rightpage = BufferGetPage(rbuf);
1152 
1153  origpagenumber = BufferGetBlockNumber(buf);
1154  rightpagenumber = BufferGetBlockNumber(rbuf);
1155 
1156  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1157  /* rightpage was already initialized by _bt_getbuf */
1158 
1159  /*
1160  * Copy the original page's LSN into leftpage, which will become the
1161  * updated version of the page. We need this because XLogInsert will
1162  * examine the LSN and possibly dump it in a page image.
1163  */
1164  PageSetLSN(leftpage, PageGetLSN(origpage));
1165 
1166  /* init btree private data */
1167  oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1168  lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1169  ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1170 
1171  isleaf = P_ISLEAF(oopaque);
1172 
1173  /* if we're splitting this page, it won't be the root when we're done */
1174  /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1175  lopaque->btpo_flags = oopaque->btpo_flags;
1176  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1177  ropaque->btpo_flags = lopaque->btpo_flags;
1178  /* set flag in left page indicating that the right page has no downlink */
1179  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1180  lopaque->btpo_prev = oopaque->btpo_prev;
1181  lopaque->btpo_next = rightpagenumber;
1182  ropaque->btpo_prev = origpagenumber;
1183  ropaque->btpo_next = oopaque->btpo_next;
1184  lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
1185  /* Since we already have write-lock on both pages, ok to read cycleid */
1186  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1187  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1188 
1189  /*
1190  * If the page we're splitting is not the rightmost page at its level in
1191  * the tree, then the first entry on the page is the high key for the
1192  * page. We need to copy that to the right half. Otherwise (meaning the
1193  * rightmost page case), all the items on the right half will be user
1194  * data.
1195  */
1196  rightoff = P_HIKEY;
1197 
1198  if (!P_RIGHTMOST(oopaque))
1199  {
1200  itemid = PageGetItemId(origpage, P_HIKEY);
1201  itemsz = ItemIdGetLength(itemid);
1202  item = (IndexTuple) PageGetItem(origpage, itemid);
1203  Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
1204  if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1205  false, false) == InvalidOffsetNumber)
1206  {
1207  memset(rightpage, 0, BufferGetPageSize(rbuf));
1208  elog(ERROR, "failed to add hikey to the right sibling"
1209  " while splitting block %u of index \"%s\"",
1210  origpagenumber, RelationGetRelationName(rel));
1211  }
1212  rightoff = OffsetNumberNext(rightoff);
1213  }
1214 
1215  /*
1216  * The "high key" for the new left page will be the first key that's going
1217  * to go into the new right page. This might be either the existing data
1218  * item at position firstright, or the incoming tuple.
1219  */
1220  leftoff = P_HIKEY;
1221  if (!newitemonleft && newitemoff == firstright)
1222  {
1223  /* incoming tuple will become first on right page */
1224  itemsz = newitemsz;
1225  item = newitem;
1226  }
1227  else
1228  {
1229  /* existing item at firstright will become first on right page */
1230  itemid = PageGetItemId(origpage, firstright);
1231  itemsz = ItemIdGetLength(itemid);
1232  item = (IndexTuple) PageGetItem(origpage, itemid);
1233  }
1234 
1235  /*
1236  * Truncate non-key (INCLUDE) attributes of the high key item before
1237  * inserting it on the left page. This only needs to happen at the leaf
1238  * level, since in general all pivot tuple values originate from leaf
1239  * level high keys. This isn't just about avoiding unnecessary work,
1240  * though; truncating unneeded key attributes (more aggressive suffix
1241  * truncation) can only be performed at the leaf level anyway. This is
1242  * because a pivot tuple in a grandparent page must guide a search not
1243  * only to the correct parent page, but also to the correct leaf page.
1244  */
1245  if (indnatts != indnkeyatts && isleaf)
1246  {
1247  lefthikey = _bt_nonkey_truncate(rel, item);
1248  itemsz = IndexTupleSize(lefthikey);
1249  itemsz = MAXALIGN(itemsz);
1250  }
1251  else
1252  lefthikey = item;
1253 
1254  Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
1255  if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
1256  false, false) == InvalidOffsetNumber)
1257  {
1258  memset(rightpage, 0, BufferGetPageSize(rbuf));
1259  elog(ERROR, "failed to add hikey to the left sibling"
1260  " while splitting block %u of index \"%s\"",
1261  origpagenumber, RelationGetRelationName(rel));
1262  }
1263  leftoff = OffsetNumberNext(leftoff);
1264  /* be tidy */
1265  if (lefthikey != item)
1266  pfree(lefthikey);
1267 
1268  /*
1269  * Now transfer all the data items to the appropriate page.
1270  *
1271  * Note: we *must* insert at least the right page's items in item-number
1272  * order, for the benefit of _bt_restore_page().
1273  */
1274  maxoff = PageGetMaxOffsetNumber(origpage);
1275 
1276  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1277  {
1278  itemid = PageGetItemId(origpage, i);
1279  itemsz = ItemIdGetLength(itemid);
1280  item = (IndexTuple) PageGetItem(origpage, itemid);
1281 
1282  /* does new item belong before this one? */
1283  if (i == newitemoff)
1284  {
1285  if (newitemonleft)
1286  {
1287  if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1288  {
1289  memset(rightpage, 0, BufferGetPageSize(rbuf));
1290  elog(ERROR, "failed to add new item to the left sibling"
1291  " while splitting block %u of index \"%s\"",
1292  origpagenumber, RelationGetRelationName(rel));
1293  }
1294  leftoff = OffsetNumberNext(leftoff);
1295  }
1296  else
1297  {
1298  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1299  {
1300  memset(rightpage, 0, BufferGetPageSize(rbuf));
1301  elog(ERROR, "failed to add new item to the right sibling"
1302  " while splitting block %u of index \"%s\"",
1303  origpagenumber, RelationGetRelationName(rel));
1304  }
1305  rightoff = OffsetNumberNext(rightoff);
1306  }
1307  }
1308 
1309  /* decide which page to put it on */
1310  if (i < firstright)
1311  {
1312  if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1313  {
1314  memset(rightpage, 0, BufferGetPageSize(rbuf));
1315  elog(ERROR, "failed to add old item to the left sibling"
1316  " while splitting block %u of index \"%s\"",
1317  origpagenumber, RelationGetRelationName(rel));
1318  }
1319  leftoff = OffsetNumberNext(leftoff);
1320  }
1321  else
1322  {
1323  if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1324  {
1325  memset(rightpage, 0, BufferGetPageSize(rbuf));
1326  elog(ERROR, "failed to add old item to the right sibling"
1327  " while splitting block %u of index \"%s\"",
1328  origpagenumber, RelationGetRelationName(rel));
1329  }
1330  rightoff = OffsetNumberNext(rightoff);
1331  }
1332  }
1333 
1334  /* cope with possibility that newitem goes at the end */
1335  if (i <= newitemoff)
1336  {
1337  /*
1338  * Can't have newitemonleft here; that would imply we were told to put
1339  * *everything* on the left page, which cannot fit (if it could, we'd
1340  * not be splitting the page).
1341  */
1342  Assert(!newitemonleft);
1343  if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1344  {
1345  memset(rightpage, 0, BufferGetPageSize(rbuf));
1346  elog(ERROR, "failed to add new item to the right sibling"
1347  " while splitting block %u of index \"%s\"",
1348  origpagenumber, RelationGetRelationName(rel));
1349  }
1350  rightoff = OffsetNumberNext(rightoff);
1351  }
1352 
1353  /*
1354  * We have to grab the right sibling (if any) and fix the prev pointer
1355  * there. We are guaranteed that this is deadlock-free since no other
1356  * writer will be holding a lock on that page and trying to move left, and
1357  * all readers release locks on a page before trying to fetch its
1358  * neighbors.
1359  */
1360 
1361  if (!P_RIGHTMOST(oopaque))
1362  {
1363  sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1364  spage = BufferGetPage(sbuf);
1365  sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1366  if (sopaque->btpo_prev != origpagenumber)
1367  {
1368  memset(rightpage, 0, BufferGetPageSize(rbuf));
1369  elog(ERROR, "right sibling's left-link doesn't match: "
1370  "block %u links to %u instead of expected %u in index \"%s\"",
1371  oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1372  RelationGetRelationName(rel));
1373 
1374 
1375  /*
1376  * Check to see if we can set the SPLIT_END flag in the right-hand
1377  * split page; this can save some I/O for vacuum since it need not
1378  * proceed to the right sibling. We can set the flag if the right
1379  * sibling has a different cycleid: that means it could not be part of
1380  * a group of pages that were all split off from the same ancestor
1381  * page. If you're confused, imagine that page A splits to A B and
1382  * then again, yielding A C B, while vacuum is in progress. Tuples
1383  * originally in A could now be in either B or C, hence vacuum must
1384  * examine both pages. But if D, our right sibling, has a different
1385  * cycleid then it could not contain any tuples that were in A when
1386  * the vacuum started.
1387  */
1388  if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1389  ropaque->btpo_flags |= BTP_SPLIT_END;
1390  }
1391 
1392  /*
1393  * Right sibling is locked, new siblings are prepared, but original page
1394  * is not updated yet.
1395  *
1396  * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1397  * not starting the critical section till here because we haven't been
1398  * scribbling on the original page yet; see comments above.
1399  */
1400  START_CRIT_SECTION();
1401 
1402  /*
1403  * By here, the original data page has been split into two new halves, and
1404  * these are correct. The algorithm requires that the left page never
1405  * move during a split, so we copy the new left page back on top of the
1406  * original. Note that this is not a waste of time, since we also require
1407  * (in the page management code) that the center of a page always be
1408  * clean, and the most efficient way to guarantee this is just to compact
1409  * the data by reinserting it into a new left page. (XXX the latter
1410  * comment is probably obsolete; but in any case it's good to not scribble
1411  * on the original page until we enter the critical section.)
1412  *
1413  * We need to do this before writing the WAL record, so that XLogInsert
1414  * can WAL log an image of the page if necessary.
1415  */
1416  PageRestoreTempPage(leftpage, origpage);
1417  /* leftpage, lopaque must not be used below here */
1418 
1419  MarkBufferDirty(buf);
1420  MarkBufferDirty(rbuf);
1421 
1422  if (!P_RIGHTMOST(ropaque))
1423  {
1424  sopaque->btpo_prev = rightpagenumber;
1425  MarkBufferDirty(sbuf);
1426  }
1427 
1428  /*
1429  * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1430  * a split.
1431  */
1432  if (!isleaf)
1433  {
1434  Page cpage = BufferGetPage(cbuf);
1435  BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
1436 
1437  cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1438  MarkBufferDirty(cbuf);
1439  }
1440 
1441  /* XLOG stuff */
1442  if (RelationNeedsWAL(rel))
1443  {
1444  xl_btree_split xlrec;
1445  uint8 xlinfo;
1446  XLogRecPtr recptr;
1447  bool loglhikey = false;
1448 
1449  xlrec.level = ropaque->btpo.level;
1450  xlrec.firstright = firstright;
1451  xlrec.newitemoff = newitemoff;
1452 
1453  XLogBeginInsert();
1454  XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1455 
1456  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1457  XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1458  /* Log the right sibling, because we've changed its prev-pointer. */
1459  if (!P_RIGHTMOST(ropaque))
1460  XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1461  if (BufferIsValid(cbuf))
1462  XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1463 
1464  /*
1465  * Log the new item, if it was inserted on the left page. (If it was
1466  * put on the right page, we don't need to explicitly WAL log it
1467  * because it's included with all the other items on the right page.)
1468  * Show the new item as belonging to the left page buffer, so that it
1469  * is not stored if XLogInsert decides it needs a full-page image of
1470  * the left page. We store the offset anyway, though, to support
1471  * archive compression of these records.
1472  */
1473  if (newitemonleft)
1474  XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
1475 
1476  /* Log left page */
1477  if (!isleaf || indnatts != indnkeyatts)
1478  {
1479  /*
1480  * We must also log the left page's high key. There are two
1481  * reasons for that: right page's leftmost key is suppressed on
1482  * non-leaf levels and in covering indexes included columns are
1483  * truncated from high keys. Show it as belonging to the left
1484  * page buffer, so that it is not stored if XLogInsert decides it
1485  * needs a full-page image of the left page.
1486  */
1487  itemid = PageGetItemId(origpage, P_HIKEY);
1488  item = (IndexTuple) PageGetItem(origpage, itemid);
1489  XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
1490  loglhikey = true;
1491  }
1492 
1493  /*
1494  * Log the contents of the right page in the format understood by
1495  * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1496  * because we're going to recreate the whole page anyway, so it should
1497  * never be stored by XLogInsert.
1498  *
1499  * Direct access to page is not good but faster - we should implement
1500  * some new func in page API. Note we only store the tuples
1501  * themselves, knowing that they were inserted in item-number order
1502  * and so the item pointers can be reconstructed. See comments for
1503  * _bt_restore_page().
1504  */
1505  XLogRegisterBufData(1,
1506  (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
1507  ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
1508 
1509  xlinfo = newitemonleft ?
1510  (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) :
1511  (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R);
1512  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1513 
1514  PageSetLSN(origpage, recptr);
1515  PageSetLSN(rightpage, recptr);
1516  if (!P_RIGHTMOST(ropaque))
1517  {
1518  PageSetLSN(spage, recptr);
1519  }
1520  if (!isleaf)
1521  {
1522  PageSetLSN(BufferGetPage(cbuf), recptr);
1523  }
1524  }
1525 
1526  END_CRIT_SECTION();
1527 
1528  /* release the old right sibling */
1529  if (!P_RIGHTMOST(ropaque))
1530  _bt_relbuf(rel, sbuf);
1531 
1532  /* release the child */
1533  if (!isleaf)
1534  _bt_relbuf(rel, cbuf);
1535 
1536  /* split's done */
1537  return rbuf;
1538 }
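
Existing items at offsets below firstright stay on the left page and the rest move right; the incoming tuple is slotted according to newitemonleft, and whichever item ends up first on the right also supplies the left page's new high key. A standalone sketch of that distribution over a plain array (toy key values only, not page items), assuming firstright and newitemoff were already chosen by the split-point logic:

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
    int  items[] = {10, 20, 30, 40, 50};      /* existing keys at offsets 1..5 */
    int  nitems = 5;
    int  firstright = 4;                      /* first existing offset that moves right */
    int  newitem = 35, newitemoff = 4;
    bool newitemonleft = true;

    printf("left : ");
    for (int off = 1; off < firstright; off++)
    {
        if (newitemonleft && off == newitemoff)
            printf("[%d] ", newitem);         /* incoming tuple goes before this item */
        printf("%d ", items[off - 1]);
    }
    if (newitemonleft && newitemoff >= firstright)
        printf("[%d] ", newitem);             /* incoming tuple becomes last on the left */

    printf("\nright: ");
    for (int off = firstright; off <= nitems; off++)
    {
        if (!newitemonleft && off == newitemoff)
            printf("[%d] ", newitem);
        printf("%d ", items[off - 1]);
    }
    if (!newitemonleft && newitemoff > nitems)
        printf("[%d] ", newitem);             /* incoming tuple goes at the very end */
    printf("\n");
    return 0;
}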

◆ _bt_vacuum_one_page()

static void _bt_vacuum_one_page ( Relation  rel,
Buffer  buffer,
Relation  heapRel 
)
static

Definition at line 2370 of file nbtinsert.c.

References _bt_delitems_delete(), BufferGetPage, ItemIdIsDead, MaxOffsetNumber, OffsetNumberNext, P_FIRSTDATAKEY, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc().

2371 {
2372  OffsetNumber deletable[MaxOffsetNumber];
2373  int ndeletable = 0;
2374  OffsetNumber offnum,
2375  minoff,
2376  maxoff;
2377  Page page = BufferGetPage(buffer);
2378  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2379 
2380  /*
2381  * Scan over all items to see which ones need to be deleted according to
2382  * LP_DEAD flags.
2383  */
2384  minoff = P_FIRSTDATAKEY(opaque);
2385  maxoff = PageGetMaxOffsetNumber(page);
2386  for (offnum = minoff;
2387  offnum <= maxoff;
2388  offnum = OffsetNumberNext(offnum))
2389  {
2390  ItemId itemId = PageGetItemId(page, offnum);
2391 
2392  if (ItemIdIsDead(itemId))
2393  deletable[ndeletable++] = offnum;
2394  }
2395 
2396  if (ndeletable > 0)
2397  _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
2398 
2399  /*
2400  * Note: if we didn't find any LP_DEAD items, then the page's
2401  * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2402  * separate write to clear it, however. We will clear it when we split
2403  * the page.
2404  */
2405 }
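
The function makes a single pass over the line pointers, remembers every offset whose LP_DEAD hint is set, and hands the whole batch to _bt_delitems_delete() in one call. A standalone sketch of the same collect-then-batch pattern over a toy array (TOY_MAXOFF and the dead[] flags are hypothetical, not real line pointers):

#include <stdbool.h>
#include <stdio.h>

#define TOY_MAXOFF 8

int
main(void)
{
    bool dead[TOY_MAXOFF + 1] = {0};      /* 1-based offsets, like OffsetNumber */
    int  deletable[TOY_MAXOFF];
    int  ndeletable = 0;

    dead[3] = dead[5] = dead[6] = true;   /* pretend these were hinted LP_DEAD */

    /* scan all items and remember the dead ones */
    for (int offnum = 1; offnum <= TOY_MAXOFF; offnum++)
        if (dead[offnum])
            deletable[ndeletable++] = offnum;

    /* one batched removal instead of one write per item */
    if (ndeletable > 0)
    {
        printf("would delete %d item(s):", ndeletable);
        for (int i = 0; i < ndeletable; i++)
            printf(" %d", deletable[i]);
        printf("\n");
    }
    return 0;
}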