PostgreSQL Source Code git master
nbtinsert.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/int.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
Include dependency graph for nbtinsert.c:

Go to the source code of this file.

Macros

#define BTREE_FASTPATH_MIN_LEVEL   2
 

Functions

static BTStack _bt_search_insert (Relation rel, Relation heaprel, BTInsertState insertstate)
 
static TransactionId _bt_check_unique (Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
 
static OffsetNumber _bt_findinsertloc (Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
 
static void _bt_stepright (Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
 
static void _bt_insertonpg (Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
 
static Buffer _bt_split (Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
 
static void _bt_insert_parent (Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
 
static Buffer _bt_newlevel (Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
 
static bool _bt_pgaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_delete_or_dedup_one_page (Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
 
static void _bt_simpledel_pass (Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
 
static BlockNumber * _bt_deadblocks (Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
 
static int _bt_blk_cmp (const void *arg1, const void *arg2)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 

Macro Definition Documentation

◆ BTREE_FASTPATH_MIN_LEVEL

#define BTREE_FASTPATH_MIN_LEVEL   2

Definition at line 30 of file nbtinsert.c.

Function Documentation

◆ _bt_blk_cmp()

static int _bt_blk_cmp ( const void *  arg1,
const void *  arg2 
)
inlinestatic

Definition at line 3011 of file nbtinsert.c.

3012{
3013 BlockNumber b1 = *((BlockNumber *) arg1);
3014 BlockNumber b2 = *((BlockNumber *) arg2);
3015
3016 return pg_cmp_u32(b1, b2);
3017}
uint32 BlockNumber
Definition: block.h:31
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:652

References pg_cmp_u32().

Referenced by _bt_deadblocks(), and _bt_simpledel_pass().

◆ _bt_check_unique()

static TransactionId _bt_check_unique ( Relation  rel,
BTInsertState  insertstate,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool *  is_unique,
uint32 *  speculativeToken 
)
static

Definition at line 408 of file nbtinsert.c.

411{
412 IndexTuple itup = insertstate->itup;
413 IndexTuple curitup = NULL;
414 ItemId curitemid = NULL;
415 BTScanInsert itup_key = insertstate->itup_key;
416 SnapshotData SnapshotDirty;
417 OffsetNumber offset;
418 OffsetNumber maxoff;
419 Page page;
420 BTPageOpaque opaque;
421 Buffer nbuf = InvalidBuffer;
422 bool found = false;
423 bool inposting = false;
424 bool prevalldead = true;
425 int curposti = 0;
426
427 /* Assume unique until we find a duplicate */
428 *is_unique = true;
429
430 InitDirtySnapshot(SnapshotDirty);
431
432 page = BufferGetPage(insertstate->buf);
433 opaque = BTPageGetOpaque(page);
434 maxoff = PageGetMaxOffsetNumber(page);
435
436 /*
437 * Find the first tuple with the same key.
438 *
439 * This also saves the binary search bounds in insertstate. We use them
440 * in the fastpath below, but also in the _bt_findinsertloc() call later.
441 */
442 Assert(!insertstate->bounds_valid);
443 offset = _bt_binsrch_insert(rel, insertstate);
444
445 /*
446 * Scan over all equal tuples, looking for live conflicts.
447 */
448 Assert(!insertstate->bounds_valid || insertstate->low == offset);
449 Assert(!itup_key->anynullkeys);
450 Assert(itup_key->scantid == NULL);
451 for (;;)
452 {
453 /*
454 * Each iteration of the loop processes one heap TID, not one index
455 * tuple. Current offset number for page isn't usually advanced on
456 * iterations that process heap TIDs from posting list tuples.
457 *
458 * "inposting" state is set when _inside_ a posting list --- not when
459 * we're at the start (or end) of a posting list. We advance curposti
460 * at the end of the iteration when inside a posting list tuple. In
461 * general, every loop iteration either advances the page offset or
462 * advances curposti --- an iteration that handles the rightmost/max
463 * heap TID in a posting list finally advances the page offset (and
464 * unsets "inposting").
465 *
466 * Make sure the offset points to an actual index tuple before trying
467 * to examine it...
468 */
469 if (offset <= maxoff)
470 {
471 /*
472 * Fastpath: In most cases, we can use cached search bounds to
473 * limit our consideration to items that are definitely
474 * duplicates. This fastpath doesn't apply when the original page
475 * is empty, or when initial offset is past the end of the
476 * original page, which may indicate that we need to examine a
477 * second or subsequent page.
478 *
479 * Note that this optimization allows us to avoid calling
480 * _bt_compare() directly when there are no duplicates, as long as
481 * the offset where the key will go is not at the end of the page.
482 */
483 if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
484 {
485 Assert(insertstate->bounds_valid);
486 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
487 Assert(insertstate->low <= insertstate->stricthigh);
488 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
489 break;
490 }
491
492 /*
493 * We can skip items that are already marked killed.
494 *
495 * In the presence of heavy update activity an index may contain
496 * many killed items with the same key; running _bt_compare() on
497 * each killed item gets expensive. Just advance over killed
498 * items as quickly as we can. We only apply _bt_compare() when
499 * we get to a non-killed item. We could reuse the bounds to
500 * avoid _bt_compare() calls for known equal tuples, but it
501 * doesn't seem worth it.
502 */
503 if (!inposting)
504 curitemid = PageGetItemId(page, offset);
505 if (inposting || !ItemIdIsDead(curitemid))
506 {
507 ItemPointerData htid;
508 bool all_dead = false;
509
510 if (!inposting)
511 {
512 /* Plain tuple, or first TID in posting list tuple */
513 if (_bt_compare(rel, itup_key, page, offset) != 0)
514 break; /* we're past all the equal tuples */
515
516 /* Advanced curitup */
517 curitup = (IndexTuple) PageGetItem(page, curitemid);
518 Assert(!BTreeTupleIsPivot(curitup));
519 }
520
521 /* okay, we gotta fetch the heap tuple using htid ... */
522 if (!BTreeTupleIsPosting(curitup))
523 {
524 /* ... htid is from simple non-pivot tuple */
525 Assert(!inposting);
526 htid = curitup->t_tid;
527 }
528 else if (!inposting)
529 {
530 /* ... htid is first TID in new posting list */
531 inposting = true;
532 prevalldead = true;
533 curposti = 0;
534 htid = *BTreeTupleGetPostingN(curitup, 0);
535 }
536 else
537 {
538 /* ... htid is second or subsequent TID in posting list */
539 Assert(curposti > 0);
540 htid = *BTreeTupleGetPostingN(curitup, curposti);
541 }
542
543 /*
544 * If we are doing a recheck, we expect to find the tuple we
545 * are rechecking. It's not a duplicate, but we have to keep
546 * scanning.
547 */
548 if (checkUnique == UNIQUE_CHECK_EXISTING &&
549 ItemPointerCompare(&htid, &itup->t_tid) == 0)
550 {
551 found = true;
552 }
553
554 /*
555 * Check if there's any table tuples for this index entry
556 * satisfying SnapshotDirty. This is necessary because for AMs
557 * with optimizations like heap's HOT, we have just a single
558 * index entry for the entire chain.
559 */
560 else if (table_index_fetch_tuple_check(heapRel, &htid,
561 &SnapshotDirty,
562 &all_dead))
563 {
564 TransactionId xwait;
565
566 /*
567 * It is a duplicate. If we are only doing a partial
568 * check, then don't bother checking if the tuple is being
569 * updated in another transaction. Just return the fact
570 * that it is a potential conflict and leave the full
571 * check till later. Don't invalidate binary search
572 * bounds.
573 */
574 if (checkUnique == UNIQUE_CHECK_PARTIAL)
575 {
576 if (nbuf != InvalidBuffer)
577 _bt_relbuf(rel, nbuf);
578 *is_unique = false;
579 return InvalidTransactionId;
580 }
581
582 /*
583 * If this tuple is being updated by other transaction
584 * then we have to wait for its commit/abort.
585 */
586 xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
587 SnapshotDirty.xmin : SnapshotDirty.xmax;
588
589 if (TransactionIdIsValid(xwait))
590 {
591 if (nbuf != InvalidBuffer)
592 _bt_relbuf(rel, nbuf);
593 /* Tell _bt_doinsert to wait... */
594 *speculativeToken = SnapshotDirty.speculativeToken;
595 /* Caller releases lock on buf immediately */
596 insertstate->bounds_valid = false;
597 return xwait;
598 }
599
600 /*
601 * Otherwise we have a definite conflict. But before
602 * complaining, look to see if the tuple we want to insert
603 * is itself now committed dead --- if so, don't complain.
604 * This is a waste of time in normal scenarios but we must
605 * do it to support CREATE INDEX CONCURRENTLY.
606 *
607 * We must follow HOT-chains here because during
608 * concurrent index build, we insert the root TID though
609 * the actual tuple may be somewhere in the HOT-chain.
610 * While following the chain we might not stop at the
611 * exact tuple which triggered the insert, but that's OK
612 * because if we find a live tuple anywhere in this chain,
613 * we have a unique key conflict. The other live tuple is
614 * not part of this chain because it had a different index
615 * entry.
616 */
617 htid = itup->t_tid;
618 if (table_index_fetch_tuple_check(heapRel, &htid,
619 SnapshotSelf, NULL))
620 {
621 /* Normal case --- it's still live */
622 }
623 else
624 {
625 /*
626 * It's been deleted, so no error, and no need to
627 * continue searching
628 */
629 break;
630 }
631
632 /*
633 * Check for a conflict-in as we would if we were going to
634 * write to this page. We aren't actually going to write,
635 * but we want a chance to report SSI conflicts that would
636 * otherwise be masked by this unique constraint
637 * violation.
638 */
639 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
640
641 /*
642 * This is a definite conflict. Break the tuple down into
643 * datums and report the error. But first, make sure we
644 * release the buffer locks we're holding ---
645 * BuildIndexValueDescription could make catalog accesses,
646 * which in the worst case might touch this same index and
647 * cause deadlocks.
648 */
649 if (nbuf != InvalidBuffer)
650 _bt_relbuf(rel, nbuf);
651 _bt_relbuf(rel, insertstate->buf);
652 insertstate->buf = InvalidBuffer;
653 insertstate->bounds_valid = false;
654
655 {
656 Datum values[INDEX_MAX_KEYS];
657 bool isnull[INDEX_MAX_KEYS];
658 char *key_desc;
659
660 index_deform_tuple(itup, RelationGetDescr(rel),
661 values, isnull);
662
663 key_desc = BuildIndexValueDescription(rel, values,
664 isnull);
665
666 ereport(ERROR,
667 (errcode(ERRCODE_UNIQUE_VIOLATION),
668 errmsg("duplicate key value violates unique constraint \"%s\"",
669 RelationGetRelationName(rel)),
670 key_desc ? errdetail("Key %s already exists.",
671 key_desc) : 0,
672 errtableconstraint(heapRel,
673 RelationGetRelationName(rel))));
674 }
675 }
676 else if (all_dead && (!inposting ||
677 (prevalldead &&
678 curposti == BTreeTupleGetNPosting(curitup) - 1)))
679 {
680 /*
681 * The conflicting tuple (or all HOT chains pointed to by
682 * all posting list TIDs) is dead to everyone, so mark the
683 * index entry killed.
684 */
685 ItemIdMarkDead(curitemid);
686 opaque->btpo_flags |= BTP_HAS_GARBAGE;
687
688 /*
689 * Mark buffer with a dirty hint, since state is not
690 * crucial. Be sure to mark the proper buffer dirty.
691 */
692 if (nbuf != InvalidBuffer)
693 MarkBufferDirtyHint(nbuf, true);
694 else
695 MarkBufferDirtyHint(insertstate->buf, true);
696 }
697
698 /*
699 * Remember if posting list tuple has even a single HOT chain
700 * whose members are not all dead
701 */
702 if (!all_dead && inposting)
703 prevalldead = false;
704 }
705 }
706
707 if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
708 {
709 /* Advance to next TID in same posting list */
710 curposti++;
711 continue;
712 }
713 else if (offset < maxoff)
714 {
715 /* Advance to next tuple */
716 curposti = 0;
717 inposting = false;
718 offset = OffsetNumberNext(offset);
719 }
720 else
721 {
722 int highkeycmp;
723
724 /* If scankey == hikey we gotta check the next page too */
725 if (P_RIGHTMOST(opaque))
726 break;
727 highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
728 Assert(highkeycmp <= 0);
729 if (highkeycmp != 0)
730 break;
731 /* Advance to next non-dead page --- there must be one */
732 for (;;)
733 {
734 BlockNumber nblkno = opaque->btpo_next;
735
736 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
737 page = BufferGetPage(nbuf);
738 opaque = BTPageGetOpaque(page);
739 if (!P_IGNORE(opaque))
740 break;
741 if (P_RIGHTMOST(opaque))
742 elog(ERROR, "fell off the end of index \"%s\"",
743 RelationGetRelationName(rel));
744 }
745 /* Will also advance to next tuple */
746 curposti = 0;
747 inposting = false;
748 maxoff = PageGetMaxOffsetNumber(page);
749 offset = P_FIRSTDATAKEY(opaque);
750 /* Don't invalidate binary search bounds */
751 }
752 }
753
754 /*
755 * If we are doing a recheck then we should have found the tuple we are
756 * checking. Otherwise there's something very wrong --- probably, the
757 * index is on a non-immutable expression.
758 */
759 if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
760 ereport(ERROR,
761 (errcode(ERRCODE_INTERNAL_ERROR),
762 errmsg("failed to re-find tuple within index \"%s\"",
763 RelationGetRelationName(rel)),
764 errhint("This may be because of a non-immutable index expression."),
765 errtableconstraint(heapRel,
766 RelationGetRelationName(rel))));
767
768 if (nbuf != InvalidBuffer)
769 _bt_relbuf(rel, nbuf);
770
771 return InvalidTransactionId;
772}
static Datum values[MAXATTR]
Definition: bootstrap.c:151
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3730
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:4916
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
PageData * Page
Definition: bufpage.h:82
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
uint32 TransactionId
Definition: c.h:623
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
char * BuildIndexValueDescription(Relation indexRelation, const Datum *values, const bool *isnull)
Definition: genam.c:178
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:143
@ UNIQUE_CHECK_PARTIAL
Definition: genam.h:142
Assert(PointerIsAligned(start, uint64))
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:456
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:78
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define P_HIKEY
Definition: nbtree.h:367
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
#define BT_READ
Definition: nbtree.h:724
#define P_IGNORE(opaque)
Definition: nbtree.h:225
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:474
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define INDEX_MAX_KEYS
uintptr_t Datum
Definition: postgres.h:69
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4326
#define RelationGetDescr(relation)
Definition: rel.h:538
#define RelationGetRelationName(relation)
Definition: rel.h:546
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6031
#define SnapshotSelf
Definition: snapmgr.h:32
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:42
OffsetNumber stricthigh
Definition: nbtree.h:830
bool bounds_valid
Definition: nbtree.h:828
OffsetNumber low
Definition: nbtree.h:829
IndexTuple itup
Definition: nbtree.h:816
BTScanInsert itup_key
Definition: nbtree.h:818
BlockNumber btpo_next
Definition: nbtree.h:65
uint16 btpo_flags
Definition: nbtree.h:67
ItemPointer scantid
Definition: nbtree.h:796
bool anynullkeys
Definition: nbtree.h:793
ItemPointerData t_tid
Definition: itup.h:37
TransactionId xmin
Definition: snapshot.h:153
TransactionId xmax
Definition: snapshot.h:154
uint32 speculativeToken
Definition: snapshot.h:189
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:209
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_binsrch_insert(), _bt_compare(), _bt_relandgetbuf(), _bt_relbuf(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), BuildIndexValueDescription(), CheckForSerializableConflictIn(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), if(), index_deform_tuple(), INDEX_MAX_KEYS, InitDirtySnapshot, InvalidBuffer, InvalidTransactionId, ItemIdIsDead, ItemIdMarkDead, ItemPointerCompare(), BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, MarkBufferDirtyHint(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetDescr, RelationGetRelationName, BTScanInsertData::scantid, SnapshotSelf, SnapshotData::speculativeToken, BTInsertStateData::stricthigh, IndexTupleData::t_tid, table_index_fetch_tuple_check(), TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_PARTIAL, values, SnapshotData::xmax, and SnapshotData::xmin.

Referenced by _bt_doinsert().

◆ _bt_deadblocks()

static BlockNumber * _bt_deadblocks ( Page  page,
OffsetNumber deletable,
int  ndeletable,
IndexTuple  newitem,
int *  nblocks 
)
static

Definition at line 2938 of file nbtinsert.c.

2940{
2941 int spacentids,
2942 ntids;
2943 BlockNumber *tidblocks;
2944
2945 /*
2946 * Accumulate each TID's block in array whose initial size has space for
2947 * one table block per LP_DEAD-set tuple (plus space for the newitem table
2948 * block). Array will only need to grow when there are LP_DEAD-marked
2949 * posting list tuples (which is not that common).
2950 */
2951 spacentids = ndeletable + 1;
2952 ntids = 0;
2953 tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2954
2955 /*
2956 * First add the table block for the incoming newitem. This is the one
2957 * case where simple deletion can visit a table block that doesn't have
2958 * any known deletable items.
2959 */
2960 Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2961 tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2962
2963 for (int i = 0; i < ndeletable; i++)
2964 {
2965 ItemId itemid = PageGetItemId(page, deletable[i]);
2966 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2967
2968 Assert(ItemIdIsDead(itemid));
2969
2970 if (!BTreeTupleIsPosting(itup))
2971 {
2972 if (ntids + 1 > spacentids)
2973 {
2974 spacentids *= 2;
2975 tidblocks = (BlockNumber *)
2976 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2977 }
2978
2979 tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2980 }
2981 else
2982 {
2983 int nposting = BTreeTupleGetNPosting(itup);
2984
2985 if (ntids + nposting > spacentids)
2986 {
2987 spacentids = Max(spacentids * 2, ntids + nposting);
2988 tidblocks = (BlockNumber *)
2989 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2990 }
2991
2992 for (int j = 0; j < nposting; j++)
2993 {
2994 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2995
2996 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
2997 }
2998 }
2999 }
3000
3001 qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3002 *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3003
3004 return tidblocks;
3005}
#define Max(x, y)
Definition: c.h:969
int j
Definition: isn.c:75
int i
Definition: isn.c:74
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1544
void * palloc(Size size)
Definition: mcxt.c:1317
static int _bt_blk_cmp(const void *arg1, const void *arg2)
Definition: nbtinsert.c:3011
#define qsort(a, b, c, d)
Definition: port.h:475
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21

References _bt_blk_cmp(), Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), i, ItemIdIsDead, ItemPointerGetBlockNumber(), j, Max, PageGetItem(), PageGetItemId(), palloc(), qsort, qunique(), repalloc(), and IndexTupleData::t_tid.

Referenced by _bt_simpledel_pass().

◆ _bt_delete_or_dedup_one_page()

static void _bt_delete_or_dedup_one_page ( Relation  rel,
Relation  heapRel,
BTInsertState  insertstate,
bool  simpleonly,
bool  checkingunique,
bool  uniquedup,
bool  indexUnchanged 
)
static

Definition at line 2683 of file nbtinsert.c.

2687{
2689 int ndeletable = 0;
2690 OffsetNumber offnum,
2691 minoff,
2692 maxoff;
2693 Buffer buffer = insertstate->buf;
2694 BTScanInsert itup_key = insertstate->itup_key;
2695 Page page = BufferGetPage(buffer);
2696 BTPageOpaque opaque = BTPageGetOpaque(page);
2697
2698 Assert(P_ISLEAF(opaque));
2699 Assert(simpleonly || itup_key->heapkeyspace);
2700 Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2701
2702 /*
2703 * Scan over all items to see which ones need to be deleted according to
2704 * LP_DEAD flags. We'll usually manage to delete a few extra items that
2705 * are not marked LP_DEAD in passing. Often the extra items that actually
2706 * end up getting deleted are items that would have had their LP_DEAD bit
2707 * set before long anyway (if we opted not to include them as extras).
2708 */
2709 minoff = P_FIRSTDATAKEY(opaque);
2710 maxoff = PageGetMaxOffsetNumber(page);
2711 for (offnum = minoff;
2712 offnum <= maxoff;
2713 offnum = OffsetNumberNext(offnum))
2714 {
2715 ItemId itemId = PageGetItemId(page, offnum);
2716
2717 if (ItemIdIsDead(itemId))
2718 deletable[ndeletable++] = offnum;
2719 }
2720
2721 if (ndeletable > 0)
2722 {
2723 _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2724 insertstate->itup, minoff, maxoff);
2725 insertstate->bounds_valid = false;
2726
2727 /* Return when a page split has already been avoided */
2728 if (PageGetFreeSpace(page) >= insertstate->itemsz)
2729 return;
2730
2731 /* Might as well assume duplicates (if checkingunique) */
2732 uniquedup = true;
2733 }
2734
2735 /*
2736 * We're done with simple deletion. Return early with callers that only
2737 * call here so that simple deletion can be considered. This includes
2738 * callers that explicitly ask for this and checkingunique callers that
2739 * probably don't have any version churn duplicates on the page.
2740 *
2741 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2742 * return at this point (or when we go on the try either or both of our
2743 * other strategies and they also fail). We do not bother expending a
2744 * separate write to clear it, however. Caller will definitely clear it
2745 * when it goes on to split the page (note also that the deduplication
2746 * process will clear the flag in passing, just to keep things tidy).
2747 */
2748 if (simpleonly || (checkingunique && !uniquedup))
2749 {
2750 Assert(!indexUnchanged);
2751 return;
2752 }
2753
2754 /* Assume bounds about to be invalidated (this is almost certain now) */
2755 insertstate->bounds_valid = false;
2756
2757 /*
2758 * Perform bottom-up index deletion pass when executor hint indicated that
2759 * incoming item is logically unchanged, or for a unique index that is
2760 * known to have physical duplicates for some other reason. (There is a
2761 * large overlap between these two cases for a unique index. It's worth
2762 * having both triggering conditions in order to apply the optimization in
2763 * the event of successive related INSERT and DELETE statements.)
2764 *
2765 * We'll go on to do a deduplication pass when a bottom-up pass fails to
2766 * delete an acceptable amount of free space (a significant fraction of
2767 * the page, or space for the new item, whichever is greater).
2768 *
2769 * Note: Bottom-up index deletion uses the same equality/equivalence
2770 * routines as deduplication internally. However, it does not merge
2771 * together index tuples, so the same correctness considerations do not
2772 * apply. We deliberately omit an index-is-allequalimage test here.
2773 */
2774 if ((indexUnchanged || uniquedup) &&
2775 _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2776 return;
2777
2778 /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2779 if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2780 _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2781 (indexUnchanged || uniquedup));
2782}
Size PageGetFreeSpace(const PageData *page)
Definition: bufpage.c:896
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
Definition: nbtdedup.c:58
bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
Definition: nbtdedup.c:307
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
Definition: nbtinsert.c:2812
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1141
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
bool allequalimage
Definition: nbtree.h:792
bool heapkeyspace
Definition: nbtree.h:791

References _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_simpledel_pass(), BTScanInsertData::allequalimage, Assert(), BTInsertStateData::bounds_valid, BTGetDeduplicateItems, BTPageGetOpaque, BTInsertStateData::buf, BufferGetPage(), BTScanInsertData::heapkeyspace, ItemIdIsDead, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MaxIndexTuplesPerPage, OffsetNumberNext, P_FIRSTDATAKEY, P_ISLEAF, PageGetFreeSpace(), PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_findinsertloc().

◆ _bt_doinsert()

/*
 *	_bt_doinsert() -- Handle insertion of a single index tuple in the tree.
 *
 *		This routine is called by the public interface routine, btinsert.
 *		By here, itup is filled in, including the TID.
 *
 *		If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
 *		will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
 *		UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
 *		For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
 *		don't actually insert.
 *
 *		indexUnchanged executor hint indicates if itup is from an
 *		UPDATE that didn't logically change the indexed value, but
 *		must nevertheless have a new entry to point to a successor
 *		version.
 *
 *		The result value is only significant for UNIQUE_CHECK_PARTIAL:
 *		it must be true if the entry is known unique, else false.
 */
bool
_bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, bool indexUnchanged,
			 Relation heapRel)
{
	bool		is_unique = false;
	BTInsertStateData insertstate;
	BTScanInsert itup_key;
	BTStack		stack;
	bool		checkingunique = (checkUnique != UNIQUE_CHECK_NO);

	/* we need an insertion scan key to do our search, so build one */
	itup_key = _bt_mkscankey(rel, itup);

	if (checkingunique)
	{
		if (!itup_key->anynullkeys)
		{
			/* No (heapkeyspace) scantid until uniqueness established */
			itup_key->scantid = NULL;
		}
		else
		{
			/*
			 * Scan key for new tuple contains NULL key values.  Bypass
			 * checkingunique steps.  They are unnecessary because core code
			 * considers NULL unequal to every value, including NULL.
			 *
			 * This optimization avoids O(N^2) behavior within the
			 * _bt_findinsertloc() heapkeyspace path when a unique index has a
			 * large number of "duplicates" with NULL key values.
			 */
			checkingunique = false;
			/* Tuple is unique in the sense that core code cares about */
			Assert(checkUnique != UNIQUE_CHECK_EXISTING);
			is_unique = true;
		}
	}

	/*
	 * Fill in the BTInsertState working area, to track the current page and
	 * position within the page to insert on.
	 *
	 * Note that itemsz is passed down to lower level code that deals with
	 * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
	 * accounting code consistently considers the alignment overhead that we
	 * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
	 * already conservative about alignment, but we don't rely on that from
	 * this distance.  Besides, preserving the "true" tuple size in index
	 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
	 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
	 */
	insertstate.itup = itup;
	insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
	insertstate.itup_key = itup_key;
	insertstate.bounds_valid = false;
	insertstate.buf = InvalidBuffer;
	insertstate.postingoff = 0;

search:

	/*
	 * Find and lock the leaf page that the tuple should be added to by
	 * searching from the root page.  insertstate.buf will hold a buffer that
	 * is locked in exclusive mode afterwards.
	 */
	stack = _bt_search_insert(rel, heapRel, &insertstate);

	/*
	 * checkingunique inserts are not allowed to go ahead when two tuples with
	 * equal key attribute values would be visible to new MVCC snapshots once
	 * the xact commits.  Check for conflicts in the locked page/buffer (if
	 * needed) here.
	 *
	 * It might be necessary to check a page to the right in _bt_check_unique,
	 * though that should be very rare.  In practice the first page the value
	 * could be on (with scantid omitted) is almost always also the only page
	 * that a matching tuple might be found on.  This is due to the behavior
	 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
	 * only be allowed to cross a page boundary when there is no candidate
	 * leaf page split point that avoids it.  Also, _bt_check_unique can use
	 * the leaf page high key to determine that there will be no duplicates on
	 * the right sibling without actually visiting it (it uses the high key in
	 * cases where the new item happens to belong at the far right of the leaf
	 * page).
	 *
	 * NOTE: obviously, _bt_check_unique can only detect keys that are already
	 * in the index; so it cannot defend against concurrent insertions of the
	 * same key.  We protect against that by means of holding a write lock on
	 * the first page the value could be on, with omitted/-inf value for the
	 * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
	 * the same key must acquire a write lock on the same page, so only one
	 * would-be inserter can be making the check at one time.  Furthermore,
	 * once we are past the check we hold write locks continuously until we
	 * have performed our insertion, so no later inserter can fail to see our
	 * insertion.  (This requires some care in _bt_findinsertloc.)
	 *
	 * If we must wait for another xact, we release the lock while waiting,
	 * and then must perform a new search.
	 *
	 * For a partial uniqueness check, we don't wait for the other xact. Just
	 * let the tuple in and return false for possibly non-unique, or true for
	 * definitely unique.
	 */
	if (checkingunique)
	{
		TransactionId xwait;
		uint32		speculativeToken;

		xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
								 &is_unique, &speculativeToken);

		if (unlikely(TransactionIdIsValid(xwait)))
		{
			/* Have to wait for the other guy ... */
			_bt_relbuf(rel, insertstate.buf);
			insertstate.buf = InvalidBuffer;

			/*
			 * If it's a speculative insertion, wait for it to finish (ie. to
			 * go ahead with the insertion, or kill the tuple).  Otherwise
			 * wait for the transaction to finish as usual.
			 */
			if (speculativeToken)
				SpeculativeInsertionWait(xwait, speculativeToken);
			else
				XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);

			/* start over... */
			if (stack)
				_bt_freestack(stack);
			goto search;
		}

		/* Uniqueness is established -- restore heap tid as scantid */
		if (itup_key->heapkeyspace)
			itup_key->scantid = &itup->t_tid;
	}

	if (checkUnique != UNIQUE_CHECK_EXISTING)
	{
		OffsetNumber newitemoff;

		/*
		 * The only conflict predicate locking cares about for indexes is when
		 * an index tuple insert conflicts with an existing lock.  We don't
		 * know the actual page we're going to insert on for sure just yet in
		 * checkingunique and !heapkeyspace cases, but it's okay to use the
		 * first page the value could be on (with scantid omitted) instead.
		 */
		CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));

		/*
		 * Do the insertion.  Note that insertstate contains cached binary
		 * search bounds established within _bt_check_unique when insertion is
		 * checkingunique.
		 */
		newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
									   indexUnchanged, stack, heapRel);
		_bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
					   stack, itup, insertstate.itemsz, newitemoff,
					   insertstate.postingoff, false);
	}
	else
	{
		/* just release the buffer */
		_bt_relbuf(rel, insertstate.buf);
	}

	/* be tidy */
	if (stack)
		_bt_freestack(stack);
	pfree(itup_key);

	return is_unique;
}
#define MAXALIGN(LEN)
Definition: c.h:782
#define unlikely(x)
Definition: c.h:347
uint32_t uint32
Definition: c.h:502
@ UNIQUE_CHECK_NO
Definition: genam.h:140
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:663
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:822
@ XLTW_InsertIndex
Definition: lmgr.h:31
void pfree(void *pointer)
Definition: mcxt.c:1524
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:317
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1105
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:408
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:172
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:80

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize(), InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

◆ _bt_findinsertloc()

/*
 *	_bt_findinsertloc() -- Finds an insert location for a tuple
 *
 *		On entry, insertstate buffer contains the page the new tuple belongs
 *		on.  It is exclusive-locked and pinned by the caller.
 *
 *		If 'checkingunique' is true, the buffer on entry is the first page
 *		that contains duplicates of the new key.  If there are duplicates on
 *		multiple pages, the correct insertion position might be some page to
 *		the right, rather than the first page.  In that case, this function
 *		moves right to the correct target page.
 *
 *		(In a !heapkeyspace index, there can be multiple pages with the same
 *		high key, where the new tuple could legitimately be placed on.  In
 *		that case, the caller passes the first page containing duplicates,
 *		just like when checkingunique=true.  If that page doesn't have enough
 *		room for the new tuple, this function moves right, trying to find a
 *		legal page that does.)
 *
 *		If 'indexUnchanged' is true, this is for an UPDATE that didn't
 *		logically change the indexed value, but must nevertheless have a new
 *		entry to point to a successor version.  This hint influences the
 *		deletion/deduplication behavior when the page is short on space.
 *
 *		On exit, insertstate buffer contains the chosen insertion page, and
 *		the offset within that page is returned.  If _bt_findinsertloc needed
 *		to move right, the lock and pin on the original page are released, and
 *		the new buffer is exclusively locked and pinned instead.
 *
 *		If insertstate contains cached binary search bounds, we will take
 *		advantage of them.  This avoids repeating comparisons that we made in
 *		_bt_check_unique() already.
 */
static OffsetNumber
_bt_findinsertloc(Relation rel,
				  BTInsertState insertstate,
				  bool checkingunique,
				  bool indexUnchanged,
				  BTStack stack,
				  Relation heapRel)
{
	BTScanInsert itup_key = insertstate->itup_key;
	Page		page = BufferGetPage(insertstate->buf);
	BTPageOpaque opaque;
	OffsetNumber newitemoff;

	opaque = BTPageGetOpaque(page);

	/* Check 1/3 of a page restriction */
	if (unlikely(insertstate->itemsz > BTMaxItemSize))
		_bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
							 insertstate->itup);

	Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
	Assert(!insertstate->bounds_valid || checkingunique);
	Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
	Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
	Assert(!itup_key->allequalimage || itup_key->heapkeyspace);

	if (itup_key->heapkeyspace)
	{
		/* Keep track of whether checkingunique duplicate seen */
		bool		uniquedup = indexUnchanged;

		/*
		 * If we're inserting into a unique index, we may have to walk right
		 * through leaf pages to find the one leaf page that we must insert on
		 * to.
		 *
		 * This is needed for checkingunique callers because a scantid was not
		 * used when we called _bt_search().  scantid can only be set after
		 * _bt_check_unique() has checked for duplicates.  The buffer
		 * initially stored in insertstate->buf has the page where the first
		 * duplicate key might be found, which isn't always the page that new
		 * tuple belongs on.  The heap TID attribute for new tuple (scantid)
		 * could force us to insert on a sibling page, though that should be
		 * very rare in practice.
		 */
		if (checkingunique)
		{
			if (insertstate->low < insertstate->stricthigh)
			{
				/* Encountered a duplicate in _bt_check_unique() */
				Assert(insertstate->bounds_valid);
				uniquedup = true;
			}

			for (;;)
			{
				/*
				 * Does the new tuple belong on this page?
				 *
				 * The earlier _bt_check_unique() call may well have
				 * established a strict upper bound on the offset for the new
				 * item.  If it's not the last item of the page (i.e. if there
				 * is at least one tuple on the page that goes after the tuple
				 * we're inserting) then we know that the tuple belongs on
				 * this page.  We can skip the high key check.
				 */
				if (insertstate->bounds_valid &&
					insertstate->low <= insertstate->stricthigh &&
					insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
					break;

				/* Test '<=', not '!=', since scantid is set now */
				if (P_RIGHTMOST(opaque) ||
					_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
					break;

				_bt_stepright(rel, heapRel, insertstate, stack);
				/* Update local state after stepping right */
				page = BufferGetPage(insertstate->buf);
				opaque = BTPageGetOpaque(page);
				/* Assume duplicates (if checkingunique) */
				uniquedup = true;
			}
		}

		/*
		 * If the target page cannot fit newitem, try to avoid splitting the
		 * page on insert by performing deletion or deduplication now
		 */
		if (PageGetFreeSpace(page) < insertstate->itemsz)
			_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
										 checkingunique, uniquedup,
										 indexUnchanged);
	}
	else
	{
		/*----------
		 * This is a !heapkeyspace (version 2 or 3) index.  The current page
		 * is the first page that we could insert the new tuple to, but there
		 * may be other pages to the right that we could opt to use instead.
		 *
		 * If the new key is equal to one or more existing keys, we can
		 * legitimately place it anywhere in the series of equal keys.  In
		 * fact, if the new key is equal to the page's "high key" we can place
		 * it on the next page.  If it is equal to the high key, and there's
		 * not room to insert the new tuple on the current page without
		 * splitting, then we move right hoping to find more free space and
		 * avoid a split.
		 *
		 * Keep scanning right until we
		 *		(a) find a page with enough free space,
		 *		(b) reach the last page where the tuple can legally go, or
		 *		(c) get tired of searching.
		 * (c) is not flippant; it is important because if there are many
		 * pages' worth of equal keys, it's better to split one of the early
		 * pages than to scan all the way to the end of the run of equal keys
		 * on every insert.  We implement "get tired" as a random choice,
		 * since stopping after scanning a fixed number of pages wouldn't work
		 * well (we'd never reach the right-hand side of previously split
		 * pages).  The probability of moving right is set at 0.99, which may
		 * seem too high to change the behavior much, but it does an excellent
		 * job of preventing O(N^2) behavior with many equal keys.
		 *----------
		 */
		while (PageGetFreeSpace(page) < insertstate->itemsz)
		{
			/*
			 * Before considering moving right, see if we can obtain enough
			 * space by erasing LP_DEAD items
			 */
			if (P_HAS_GARBAGE(opaque))
			{
				/* Perform simple deletion */
				_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
											 false, false, false);

				if (PageGetFreeSpace(page) >= insertstate->itemsz)
					break;		/* OK, now we have enough space */
			}

			/*
			 * Nope, so check conditions (b) and (c) enumerated above
			 *
			 * The earlier _bt_check_unique() call may well have established a
			 * strict upper bound on the offset for the new item.  If it's not
			 * the last item of the page (i.e. if there is at least one tuple
			 * on the page that's greater than the tuple we're inserting to)
			 * then we know that the tuple belongs on this page.  We can skip
			 * the high key check.
			 */
			if (insertstate->bounds_valid &&
				insertstate->low <= insertstate->stricthigh &&
				insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
				break;

			/* "Get tired" with probability 0.01 to bound the rightward walk */
			if (P_RIGHTMOST(opaque) ||
				_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
				pg_prng_uint32(&pg_global_prng_state) <= PG_UINT32_MAX / 100)
				break;

			_bt_stepright(rel, heapRel, insertstate, stack);
			/* Update local state after stepping right */
			page = BufferGetPage(insertstate->buf);
			opaque = BTPageGetOpaque(page);
		}
	}

	/*
	 * We should now be on the correct page.  Find the offset within the page
	 * for the new tuple. (Possibly reusing earlier search bounds.)
	 */
	Assert(P_RIGHTMOST(opaque) ||
		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);

	newitemoff = _bt_binsrch_insert(rel, insertstate);

	if (insertstate->postingoff == -1)
	{
		/*
		 * There is an overlapping posting list tuple with its LP_DEAD bit
		 * set.  We don't want to unnecessarily unset its LP_DEAD bit while
		 * performing a posting list split, so perform simple index tuple
		 * deletion early.
		 */
		_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
									 false, false, false);

		/*
		 * Do new binary search.  New insert location cannot overlap with any
		 * posting list now.
		 */
		Assert(!insertstate->bounds_valid);
		insertstate->postingoff = 0;
		newitemoff = _bt_binsrch_insert(rel, insertstate);
		Assert(insertstate->postingoff == 0);
	}

	return newitemoff;
}
#define PG_UINT32_MAX
Definition: c.h:561
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition: nbtinsert.c:2683
static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:1027
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:226
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BTMaxItemSize
Definition: nbtree.h:164
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:3239
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:227
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34

References _bt_binsrch_insert(), _bt_check_third_page(), _bt_compare(), _bt_delete_or_dedup_one_page(), _bt_stepright(), BTScanInsertData::allequalimage, Assert(), BTInsertStateData::bounds_valid, BTMaxItemSize, BTPageGetOpaque, BTInsertStateData::buf, BufferGetPage(), BTScanInsertData::heapkeyspace, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, BTInsertStateData::low, P_HAS_GARBAGE, P_HIKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber(), pg_global_prng_state, pg_prng_uint32(), PG_UINT32_MAX, BTInsertStateData::postingoff, BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_doinsert().

◆ _bt_finish_split()

/*
 *	_bt_finish_split() -- Finish an incomplete split
 *
 * A crash or other failure can leave a split incomplete.  The insertion
 * routines won't allow to insert on a page that is incompletely split.
 * Before inserting on such a page, call _bt_finish_split().
 *
 * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
 * and unpinned.
 *
 * Caller must provide a valid heaprel, since finishing a page split requires
 * allocating a new page if and when the parent page splits in turn.
 */
void
_bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
{
	Page		lpage = BufferGetPage(lbuf);
	BTPageOpaque lpageop = BTPageGetOpaque(lpage);
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque rpageop;
	bool		wasroot;
	bool		wasonly;

	Assert(P_INCOMPLETE_SPLIT(lpageop));
	Assert(heaprel != NULL);

	/* Lock right sibling, the one missing the downlink */
	rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
	rpage = BufferGetPage(rbuf);
	rpageop = BTPageGetOpaque(rpage);

	/* Could this be a root split? */
	if (!stack)
	{
		Buffer		metabuf;
		Page		metapg;
		BTMetaPageData *metad;

		/* acquire lock on the metapage */
		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
		metapg = BufferGetPage(metabuf);
		metad = BTPageGetMeta(metapg);

		wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));

		_bt_relbuf(rel, metabuf);
	}
	else
		wasroot = false;

	/* Was this the only page on the level before split? */
	wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));

	elog(DEBUG1, "finishing incomplete split of %u/%u",
		 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));

	_bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
}
#define DEBUG1
Definition: elog.h:30
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2099
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_WRITE
Definition: nbtree.h:725
BlockNumber btm_root
Definition: nbtree.h:107

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert(), BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

◆ _bt_getstackbuf()

/*
 *	_bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
 *						 tuple whose downlink points to our child page.
 *
 * Caller passes a bottommost descent stack entry for the parent level, plus
 * the block number of the child page whose downlink we must re-find.  The
 * stack entry may be stale: the parent may have split (or had items shifted)
 * since our original descent, so we search rightward from the remembered
 * position, checking every page until the downlink is found.
 *
 * Returns write-locked parent page buffer, or InvalidBuffer if the pivot
 * tuple is not found anywhere on the parent level (should not happen unless
 * the index is corrupt).  On success, adjusts stack->bts_blkno and
 * stack->bts_offset to the downlink's current location, so the caller can
 * insert immediately after it.
 */
Buffer
_bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack,
				BlockNumber child)
{
	BlockNumber blkno;
	OffsetNumber start;

	blkno = stack->bts_blkno;
	start = stack->bts_offset;

	for (;;)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;

		buf = _bt_getbuf(rel, blkno, BT_WRITE);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		Assert(heaprel != NULL);
		if (P_INCOMPLETE_SPLIT(opaque))
		{
			/* Finish the sibling's split before searching this page */
			_bt_finish_split(rel, heaprel, buf, stack->bts_parent);
			continue;
		}

		if (!P_IGNORE(opaque))
		{
			OffsetNumber offnum,
						minoff,
						maxoff;
			ItemId		itemid;
			IndexTuple	item;

			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);

			/*
			 * start = InvalidOffsetNumber means "search the whole page".  We
			 * need this test anyway due to possibility that page has a high
			 * key now when it didn't before.
			 */
			if (start < minoff)
				start = minoff;

			/*
			 * Need this check too, to guard against possibility that page
			 * split since we visited it originally.
			 */
			if (start > maxoff)
				start = OffsetNumberNext(maxoff);

			/*
			 * These loops will check every item on the page --- but in an
			 * order that's attuned to the probability of where it actually
			 * is.  Scan to the right first, then to the left.
			 */
			for (offnum = start;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}

			for (offnum = OffsetNumberPrev(start);
				 offnum >= minoff;
				 offnum = OffsetNumberPrev(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}
		}

		/*
		 * The item we're looking for moved right at least one page.
		 *
		 * Lehman and Yao couple/chain locks when moving right here, which we
		 * can avoid.  See nbtree/README.
		 */
		if (P_RIGHTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			return InvalidBuffer;
		}
		blkno = opaque->btpo_next;
		start = InvalidOffsetNumber;
		_bt_relbuf(rel, buf);
	}
}
return str start
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static char * buf
Definition: pg_test_fsync.c:72
BlockNumber bts_blkno
Definition: nbtree.h:739
struct BTStackData * bts_parent
Definition: nbtree.h:741
OffsetNumber bts_offset
Definition: nbtree.h:740

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().

◆ _bt_insert_parent()

/*
 * _bt_insert_parent() -- Insert downlink into parent, completing split.
 *
 * On entry, buf and rbuf are the left and right split pages, which we
 * still hold write locks on.  Both locks will be released here.  We
 * release the rbuf lock once we have a write lock on the page that we
 * intend to insert the downlink for the new right page into, and only
 * release the buf lock thereafter (via _bt_insertonpg).
 *
 * stack - stack showing how we got here.  Will be NULL when splitting true
 *			root, or during concurrent root split, where we can be inefficient
 * isroot - we split the true root
 * isonly - we split a page alone on its level (might have been fast root)
 */
static void
_bt_insert_parent(Relation rel,
				  Relation heaprel,
				  Buffer buf,
				  Buffer rbuf,
				  BTStack stack,
				  bool isroot,
				  bool isonly)
{
	Assert(heaprel != NULL);

	/*
	 * Here we have to do something Lehman and Yao don't talk about: deal with
	 * a root split and construction of a new root.  If our stack is empty
	 * then we have just split a node on what had been the root level when we
	 * descended the tree.  If it was still the root then we perform a
	 * new-root construction.  If it *wasn't* the root anymore, search to find
	 * the next higher level that someone constructed meanwhile, and find the
	 * right place to insert as for the normal case.
	 *
	 * If we have to search for the parent level, we do so by re-descending
	 * from the root.  This is not super-efficient, but it's rare enough not
	 * to matter.
	 */
	if (isroot)
	{
		Buffer		rootbuf;

		Assert(stack == NULL);
		Assert(isonly);
		/* create a new root node one level up and update the metapage */
		rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
		/* release the split buffers */
		_bt_relbuf(rel, rootbuf);
		_bt_relbuf(rel, rbuf);
		_bt_relbuf(rel, buf);
	}
	else
	{
		BlockNumber bknum = BufferGetBlockNumber(buf);
		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
		Page		page = BufferGetPage(buf);
		IndexTuple	new_item;
		BTStackData fakestack;
		IndexTuple	ritem;
		Buffer		pbuf;

		if (stack == NULL)
		{
			BTPageOpaque opaque;

			elog(DEBUG2, "concurrent ROOT page split");
			opaque = BTPageGetOpaque(page);

			/*
			 * We should never reach here when a leaf page split takes place
			 * despite the insert of newitem being able to apply the fastpath
			 * optimization.  Make sure of that with an assertion.
			 *
			 * This is more of a performance issue than a correctness issue.
			 * The fastpath won't have a descent stack.  Using a phony stack
			 * here works, but never rely on that.  The fastpath should be
			 * rejected within _bt_search_insert() when the rightmost leaf
			 * page will split, since it's faster to go through _bt_search()
			 * and get a stack in the usual way.
			 */
			Assert(!(P_ISLEAF(opaque) &&
					 BlockNumberIsValid(RelationGetTargetBlock(rel))));

			/* Find the leftmost page at the next level up */
			pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
			/* Set up a phony stack entry pointing there */
			stack = &fakestack;
			stack->bts_blkno = BufferGetBlockNumber(pbuf);
			stack->bts_offset = InvalidOffsetNumber;
			stack->bts_parent = NULL;
			_bt_relbuf(rel, pbuf);
		}

		/* get high key from left, a strict lower bound for new right page */
		ritem = (IndexTuple) PageGetItem(page,
										 PageGetItemId(page, P_HIKEY));

		/* form an index tuple that points at the new right page */
		new_item = CopyIndexTuple(ritem);
		BTreeTupleSetDownLink(new_item, rbknum);

		/*
		 * Re-find and write lock the parent of buf.
		 *
		 * It's possible that the location of buf's downlink has changed since
		 * our initial _bt_search() descent.  _bt_getstackbuf() will detect
		 * and recover from this, updating the stack, which ensures that the
		 * new downlink will be inserted at the correct offset.  Even buf's
		 * parent may have changed.
		 */
		pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);

		/*
		 * Unlock the right child.  The left child will be unlocked in
		 * _bt_insertonpg().
		 *
		 * Unlocking the right child must be delayed until here to ensure that
		 * no concurrent VACUUM operation can become confused.  Page deletion
		 * cannot be allowed to fail to re-find a downlink for the rbuf page.
		 * (Actually, this is just a vestige of how things used to work.  The
		 * page deletion code is expected to check for the INCOMPLETE_SPLIT
		 * flag on the left child.  It won't attempt deletion of the right
		 * child until the split is complete.  Despite all this, we opt to
		 * conservatively delay unlocking the right child until here.)
		 */
		_bt_relbuf(rel, rbuf);

		if (pbuf == InvalidBuffer)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
									 RelationGetRelationName(rel), bknum, rbknum)));

		/* Recursively insert into the parent */
		_bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
					   new_item, MAXALIGN(IndexTupleSize(new_item)),
					   stack->bts_offset + 1, 0, isonly);

		/* be tidy */
		pfree(new_item);
	}
}
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
#define DEBUG2
Definition: elog.h:29
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2319
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2444
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:562
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2456
#define RelationGetTargetBlock(relation)
Definition: rel.h:608
uint32 btpo_level
Definition: nbtree.h:66

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newlevel(), _bt_relbuf(), Assert(), BlockNumberIsValid(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleSetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetBlockNumber(), BufferGetPage(), CopyIndexTuple(), DEBUG2, elog, ereport, errcode(), errmsg_internal(), ERROR, IndexTupleSize(), InvalidBuffer, InvalidOffsetNumber, MAXALIGN, P_HIKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), pfree(), RelationGetRelationName, and RelationGetTargetBlock.

Referenced by _bt_finish_split(), and _bt_insertonpg().

◆ _bt_insertonpg()

static void _bt_insertonpg ( Relation  rel,
Relation  heaprel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
BTStack  stack,
IndexTuple  itup,
Size  itemsz,
OffsetNumber  newitemoff,
int  postingoff,
bool  split_only_page 
)
static

Definition at line 1105 of file nbtinsert.c.

1116{
1117 Page page;
1118 BTPageOpaque opaque;
1119 bool isleaf,
1120 isroot,
1121 isrightmost,
1122 isonly;
1123 IndexTuple oposting = NULL;
1124 IndexTuple origitup = NULL;
1125 IndexTuple nposting = NULL;
1126
1127 page = BufferGetPage(buf);
1128 opaque = BTPageGetOpaque(page);
1129 isleaf = P_ISLEAF(opaque);
1130 isroot = P_ISROOT(opaque);
1131 isrightmost = P_RIGHTMOST(opaque);
1132 isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1133
1134 /* child buffer must be given iff inserting on an internal page */
1135 Assert(isleaf == !BufferIsValid(cbuf));
1136 /* tuple must have appropriate number of attributes */
1137 Assert(!isleaf ||
1138 BTreeTupleGetNAtts(itup, rel) ==
1140 Assert(isleaf ||
1141 BTreeTupleGetNAtts(itup, rel) <=
1144 Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1145 /* Caller must always finish incomplete split for us */
1146 Assert(!P_INCOMPLETE_SPLIT(opaque));
1147
1148 /*
1149 * Every internal page should have exactly one negative infinity item at
1150 * all times. Only _bt_split() and _bt_newlevel() should add items that
1151 * become negative infinity items through truncation, since they're the
1152 * only routines that allocate new internal pages.
1153 */
1154 Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1155
1156 /*
1157 * Do we need to split an existing posting list item?
1158 */
1159 if (postingoff != 0)
1160 {
1161 ItemId itemid = PageGetItemId(page, newitemoff);
1162
1163 /*
1164 * The new tuple is a duplicate with a heap TID that falls inside the
1165 * range of an existing posting list tuple on a leaf page. Prepare to
1166 * split an existing posting list. Overwriting the posting list with
1167 * its post-split version is treated as an extra step in either the
1168 * insert or page split critical section.
1169 */
1170 Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1171 oposting = (IndexTuple) PageGetItem(page, itemid);
1172
1173 /*
1174 * postingoff value comes from earlier call to _bt_binsrch_posting().
1175 * Its binary search might think that a plain tuple must be a posting
1176 * list tuple that needs to be split. This can happen with corruption
1177 * involving an existing plain tuple that is a duplicate of the new
1178 * item, up to and including its table TID. Check for that here in
1179 * passing.
1180 *
1181 * Also verify that our caller has made sure that the existing posting
1182 * list tuple does not have its LP_DEAD bit set.
1183 */
1184 if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1185 ereport(ERROR,
1186 (errcode(ERRCODE_INDEX_CORRUPTED),
1187 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1190 newitemoff, BufferGetBlockNumber(buf),
1192
1193 /* use a mutable copy of itup as our itup from here on */
1194 origitup = itup;
1195 itup = CopyIndexTuple(origitup);
1196 nposting = _bt_swap_posting(itup, oposting, postingoff);
1197 /* itup now contains rightmost/max TID from oposting */
1198
1199 /* Alter offset so that newitem goes after posting list */
1200 newitemoff = OffsetNumberNext(newitemoff);
1201 }
1202
1203 /*
1204 * Do we need to split the page to fit the item on it?
1205 *
1206 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1207 * so this comparison is correct even though we appear to be accounting
1208 * only for the item and not for its line pointer.
1209 */
1210 if (PageGetFreeSpace(page) < itemsz)
1211 {
1212 Buffer rbuf;
1213
1214 Assert(!split_only_page);
1215
1216 /* split the buffer into left and right halves */
1217 rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1218 itup, origitup, nposting, postingoff);
1221 BufferGetBlockNumber(rbuf));
1222
1223 /*----------
1224 * By here,
1225 *
1226 * + our target page has been split;
1227 * + the original tuple has been inserted;
1228 * + we have write locks on both the old (left half)
1229 * and new (right half) buffers, after the split; and
1230 * + we know the key we want to insert into the parent
1231 * (it's the "high key" on the left child page).
1232 *
1233 * We're ready to do the parent insertion. We need to hold onto the
1234 * locks for the child pages until we locate the parent, but we can
1235 * at least release the lock on the right child before doing the
1236 * actual insertion. The lock on the left child will be released
1237 * last of all by parent insertion, where it is the 'cbuf' of parent
1238 * page.
1239 *----------
1240 */
1241 _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1242 }
1243 else
1244 {
1245 Buffer metabuf = InvalidBuffer;
1246 Page metapg = NULL;
1247 BTMetaPageData *metad = NULL;
1248 BlockNumber blockcache;
1249
1250 /*
1251 * If we are doing this insert because we split a page that was the
1252 * only one on its tree level, but was not the root, it may have been
1253 * the "fast root". We need to ensure that the fast root link points
1254 * at or above the current page. We can safely acquire a lock on the
1255 * metapage here --- see comments for _bt_newlevel().
1256 */
1257 if (unlikely(split_only_page))
1258 {
1259 Assert(!isleaf);
1260 Assert(BufferIsValid(cbuf));
1261
1262 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1263 metapg = BufferGetPage(metabuf);
1264 metad = BTPageGetMeta(metapg);
1265
1266 if (metad->btm_fastlevel >= opaque->btpo_level)
1267 {
1268 /* no update wanted */
1269 _bt_relbuf(rel, metabuf);
1270 metabuf = InvalidBuffer;
1271 }
1272 }
1273
1274 /* Do the update. No ereport(ERROR) until changes are logged */
1276
1277 if (postingoff != 0)
1278 memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1279
1280 if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1281 false) == InvalidOffsetNumber)
1282 elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1284
1286
1287 if (BufferIsValid(metabuf))
1288 {
1289 /* upgrade meta-page if needed */
1290 if (metad->btm_version < BTREE_NOVAC_VERSION)
1291 _bt_upgrademetapage(metapg);
1293 metad->btm_fastlevel = opaque->btpo_level;
1294 MarkBufferDirty(metabuf);
1295 }
1296
1297 /*
1298 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1299 * finishes a split
1300 */
1301 if (!isleaf)
1302 {
1303 Page cpage = BufferGetPage(cbuf);
1304 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1305
1306 Assert(P_INCOMPLETE_SPLIT(cpageop));
1307 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1308 MarkBufferDirty(cbuf);
1309 }
1310
1311 /* XLOG stuff */
1312 if (RelationNeedsWAL(rel))
1313 {
1314 xl_btree_insert xlrec;
1315 xl_btree_metadata xlmeta;
1316 uint8 xlinfo;
1317 XLogRecPtr recptr;
1318 uint16 upostingoff;
1319
1320 xlrec.offnum = newitemoff;
1321
1324
1325 if (isleaf && postingoff == 0)
1326 {
1327 /* Simple leaf insert */
1328 xlinfo = XLOG_BTREE_INSERT_LEAF;
1329 }
1330 else if (postingoff != 0)
1331 {
1332 /*
1333 * Leaf insert with posting list split. Must include
1334 * postingoff field before newitem/orignewitem.
1335 */
1336 Assert(isleaf);
1337 xlinfo = XLOG_BTREE_INSERT_POST;
1338 }
1339 else
1340 {
1341 /* Internal page insert, which finishes a split on cbuf */
1342 xlinfo = XLOG_BTREE_INSERT_UPPER;
1344
1345 if (BufferIsValid(metabuf))
1346 {
1347 /* Actually, it's an internal page insert + meta update */
1348 xlinfo = XLOG_BTREE_INSERT_META;
1349
1351 xlmeta.version = metad->btm_version;
1352 xlmeta.root = metad->btm_root;
1353 xlmeta.level = metad->btm_level;
1354 xlmeta.fastroot = metad->btm_fastroot;
1355 xlmeta.fastlevel = metad->btm_fastlevel;
1357 xlmeta.allequalimage = metad->btm_allequalimage;
1358
1359 XLogRegisterBuffer(2, metabuf,
1361 XLogRegisterBufData(2, &xlmeta,
1362 sizeof(xl_btree_metadata));
1363 }
1364 }
1365
1367 if (postingoff == 0)
1368 {
1369 /* Just log itup from caller */
1370 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1371 }
1372 else
1373 {
1374 /*
1375 * Insert with posting list split (XLOG_BTREE_INSERT_POST
1376 * record) case.
1377 *
1378 * Log postingoff. Also log origitup, not itup. REDO routine
1379 * must reconstruct final itup (as well as nposting) using
1380 * _bt_swap_posting().
1381 */
1382 upostingoff = postingoff;
1383
1384 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
1385 XLogRegisterBufData(0, origitup,
1386 IndexTupleSize(origitup));
1387 }
1388
1389 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1390
1391 if (BufferIsValid(metabuf))
1392 PageSetLSN(metapg, recptr);
1393 if (!isleaf)
1394 PageSetLSN(BufferGetPage(cbuf), recptr);
1395
1396 PageSetLSN(page, recptr);
1397 }
1398
1400
1401 /* Release subsidiary buffers */
1402 if (BufferIsValid(metabuf))
1403 _bt_relbuf(rel, metabuf);
1404 if (!isleaf)
1405 _bt_relbuf(rel, cbuf);
1406
1407 /*
1408 * Cache the block number if this is the rightmost leaf page. Cache
1409 * may be used by a future inserter within _bt_search_insert().
1410 */
1411 blockcache = InvalidBlockNumber;
1412 if (isrightmost && isleaf && !isroot)
1413 blockcache = BufferGetBlockNumber(buf);
1414
1415 /* Release buffer for insertion target block */
1416 _bt_relbuf(rel, buf);
1417
1418 /*
1419 * If we decided to cache the insertion target block before releasing
1420 * its buffer lock, then cache it now. Check the height of the tree
1421 * first, though. We don't go for the optimization with small
1422 * indexes. Defer final check to this point to ensure that we don't
1423 * call _bt_getrootheight while holding a buffer lock.
1424 */
1425 if (BlockNumberIsValid(blockcache) &&
1427 RelationSetTargetBlock(rel, blockcache);
1428 }
1429
1430 /* be tidy */
1431 if (postingoff != 0)
1432 {
1433 /* itup is actually a modified copy of caller's original */
1434 pfree(nposting);
1435 pfree(itup);
1436 }
1437}
#define InvalidBlockNumber
Definition: block.h:33
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2531
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
uint8_t uint8
Definition: c.h:500
uint16_t uint16
Definition: c.h:501
#define PANIC
Definition: elog.h:42
Pointer Item
Definition: item.h:17
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
Definition: nbtdedup.c:1022
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:30
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition: nbtinsert.c:1467
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675
#define P_ISROOT(opaque)
Definition: nbtree.h:221
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:32
#define SizeOfBtreeInsert
Definition: nbtxlog.h:84
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:27
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:28
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:29
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3134
#define RelationNeedsWAL(relation)
Definition: rel.h:635
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:615
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:524
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:531
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
OffsetNumber offnum
Definition: nbtxlog.h:78
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:364
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34

References _bt_getbuf(), _bt_getrootheight(), _bt_insert_parent(), _bt_relbuf(), _bt_split(), _bt_swap_posting(), _bt_upgrademetapage(), BTScanInsertData::allequalimage, xl_btree_metadata::allequalimage, Assert(), BlockNumberIsValid(), BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTREE_FASTPATH_MIN_LEVEL, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BTreeTupleIsPosting(), buf, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CopyIndexTuple(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, BTScanInsertData::heapkeyspace, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, ItemIdIsDead, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), MAXALIGN, xl_btree_insert::offnum, OffsetNumberNext, P_FIRSTDATAKEY, P_INCOMPLETE_SPLIT, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageAddItem, PageGetFreeSpace(), PageGetItem(), PageGetItemId(), PageSetLSN(), PANIC, pfree(), PredicateLockPageSplit(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, RelationSetTargetBlock, xl_btree_metadata::root, SizeOfBtreeInsert, START_CRIT_SECTION, IndexTupleData::t_tid, unlikely, xl_btree_metadata::version, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_POST, XLOG_BTREE_INSERT_UPPER, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_doinsert(), and _bt_insert_parent().

◆ _bt_newlevel()

static Buffer _bt_newlevel ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
Buffer  rbuf 
)
static

Definition at line 2444 of file nbtinsert.c.

2445{
2446 Buffer rootbuf;
2447 Page lpage,
2448 rootpage;
2449 BlockNumber lbkno,
2450 rbkno;
2451 BlockNumber rootblknum;
2452 BTPageOpaque rootopaque;
2453 BTPageOpaque lopaque;
2454 ItemId itemid;
2455 IndexTuple item;
2456 IndexTuple left_item;
2457 Size left_item_sz;
2458 IndexTuple right_item;
2459 Size right_item_sz;
2460 Buffer metabuf;
2461 Page metapg;
2462 BTMetaPageData *metad;
2463
2464 lbkno = BufferGetBlockNumber(lbuf);
2465 rbkno = BufferGetBlockNumber(rbuf);
2466 lpage = BufferGetPage(lbuf);
2467 lopaque = BTPageGetOpaque(lpage);
2468
2469 /* get a new root page */
2470 rootbuf = _bt_allocbuf(rel, heaprel);
2471 rootpage = BufferGetPage(rootbuf);
2472 rootblknum = BufferGetBlockNumber(rootbuf);
2473
2474 /* acquire lock on the metapage */
2475 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2476 metapg = BufferGetPage(metabuf);
2477 metad = BTPageGetMeta(metapg);
2478
2479 /*
2480 * Create downlink item for left page (old root). The key value used is
2481 * "minus infinity", a sentinel value that's reliably less than any real
2482 * key value that could appear in the left page.
2483 */
2484 left_item_sz = sizeof(IndexTupleData);
2485 left_item = (IndexTuple) palloc(left_item_sz);
2486 left_item->t_info = left_item_sz;
2487 BTreeTupleSetDownLink(left_item, lbkno);
2488 BTreeTupleSetNAtts(left_item, 0, false);
2489
2490 /*
2491 * Create downlink item for right page. The key for it is obtained from
2492 * the "high key" position in the left page.
2493 */
2494 itemid = PageGetItemId(lpage, P_HIKEY);
2495 right_item_sz = ItemIdGetLength(itemid);
2496 item = (IndexTuple) PageGetItem(lpage, itemid);
2497 right_item = CopyIndexTuple(item);
2498 BTreeTupleSetDownLink(right_item, rbkno);
2499
2500 /* NO EREPORT(ERROR) from here till newroot op is logged */
2502
2503 /* upgrade metapage if needed */
2504 if (metad->btm_version < BTREE_NOVAC_VERSION)
2505 _bt_upgrademetapage(metapg);
2506
2507 /* set btree special data */
2508 rootopaque = BTPageGetOpaque(rootpage);
2509 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2510 rootopaque->btpo_flags = BTP_ROOT;
2511 rootopaque->btpo_level =
2512 (BTPageGetOpaque(lpage))->btpo_level + 1;
2513 rootopaque->btpo_cycleid = 0;
2514
2515 /* update metapage data */
2516 metad->btm_root = rootblknum;
2517 metad->btm_level = rootopaque->btpo_level;
2518 metad->btm_fastroot = rootblknum;
2519 metad->btm_fastlevel = rootopaque->btpo_level;
2520
2521 /*
2522 * Insert the left page pointer into the new root page. The root page is
2523 * the rightmost page on its level so there is no "high key" in it; the
2524 * two items will go into positions P_HIKEY and P_FIRSTKEY.
2525 *
2526 * Note: we *must* insert the two items in item-number order, for the
2527 * benefit of _bt_restore_page().
2528 */
2529 Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2530 if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2531 false, false) == InvalidOffsetNumber)
2532 elog(PANIC, "failed to add leftkey to new root page"
2533 " while splitting block %u of index \"%s\"",
2535
2536 /*
2537 * insert the right page pointer into the new root page.
2538 */
2539 Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2540 Assert(BTreeTupleGetNAtts(right_item, rel) <=
2542 if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2543 false, false) == InvalidOffsetNumber)
2544 elog(PANIC, "failed to add rightkey to new root page"
2545 " while splitting block %u of index \"%s\"",
2547
2548 /* Clear the incomplete-split flag in the left child */
2549 Assert(P_INCOMPLETE_SPLIT(lopaque));
2550 lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2551 MarkBufferDirty(lbuf);
2552
2553 MarkBufferDirty(rootbuf);
2554 MarkBufferDirty(metabuf);
2555
2556 /* XLOG stuff */
2557 if (RelationNeedsWAL(rel))
2558 {
2559 xl_btree_newroot xlrec;
2560 XLogRecPtr recptr;
2562
2563 xlrec.rootblk = rootblknum;
2564 xlrec.level = metad->btm_level;
2565
2568
2572
2574 md.version = metad->btm_version;
2575 md.root = rootblknum;
2576 md.level = metad->btm_level;
2577 md.fastroot = rootblknum;
2578 md.fastlevel = metad->btm_level;
2580 md.allequalimage = metad->btm_allequalimage;
2581
2582 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2583
2584 /*
2585 * Direct access to page is not good but faster - we should implement
2586 * some new func in page API.
2587 */
2589 (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2590 ((PageHeader) rootpage)->pd_special -
2591 ((PageHeader) rootpage)->pd_upper);
2592
2593 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2594
2595 PageSetLSN(lpage, recptr);
2596 PageSetLSN(rootpage, recptr);
2597 PageSetLSN(metapg, recptr);
2598 }
2599
2601
2602 /* done with metapage */
2603 _bt_relbuf(rel, metabuf);
2604
2605 pfree(left_item);
2606 pfree(right_item);
2607
2608 return rootbuf;
2609}
size_t Size
Definition: c.h:576
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct IndexTupleData IndexTupleData
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
#define BTP_ROOT
Definition: nbtree.h:77
#define P_NONE
Definition: nbtree.h:212
#define P_FIRSTKEY
Definition: nbtree.h:368
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:595
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
BlockNumber btpo_prev
Definition: nbtree.h:64
BTCycleId btpo_cycleid
Definition: nbtree.h:68
unsigned short t_info
Definition: itup.h:49
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343

References _bt_allocbuf(), _bt_getbuf(), _bt_relbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_ROOT, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), BufferGetBlockNumber(), BufferGetPage(), CopyIndexTuple(), elog, END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), P_FIRSTKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_NONE, PageAddItem, PageGetItem(), PageGetItemId(), PageSetLSN(), palloc(), PANIC, pfree(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, IndexTupleData::t_info, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insert_parent().

◆ _bt_pgaddtup()

static bool _bt_pgaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
inlinestatic

Definition at line 2630 of file nbtinsert.c.

2635{
2636 IndexTupleData trunctuple;
2637
2638 if (newfirstdataitem)
2639 {
2640 trunctuple = *itup;
2641 trunctuple.t_info = sizeof(IndexTupleData);
2642 BTreeTupleSetNAtts(&trunctuple, 0, false);
2643 itup = &trunctuple;
2644 itemsize = sizeof(IndexTupleData);
2645 }
2646
2647 if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2648 false) == InvalidOffsetNumber))
2649 return false;
2650
2651 return true;
2652}

References BTreeTupleSetNAtts(), InvalidOffsetNumber, PageAddItem, IndexTupleData::t_info, and unlikely.

Referenced by _bt_split().

◆ _bt_search_insert()

static BTStack _bt_search_insert ( Relation  rel,
Relation  heaprel,
BTInsertState  insertstate 
)
static

Definition at line 317 of file nbtinsert.c.

318{
319 Assert(insertstate->buf == InvalidBuffer);
320 Assert(!insertstate->bounds_valid);
321 Assert(insertstate->postingoff == 0);
322
324 {
325 /* Simulate a _bt_getbuf() call with conditional locking */
326 insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
327 if (_bt_conditionallockbuf(rel, insertstate->buf))
328 {
329 Page page;
330 BTPageOpaque opaque;
331
332 _bt_checkpage(rel, insertstate->buf);
333 page = BufferGetPage(insertstate->buf);
334 opaque = BTPageGetOpaque(page);
335
336 /*
337 * Check if the page is still the rightmost leaf page and has
338 * enough free space to accommodate the new tuple. Also check
339 * that the insertion scan key is strictly greater than the first
340 * non-pivot tuple on the page. (Note that we expect itup_key's
341 * scantid to be unset when our caller is a checkingunique
342 * inserter.)
343 */
344 if (P_RIGHTMOST(opaque) &&
345 P_ISLEAF(opaque) &&
346 !P_IGNORE(opaque) &&
347 PageGetFreeSpace(page) > insertstate->itemsz &&
349 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
350 {
351 /*
352 * Caller can use the fastpath optimization because cached
353 * block is still rightmost leaf page, which can fit caller's
354 * new tuple without splitting. Keep block in local cache for
355 * next insert, and have caller use NULL stack.
356 *
357 * Note that _bt_insert_parent() has an assertion that catches
358 * leaf page splits that somehow follow from a fastpath insert
359 * (it should only be passed a NULL stack when it must deal
360 * with a concurrent root page split, and never because a NULL
361 * stack was returned here).
362 */
363 return NULL;
364 }
365
366 /* Page unsuitable for caller, drop lock and pin */
367 _bt_relbuf(rel, insertstate->buf);
368 }
369 else
370 {
371 /* Lock unavailable, drop pin */
372 ReleaseBuffer(insertstate->buf);
373 }
374
375 /* Forget block, since cache doesn't appear to be useful */
377 }
378
379 /* Cannot use optimization -- descend tree, return proper descent stack */
380 return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
381 BT_WRITE);
382}
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4852
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:748
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1093
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102

References _bt_checkpage(), _bt_compare(), _bt_conditionallockbuf(), _bt_relbuf(), _bt_search(), Assert(), BTInsertStateData::bounds_valid, BT_WRITE, BTPageGetOpaque, BTInsertStateData::buf, BufferGetPage(), InvalidBlockNumber, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup_key, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, and ReleaseBuffer().

Referenced by _bt_doinsert().

◆ _bt_simpledel_pass()

/*
 * _bt_simpledel_pass() -- run a simple deletion pass over a leaf page.
 *
 * Builds a TM_IndexDeleteOp describing every TID on the page (between
 * minoff and maxoff) whose table block appears among the blocks pointed
 * to by the page's LP_DEAD-set tuples, then hands the whole batch to
 * _bt_delitems_delete_check() for physical deletion.  Including these
 * "extra" same-block TIDs lets the table AM delete additional index
 * tuples essentially for free, beyond the knowndeletable LP_DEAD ones.
 */
static void
_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
				   OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
				   OffsetNumber minoff, OffsetNumber maxoff)
{
	Page		page = BufferGetPage(buffer);
	BlockNumber *deadblocks;
	int			ndeadblocks;
	TM_IndexDeleteOp delstate;
	OffsetNumber offnum;

	/* Sorted array of table blocks referenced by LP_DEAD-set tuples */
	deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
								&ndeadblocks);

	/* Describe the index deletion operation for the table AM */
	delstate.irel = rel;
	delstate.iblknum = BufferGetBlockNumber(buffer);
	delstate.bottomup = false;
	delstate.bottomupfreespace = 0;
	delstate.ndeltids = 0;
	delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
	delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

	for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
		BlockNumber tidblock;

		if (BTreeTupleIsPosting(itup))
		{
			/* Posting list tuple: consider each contained TID separately */
			int			npost = BTreeTupleGetNPosting(itup);

			for (int p = 0; p < npost; p++)
			{
				ItemPointer tid = BTreeTupleGetPostingN(itup, p);

				tidblock = ItemPointerGetBlockNumber(tid);
				if (bsearch(&tidblock, deadblocks, ndeadblocks,
							sizeof(BlockNumber), _bt_blk_cmp) == NULL)
				{
					/* Not a dead block; LP_DEAD tuples must all match */
					Assert(!ItemIdIsDead(itemid));
					continue;
				}

				/*
				 * TID's table block is among those pointed to by the TIDs
				 * from LP_DEAD-bit set tuples on page -- add TID to deltids
				 */
				delstate.deltids[delstate.ndeltids].tid = *tid;
				delstate.deltids[delstate.ndeltids].id = delstate.ndeltids;
				delstate.status[delstate.ndeltids].idxoffnum = offnum;
				delstate.status[delstate.ndeltids].knowndeletable =
					ItemIdIsDead(itemid);
				delstate.status[delstate.ndeltids].promising = false;	/* unused */
				delstate.status[delstate.ndeltids].freespace = 0;	/* unused */
				delstate.ndeltids++;
			}
		}
		else
		{
			/* Plain tuple: single TID */
			tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
			if (bsearch(&tidblock, deadblocks, ndeadblocks,
						sizeof(BlockNumber), _bt_blk_cmp) == NULL)
			{
				/* Not a dead block; LP_DEAD tuples must all match */
				Assert(!ItemIdIsDead(itemid));
				continue;
			}

			/*
			 * TID's table block is among those pointed to by the TIDs from
			 * LP_DEAD-bit set tuples on page -- add TID to deltids
			 */
			delstate.deltids[delstate.ndeltids].tid = itup->t_tid;
			delstate.deltids[delstate.ndeltids].id = delstate.ndeltids;
			delstate.status[delstate.ndeltids].idxoffnum = offnum;
			delstate.status[delstate.ndeltids].knowndeletable =
				ItemIdIsDead(itemid);
			delstate.status[delstate.ndeltids].promising = false;	/* unused */
			delstate.status[delstate.ndeltids].freespace = 0;	/* unused */
			delstate.ndeltids++;
		}
	}

	pfree(deadblocks);

	/* Every LP_DEAD tuple's TID must have been collected */
	Assert(delstate.ndeltids >= ndeletable);

	/* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
	_bt_delitems_delete_check(rel, buffer, heapRel, &delstate);

	pfree(delstate.deltids);
	pfree(delstate.status);
}
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition: nbtinsert.c:2938
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1513
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
TM_IndexStatus * status
Definition: tableam.h:255
int bottomupfreespace
Definition: tableam.h:250
Relation irel
Definition: tableam.h:247
TM_IndexDelete * deltids
Definition: tableam.h:254
BlockNumber iblknum
Definition: tableam.h:248
ItemPointerData tid
Definition: tableam.h:213
bool knowndeletable
Definition: tableam.h:220
bool promising
Definition: tableam.h:223
int16 freespace
Definition: tableam.h:224
OffsetNumber idxoffnum
Definition: tableam.h:219

References _bt_blk_cmp(), _bt_deadblocks(), _bt_delitems_delete_check(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexStatus::freespace, TM_IndexDeleteOp::iblknum, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, TM_IndexDeleteOp::irel, ItemIdIsDead, ItemPointerGetBlockNumber(), TM_IndexStatus::knowndeletable, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, PageGetItem(), PageGetItemId(), palloc(), pfree(), TM_IndexStatus::promising, TM_IndexDeleteOp::status, IndexTupleData::t_tid, and TM_IndexDelete::tid.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_split()

static Buffer _bt_split ( Relation  rel,
Relation  heaprel,
BTScanInsert  itup_key,
Buffer  buf,
Buffer  cbuf,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
IndexTuple  orignewitem,
IndexTuple  nposting,
uint16  postingoff 
)
static

Definition at line 1467 of file nbtinsert.c.

1470{
/*
 * _bt_split() -- split an overfull btree page (buf) into two.
 *
 * The left half is rebuilt in a temp page and copied back over origpage
 * (the left page must keep its block number); the right half goes into a
 * newly allocated page.  Returns the right sibling's buffer, still pinned
 * and write-locked, for the caller (_bt_insertonpg) to insert the new
 * downlink via _bt_insert_parent.
 *
 * NOTE(review): this text is a documentation-tool rendering of
 * nbtinsert.c, not the file itself.  Statement lines that were rendered
 * as hyperlinks were dropped by the extraction and survive only as bare
 * source line numbers (e.g. 1493, 1547, 1555, 1698, 1742, 1766, 1904,
 * 1932, 1944, 1981-1990, 2049, 2063-2064 below).  Consult the real
 * nbtinsert.c before treating this listing as compilable source.
 */
1471 Buffer rbuf;
1472 Page origpage;
1473 Page leftpage,
1474 rightpage;
1475 BlockNumber origpagenumber,
1476 rightpagenumber;
1477 BTPageOpaque ropaque,
1478 lopaque,
1479 oopaque;
1480 Buffer sbuf = InvalidBuffer;
1481 Page spage = NULL;
1482 BTPageOpaque sopaque = NULL;
1483 Size itemsz;
1484 ItemId itemid;
1485 IndexTuple firstright,
1486 lefthighkey;
1487 OffsetNumber firstrightoff;
1488 OffsetNumber afterleftoff,
1489 afterrightoff,
1490 minusinfoff;
1491 OffsetNumber origpagepostingoff;
1492 OffsetNumber maxoff;
/*
 * NOTE(review): source line 1493 was elided by the extraction; it is
 * presumably the declaration of the loop variable "i" used at line 1793
 * (the References list below names "i") -- confirm against nbtinsert.c.
 */
1494 bool newitemonleft,
1495 isleaf,
1496 isrightmost;
1497
1498 /*
1499 * origpage is the original page to be split. leftpage is a temporary
1500 * buffer that receives the left-sibling data, which will be copied back
1501 * into origpage on success. rightpage is the new page that will receive
1502 * the right-sibling data.
1503 *
1504 * leftpage is allocated after choosing a split point. rightpage's new
1505 * buffer isn't acquired until after leftpage is initialized and has new
1506 * high key, the last point where splitting the page may fail (barring
1507 * corruption). Failing before acquiring new buffer won't have lasting
1508 * consequences, since origpage won't have been modified and leftpage is
1509 * only workspace.
1510 */
1511 origpage = BufferGetPage(buf);
1512 oopaque = BTPageGetOpaque(origpage);
1513 isleaf = P_ISLEAF(oopaque);
1514 isrightmost = P_RIGHTMOST(oopaque);
1515 maxoff = PageGetMaxOffsetNumber(origpage);
1516 origpagenumber = BufferGetBlockNumber(buf);
1517
1518 /*
1519 * Choose a point to split origpage at.
1520 *
1521 * A split point can be thought of as a point _between_ two existing data
1522 * items on origpage (the lastleft and firstright tuples), provided you
1523 * pretend that the new item that didn't fit is already on origpage.
1524 *
1525 * Since origpage does not actually contain newitem, the representation of
1526 * split points needs to work with two boundary cases: splits where
1527 * newitem is lastleft, and splits where newitem is firstright.
1528 * newitemonleft resolves the ambiguity that would otherwise exist when
1529 * newitemoff == firstrightoff. In all other cases it's clear which side
1530 * of the split every tuple goes on from context. newitemonleft is
1531 * usually (but not always) redundant information.
1532 *
1533 * firstrightoff is supposed to be an origpage offset number, but it's
1534 * possible that its value will be maxoff+1, which is "past the end" of
1535 * origpage. This happens in the rare case where newitem goes after all
1536 * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1537 * origpage at the point that leaves newitem alone on new right page. Any
1538 * "!newitemonleft && newitemoff == firstrightoff" split point makes
1539 * newitem the firstright tuple, though, so this case isn't a special
1540 * case.
1541 */
1542 firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1543 newitem, &newitemonleft);
1544
1545 /* Allocate temp buffer for leftpage */
1546 leftpage = PageGetTempPage(origpage);
/* NOTE(review): line 1547 elided by extraction (References list _bt_pageinit()) */
1548 lopaque = BTPageGetOpaque(leftpage);
1549
1550 /*
1551 * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1552 * and HAS_GARBAGE flags.
1553 */
1554 lopaque->btpo_flags = oopaque->btpo_flags;
/* NOTE(review): line 1555 elided; per the comment above it clears BTP_ROOT/SPLIT_END/HAS_GARBAGE */
1556 /* set flag in leftpage indicating that rightpage has no downlink yet */
1557 lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1558 lopaque->btpo_prev = oopaque->btpo_prev;
1559 /* handle btpo_next after rightpage buffer acquired */
1560 lopaque->btpo_level = oopaque->btpo_level;
1561 /* handle btpo_cycleid after rightpage buffer acquired */
1562
1563 /*
1564 * Copy the original page's LSN into leftpage, which will become the
1565 * updated version of the page. We need this because XLogInsert will
1566 * examine the LSN and possibly dump it in a page image.
1567 */
1568 PageSetLSN(leftpage, PageGetLSN(origpage));
1569
1570 /*
1571 * Determine page offset number of existing overlapped-with-orignewitem
1572 * posting list when it is necessary to perform a posting list split in
1573 * passing. Note that newitem was already changed by caller (newitem no
1574 * longer has the orignewitem TID).
1575 *
1576 * This page offset number (origpagepostingoff) will be used to pretend
1577 * that the posting split has already taken place, even though the
1578 * required modifications to origpage won't occur until we reach the
1579 * critical section. The lastleft and firstright tuples of our page split
1580 * point should, in effect, come from an imaginary version of origpage
1581 * that has the nposting tuple instead of the original posting list tuple.
1582 *
1583 * Note: _bt_findsplitloc() should have compensated for coinciding posting
1584 * list splits in just the same way, at least in theory. It doesn't
1585 * bother with that, though. In practice it won't affect its choice of
1586 * split point.
1587 */
1588 origpagepostingoff = InvalidOffsetNumber;
1589 if (postingoff != 0)
1590 {
1591 Assert(isleaf);
1592 Assert(ItemPointerCompare(&orignewitem->t_tid,
1593 &newitem->t_tid) < 0);
1594 Assert(BTreeTupleIsPosting(nposting));
1595 origpagepostingoff = OffsetNumberPrev(newitemoff);
1596 }
1597
1598 /*
1599 * The high key for the new left page is a possibly-truncated copy of
1600 * firstright on the leaf level (it's "firstright itself" on internal
1601 * pages; see !isleaf comments below). This may seem to be contrary to
1602 * Lehman & Yao's approach of using a copy of lastleft as the new high key
1603 * when splitting on the leaf level. It isn't, though.
1604 *
1605 * Suffix truncation will leave the left page's high key fully equal to
1606 * lastleft when lastleft and firstright are equal prior to heap TID (that
1607 * is, the tiebreaker TID value comes from lastleft). It isn't actually
1608 * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1609 * "subtree" invariant to hold. It's sufficient to make sure that the new
1610 * leaf high key is strictly less than firstright, and greater than or
1611 * equal to (not necessarily equal to) lastleft. In other words, when
1612 * suffix truncation isn't possible during a leaf page split, we take
1613 * L&Y's exact approach to generating a new high key for the left page.
1614 * (Actually, that is slightly inaccurate. We don't just use a copy of
1615 * lastleft. A tuple with all the keys from firstright but the max heap
1616 * TID from lastleft is used, to avoid introducing a special case.)
1617 */
1618 if (!newitemonleft && newitemoff == firstrightoff)
1619 {
1620 /* incoming tuple becomes firstright */
1621 itemsz = newitemsz;
1622 firstright = newitem;
1623 }
1624 else
1625 {
1626 /* existing item at firstrightoff becomes firstright */
1627 itemid = PageGetItemId(origpage, firstrightoff);
1628 itemsz = ItemIdGetLength(itemid);
1629 firstright = (IndexTuple) PageGetItem(origpage, itemid);
1630 if (firstrightoff == origpagepostingoff)
1631 firstright = nposting;
1632 }
1633
1634 if (isleaf)
1635 {
1636 IndexTuple lastleft;
1637
1638 /* Attempt suffix truncation for leaf page splits */
1639 if (newitemonleft && newitemoff == firstrightoff)
1640 {
1641 /* incoming tuple becomes lastleft */
1642 lastleft = newitem;
1643 }
1644 else
1645 {
1646 OffsetNumber lastleftoff;
1647
1648 /* existing item before firstrightoff becomes lastleft */
1649 lastleftoff = OffsetNumberPrev(firstrightoff);
1650 Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1651 itemid = PageGetItemId(origpage, lastleftoff);
1652 lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1653 if (lastleftoff == origpagepostingoff)
1654 lastleft = nposting;
1655 }
1656
1657 lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1658 itemsz = IndexTupleSize(lefthighkey);
1659 }
1660 else
1661 {
1662 /*
1663 * Don't perform suffix truncation on a copy of firstright to make
1664 * left page high key for internal page splits. Must use firstright
1665 * as new high key directly.
1666 *
1667 * Each distinct separator key value originates as a leaf level high
1668 * key; all other separator keys/pivot tuples are copied from one
1669 * level down. A separator key in a grandparent page must be
1670 * identical to high key in rightmost parent page of the subtree to
1671 * its left, which must itself be identical to high key in rightmost
1672 * child page of that same subtree (this even applies to separator
1673 * from grandparent's high key). There must always be an unbroken
1674 * "seam" of identical separator keys that guide index scans at every
1675 * level, starting from the grandparent. That's why suffix truncation
1676 * is unsafe here.
1677 *
1678 * Internal page splits will truncate firstright into a "negative
1679 * infinity" data item when it gets inserted on the new right page
1680 * below, though. This happens during the call to _bt_pgaddtup() for
1681 * the new first data item for right page. Do not confuse this
1682 * mechanism with suffix truncation. It is just a convenient way of
1683 * implementing page splits that split the internal page "inside"
1684 * firstright. The lefthighkey separator key cannot appear a second
1685 * time in the right page (only firstright's downlink goes in right
1686 * page).
1687 */
1688 lefthighkey = firstright;
1689 }
1690
1691 /*
1692 * Add new high key to leftpage
1693 */
1694 afterleftoff = P_HIKEY;
1695
1696 Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1697 Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
/* NOTE(review): line 1698 elided (References list IndexRelationGetNumberOfKeyAttributes) */
1699 Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1700 if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1701 false) == InvalidOffsetNumber)
1702 elog(ERROR, "failed to add high key to the left sibling"
1703 " while splitting block %u of index \"%s\"",
1704 origpagenumber, RelationGetRelationName(rel));
1705 afterleftoff = OffsetNumberNext(afterleftoff);
1706
1707 /*
1708 * Acquire a new right page to split into, now that left page has a new
1709 * high key. From here on, it's not okay to throw an error without
1710 * zeroing rightpage first. This coding rule ensures that we won't
1711 * confuse future VACUUM operations, which might otherwise try to re-find
1712 * a downlink to a leftover junk page as the page undergoes deletion.
1713 *
1714 * It would be reasonable to start the critical section just after the new
1715 * rightpage buffer is acquired instead; that would allow us to avoid
1716 * leftover junk pages without bothering to zero rightpage. We do it this
1717 * way because it avoids an unnecessary PANIC when either origpage or its
1718 * existing sibling page are corrupt.
1719 */
1720 rbuf = _bt_allocbuf(rel, heaprel);
1721 rightpage = BufferGetPage(rbuf);
1722 rightpagenumber = BufferGetBlockNumber(rbuf);
1723 /* rightpage was initialized by _bt_allocbuf */
1724 ropaque = BTPageGetOpaque(rightpage);
1725
1726 /*
1727 * Finish off remaining leftpage special area fields. They cannot be set
1728 * before both origpage (leftpage) and rightpage buffers are acquired and
1729 * locked.
1730 *
1731 * btpo_cycleid is only used with leaf pages, though we set it here in all
1732 * cases just to be consistent.
1733 */
1734 lopaque->btpo_next = rightpagenumber;
1735 lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1736
1737 /*
1738 * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1739 * and HAS_GARBAGE flags.
1740 */
1741 ropaque->btpo_flags = oopaque->btpo_flags;
/* NOTE(review): line 1742 elided; per the comment above it clears BTP_ROOT/SPLIT_END/HAS_GARBAGE */
1743 ropaque->btpo_prev = origpagenumber;
1744 ropaque->btpo_next = oopaque->btpo_next;
1745 ropaque->btpo_level = oopaque->btpo_level;
1746 ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1747
1748 /*
1749 * Add new high key to rightpage where necessary.
1750 *
1751 * If the page we're splitting is not the rightmost page at its level in
1752 * the tree, then the first entry on the page is the high key from
1753 * origpage.
1754 */
1755 afterrightoff = P_HIKEY;
1756
1757 if (!isrightmost)
1758 {
1759 IndexTuple righthighkey;
1760
1761 itemid = PageGetItemId(origpage, P_HIKEY);
1762 itemsz = ItemIdGetLength(itemid);
1763 righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1764 Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1765 Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
/* NOTE(review): line 1766 elided (matches the elided continuation at 1698) */
1767 if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1768 false, false) == InvalidOffsetNumber)
1769 {
1770 memset(rightpage, 0, BufferGetPageSize(rbuf));
1771 elog(ERROR, "failed to add high key to the right sibling"
1772 " while splitting block %u of index \"%s\"",
1773 origpagenumber, RelationGetRelationName(rel));
1774 }
1775 afterrightoff = OffsetNumberNext(afterrightoff);
1776 }
1777
1778 /*
1779 * Internal page splits truncate first data item on right page -- it
1780 * becomes "minus infinity" item for the page. Set this up here.
1781 */
1782 minusinfoff = InvalidOffsetNumber;
1783 if (!isleaf)
1784 minusinfoff = afterrightoff;
1785
1786 /*
1787 * Now transfer all the data items (non-pivot tuples in isleaf case, or
1788 * additional pivot tuples in !isleaf case) to the appropriate page.
1789 *
1790 * Note: we *must* insert at least the right page's items in item-number
1791 * order, for the benefit of _bt_restore_page().
1792 */
1793 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1794 {
1795 IndexTuple dataitem;
1796
1797 itemid = PageGetItemId(origpage, i);
1798 itemsz = ItemIdGetLength(itemid);
1799 dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1800
1801 /* replace original item with nposting due to posting split? */
1802 if (i == origpagepostingoff)
1803 {
1804 Assert(BTreeTupleIsPosting(dataitem));
1805 Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1806 dataitem = nposting;
1807 }
1808
1809 /* does new item belong before this one? */
1810 else if (i == newitemoff)
1811 {
1812 if (newitemonleft)
1813 {
1814 Assert(newitemoff <= firstrightoff);
1815 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1816 false))
1817 {
1818 memset(rightpage, 0, BufferGetPageSize(rbuf));
1819 elog(ERROR, "failed to add new item to the left sibling"
1820 " while splitting block %u of index \"%s\"",
1821 origpagenumber, RelationGetRelationName(rel));
1822 }
1823 afterleftoff = OffsetNumberNext(afterleftoff);
1824 }
1825 else
1826 {
1827 Assert(newitemoff >= firstrightoff);
1828 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1829 afterrightoff == minusinfoff))
1830 {
1831 memset(rightpage, 0, BufferGetPageSize(rbuf));
1832 elog(ERROR, "failed to add new item to the right sibling"
1833 " while splitting block %u of index \"%s\"",
1834 origpagenumber, RelationGetRelationName(rel));
1835 }
1836 afterrightoff = OffsetNumberNext(afterrightoff);
1837 }
1838 }
1839
1840 /* decide which page to put it on */
1841 if (i < firstrightoff)
1842 {
1843 if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1844 {
1845 memset(rightpage, 0, BufferGetPageSize(rbuf));
1846 elog(ERROR, "failed to add old item to the left sibling"
1847 " while splitting block %u of index \"%s\"",
1848 origpagenumber, RelationGetRelationName(rel));
1849 }
1850 afterleftoff = OffsetNumberNext(afterleftoff);
1851 }
1852 else
1853 {
1854 if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1855 afterrightoff == minusinfoff))
1856 {
1857 memset(rightpage, 0, BufferGetPageSize(rbuf));
1858 elog(ERROR, "failed to add old item to the right sibling"
1859 " while splitting block %u of index \"%s\"",
1860 origpagenumber, RelationGetRelationName(rel));
1861 }
1862 afterrightoff = OffsetNumberNext(afterrightoff);
1863 }
1864 }
1865
1866 /* Handle case where newitem goes at the end of rightpage */
1867 if (i <= newitemoff)
1868 {
1869 /*
1870 * Can't have newitemonleft here; that would imply we were told to put
1871 * *everything* on the left page, which cannot fit (if it could, we'd
1872 * not be splitting the page).
1873 */
1874 Assert(!newitemonleft && newitemoff == maxoff + 1);
1875 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1876 afterrightoff == minusinfoff))
1877 {
1878 memset(rightpage, 0, BufferGetPageSize(rbuf));
1879 elog(ERROR, "failed to add new item to the right sibling"
1880 " while splitting block %u of index \"%s\"",
1881 origpagenumber, RelationGetRelationName(rel));
1882 }
1883 afterrightoff = OffsetNumberNext(afterrightoff);
1884 }
1885
1886 /*
1887 * We have to grab the original right sibling (if any) and update its prev
1888 * link. We are guaranteed that this is deadlock-free, since we couple
1889 * the locks in the standard order: left to right.
1890 */
1891 if (!isrightmost)
1892 {
1893 sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1894 spage = BufferGetPage(sbuf);
1895 sopaque = BTPageGetOpaque(spage);
1896 if (sopaque->btpo_prev != origpagenumber)
1897 {
1898 memset(rightpage, 0, BufferGetPageSize(rbuf));
1899 ereport(ERROR,
1900 (errcode(ERRCODE_INDEX_CORRUPTED),
1901 errmsg_internal("right sibling's left-link doesn't match: "
1902 "block %u links to %u instead of expected %u in index \"%s\"",
1903 oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
/* NOTE(review): line 1904 elided (presumably RelationGetRelationName(rel)))); -- confirm) */
1905 }
1906
1907 /*
1908 * Check to see if we can set the SPLIT_END flag in the right-hand
1909 * split page; this can save some I/O for vacuum since it need not
1910 * proceed to the right sibling. We can set the flag if the right
1911 * sibling has a different cycleid: that means it could not be part of
1912 * a group of pages that were all split off from the same ancestor
1913 * page. If you're confused, imagine that page A splits to A B and
1914 * then again, yielding A C B, while vacuum is in progress. Tuples
1915 * originally in A could now be in either B or C, hence vacuum must
1916 * examine both pages. But if D, our right sibling, has a different
1917 * cycleid then it could not contain any tuples that were in A when
1918 * the vacuum started.
1919 */
1920 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1921 ropaque->btpo_flags |= BTP_SPLIT_END;
1922 }
1923
1924 /*
1925 * Right sibling is locked, new siblings are prepared, but original page
1926 * is not updated yet.
1927 *
1928 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1929 * not starting the critical section till here because we haven't been
1930 * scribbling on the original page yet; see comments above.
1931 */
/* NOTE(review): line 1932/1933 content elided; per the comment above and the References list this is START_CRIT_SECTION() */
1933
1934 /*
1935 * By here, the original data page has been split into two new halves, and
1936 * these are correct. The algorithm requires that the left page never
1937 * move during a split, so we copy the new left page back on top of the
1938 * original. We need to do this before writing the WAL record, so that
1939 * XLogInsert can WAL log an image of the page if necessary.
1940 */
1941 PageRestoreTempPage(leftpage, origpage);
1942 /* leftpage, lopaque must not be used below here */
1943
/* NOTE(review): line 1944 elided; presumably MarkBufferDirty(buf) to pair with the rbuf call below -- confirm */
1945 MarkBufferDirty(rbuf);
1946
1947 if (!isrightmost)
1948 {
1949 sopaque->btpo_prev = rightpagenumber;
1950 MarkBufferDirty(sbuf);
1951 }
1952
1953 /*
1954 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1955 * a split
1956 */
1957 if (!isleaf)
1958 {
1959 Page cpage = BufferGetPage(cbuf);
1960 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1961
1962 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1963 MarkBufferDirty(cbuf);
1964 }
1965
1966 /* XLOG stuff */
1967 if (RelationNeedsWAL(rel))
1968 {
1969 xl_btree_split xlrec;
1970 uint8 xlinfo;
1971 XLogRecPtr recptr;
1972
1973 xlrec.level = ropaque->btpo_level;
1974 /* See comments below on newitem, orignewitem, and posting lists */
1975 xlrec.firstrightoff = firstrightoff;
1976 xlrec.newitemoff = newitemoff;
1977 xlrec.postingoff = 0;
1978 if (postingoff != 0 && origpagepostingoff < firstrightoff)
1979 xlrec.postingoff = postingoff;
1980
/*
 * NOTE(review): lines 1981-1985 elided; per the References list they are
 * the XLogBeginInsert()/XLogRegisterData(&xlrec, SizeOfBtreeSplit) calls
 * plus XLogRegisterBuffer() for the left and right pages -- confirm.
 */
1983
1986 /* Log original right sibling, since we've changed its prev-pointer */
1987 if (!isrightmost)
/* NOTE(review): lines 1988 and 1990 elided; presumably XLogRegisterBuffer() calls for sbuf and cbuf -- confirm */
1989 if (!isleaf)
1991
1992 /*
1993 * Log the new item, if it was inserted on the left page. (If it was
1994 * put on the right page, we don't need to explicitly WAL log it
1995 * because it's included with all the other items on the right page.)
1996 * Show the new item as belonging to the left page buffer, so that it
1997 * is not stored if XLogInsert decides it needs a full-page image of
1998 * the left page. We always store newitemoff in the record, though.
1999 *
2000 * The details are sometimes slightly different for page splits that
2001 * coincide with a posting list split. If both the replacement
2002 * posting list and newitem go on the right page, then we don't need
2003 * to log anything extra, just like the simple !newitemonleft
2004 * no-posting-split case (postingoff is set to zero in the WAL record,
2005 * so recovery doesn't need to process a posting list split at all).
2006 * Otherwise, we set postingoff and log orignewitem instead of
2007 * newitem, despite having actually inserted newitem. REDO routine
2008 * must reconstruct nposting and newitem using _bt_swap_posting().
2009 *
2010 * Note: It's possible that our page split point is the point that
2011 * makes the posting list lastleft and newitem firstright. This is
2012 * the only case where we log orignewitem/newitem despite newitem
2013 * going on the right page. If XLogInsert decides that it can omit
2014 * orignewitem due to logging a full-page image of the left page,
2015 * everything still works out, since recovery only needs to log
2016 * orignewitem for items on the left page (just like the regular
2017 * newitem-logged case).
2018 */
2019 if (newitemonleft && xlrec.postingoff == 0)
2020 XLogRegisterBufData(0, newitem, newitemsz);
2021 else if (xlrec.postingoff != 0)
2022 {
2023 Assert(isleaf);
2024 Assert(newitemonleft || firstrightoff == newitemoff);
2025 Assert(newitemsz == IndexTupleSize(orignewitem));
2026 XLogRegisterBufData(0, orignewitem, newitemsz);
2027 }
2028
2029 /* Log the left page's new high key */
2030 if (!isleaf)
2031 {
2032 /* lefthighkey isn't local copy, get current pointer */
2033 itemid = PageGetItemId(origpage, P_HIKEY);
2034 lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2035 }
2036 XLogRegisterBufData(0, lefthighkey,
2037 MAXALIGN(IndexTupleSize(lefthighkey)));
2038
2039 /*
2040 * Log the contents of the right page in the format understood by
2041 * _bt_restore_page(). The whole right page will be recreated.
2042 *
2043 * Direct access to page is not good but faster - we should implement
2044 * some new func in page API. Note we only store the tuples
2045 * themselves, knowing that they were inserted in item-number order
2046 * and so the line pointers can be reconstructed. See comments for
2047 * _bt_restore_page().
2048 */
/* NOTE(review): line 2049 elided; presumably the XLogRegisterBufData(1, ...) call whose arguments follow -- confirm */
2050 (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2051 ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2052
2053 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2054 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2055
2056 PageSetLSN(origpage, recptr);
2057 PageSetLSN(rightpage, recptr);
2058 if (!isrightmost)
2059 PageSetLSN(spage, recptr);
2060 if (!isleaf)
2061 PageSetLSN(BufferGetPage(cbuf), recptr);
2062 }
2063
/* NOTE(review): lines 2063-2065 content elided; per the References list this is END_CRIT_SECTION() */
2065
2066 /* release the old right sibling */
2067 if (!isrightmost)
2068 _bt_relbuf(rel, sbuf);
2069
2070 /* release the child */
2071 if (!isleaf)
2072 _bt_relbuf(rel, cbuf);
2073
2074 /* be tidy */
2075 if (isleaf)
2076 pfree(lefthighkey);
2077
2078 /* split's done */
2079 return rbuf;
2080}
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:389
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:413
Page PageGetTempPage(const PageData *page)
Definition: bufpage.c:354
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2630
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:83
#define BTP_SPLIT_END
Definition: nbtree.h:81
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:129
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:2550
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2813
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:31
#define SizeOfBtreeSplit
Definition: nbtxlog.h:158
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:30
uint16 postingoff
Definition: nbtxlog.h:155
OffsetNumber firstrightoff
Definition: nbtxlog.h:153
uint32 level
Definition: nbtxlog.h:152
OffsetNumber newitemoff
Definition: nbtxlog.h:154

References _bt_allocbuf(), _bt_findsplitloc(), _bt_getbuf(), _bt_pageinit(), _bt_pgaddtup(), _bt_relbuf(), _bt_truncate(), _bt_vacuum_cycleid(), Assert(), BT_WRITE, BTP_HAS_GARBAGE, BTP_INCOMPLETE_SPLIT, BTP_ROOT, BTP_SPLIT_END, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BTreeTupleIsPosting(), buf, BufferGetBlockNumber(), BufferGetPage(), BufferGetPageSize(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg_internal(), ERROR, xl_btree_split::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), InvalidBuffer, InvalidOffsetNumber, ItemIdGetLength, ItemPointerCompare(), xl_btree_split::level, MarkBufferDirty(), MAXALIGN, xl_btree_split::newitemoff, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageAddItem, PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPage(), PageRestoreTempPage(), PageSetLSN(), pfree(), xl_btree_split::postingoff, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeSplit, START_CRIT_SECTION, IndexTupleData::t_tid, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_R, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_insertonpg().

◆ _bt_stepright()

static void _bt_stepright ( Relation  rel,
Relation  heaprel,
BTInsertState  insertstate,
BTStack  stack 
)
static

Definition at line 1027 of file nbtinsert.c.

1029{
/*
 * _bt_stepright() -- move the insertion state one page to the right.
 *
 * Walks right from insertstate->buf until a live (non-P_IGNORE) page is
 * found, finishing any incomplete split encountered along the way.  On
 * return insertstate->buf holds the write-locked right sibling and the
 * cached binary-search bounds are invalidated (bounds_valid = false).
 * Errors out if it falls off the rightmost page, which should be
 * impossible for a correctly structured tree.
 *
 * NOTE(review): this is a documentation-tool rendering of nbtinsert.c;
 * source line 1064 (the elog argument, presumably
 * RelationGetRelationName(rel)); -- confirm) was dropped by the
 * extraction below.
 */
1030 Page page;
1031 BTPageOpaque opaque;
1032 Buffer rbuf;
1033 BlockNumber rblkno;
1034
1035 Assert(heaprel != NULL);
1036 page = BufferGetPage(insertstate->buf);
1037 opaque = BTPageGetOpaque(page);
1038
1039 rbuf = InvalidBuffer;
1040 rblkno = opaque->btpo_next;
1041 for (;;)
1042 {
1043 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1044 page = BufferGetPage(rbuf);
1045 opaque = BTPageGetOpaque(page);
1046
1047 /*
1048 * If this page was incompletely split, finish the split now. We do
1049 * this while holding a lock on the left sibling, which is not good
1050 * because finishing the split could be a fairly lengthy operation.
1051 * But this should happen very seldom.
1052 */
1053 if (P_INCOMPLETE_SPLIT(opaque))
1054 {
1055 _bt_finish_split(rel, heaprel, rbuf, stack);
1056 rbuf = InvalidBuffer;
1057 continue;
1058 }
1059
1060 if (!P_IGNORE(opaque))
1061 break;
1062 if (P_RIGHTMOST(opaque))
1063 elog(ERROR, "fell off the end of index \"%s\"",
1065
1066 rblkno = opaque->btpo_next;
1067 }
1068 /* rbuf locked; unlock buf, update state for caller */
1069 _bt_relbuf(rel, insertstate->buf);
1070 insertstate->buf = rbuf;
1071 insertstate->bounds_valid = false;
1072}

References _bt_finish_split(), _bt_relandgetbuf(), _bt_relbuf(), Assert(), BTInsertStateData::bounds_valid, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTInsertStateData::buf, BufferGetPage(), elog, ERROR, InvalidBuffer, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, and RelationGetRelationName.

Referenced by _bt_findinsertloc().