PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtinsert.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtinsert.c
4 * Item insertion in Lehman and Yao btrees for Postgres.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtinsert.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "access/nbtree.h"
19#include "access/nbtxlog.h"
20#include "access/tableam.h"
21#include "access/transam.h"
22#include "access/xloginsert.h"
23#include "common/int.h"
24#include "common/pg_prng.h"
25#include "lib/qunique.h"
26#include "miscadmin.h"
27#include "storage/lmgr.h"
28#include "storage/predicate.h"
30
31/* Minimum tree height for application of fastpath optimization */
32#define BTREE_FASTPATH_MIN_LEVEL 2
33
34
35static BTStack _bt_search_insert(Relation rel, Relation heaprel,
38 Relation heapRel,
40 uint32 *speculativeToken);
43 bool checkingunique,
44 bool indexUnchanged,
45 BTStack stack,
46 Relation heapRel);
47static void _bt_stepright(Relation rel, Relation heaprel,
49static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
50 Buffer buf,
52 BTStack stack,
53 IndexTuple itup,
54 Size itemsz,
55 OffsetNumber newitemoff,
56 int postingoff,
57 bool split_only_page);
58static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
59 Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
60 Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
61 IndexTuple nposting, uint16 postingoff);
62static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
63 Buffer rbuf, BTStack stack, bool isroot, bool isonly);
64static void _bt_freestack(BTStack stack);
66static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
68static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
70 bool simpleonly, bool checkingunique,
71 bool uniquedup, bool indexUnchanged);
72static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
74 IndexTuple newitem, OffsetNumber minoff,
75 OffsetNumber maxoff);
77 int ndeletable, IndexTuple newitem,
78 int *nblocks);
79static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
80
81/*
82 * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
83 *
84 * This routine is called by the public interface routine, btinsert.
85 * By here, itup is filled in, including the TID.
86 *
87 * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
88 * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
89 * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
90 * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
91 * don't actually insert.
92 *
93 * indexUnchanged executor hint indicates if itup is from an
94 * UPDATE that didn't logically change the indexed value, but
95 * must nevertheless have a new entry to point to a successor
96 * version.
97 *
98 * The result value is only significant for UNIQUE_CHECK_PARTIAL:
99 * it must be true if the entry is known unique, else false.
100 * (In the current implementation we'll also return true after a
101 * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
102 * that's just a coding artifact.)
103 */
104bool
107 Relation heapRel)
108{
109 bool is_unique = false;
111 BTScanInsert itup_key;
112 BTStack stack;
114
115 /* we need an insertion scan key to do our search, so build one */
116 itup_key = _bt_mkscankey(rel, itup);
117
118 if (checkingunique)
119 {
120 if (!itup_key->anynullkeys)
121 {
122 /* No (heapkeyspace) scantid until uniqueness established */
123 itup_key->scantid = NULL;
124 }
125 else
126 {
127 /*
128 * Scan key for new tuple contains NULL key values. Bypass
129 * checkingunique steps. They are unnecessary because core code
130 * considers NULL unequal to every value, including NULL.
131 *
132 * This optimization avoids O(N^2) behavior within the
133 * _bt_findinsertloc() heapkeyspace path when a unique index has a
134 * large number of "duplicates" with NULL key values.
135 */
136 checkingunique = false;
137 /* Tuple is unique in the sense that core code cares about */
139 is_unique = true;
140 }
141 }
142
143 /*
144 * Fill in the BTInsertState working area, to track the current page and
145 * position within the page to insert on.
146 *
147 * Note that itemsz is passed down to lower level code that deals with
148 * inserting the item. It must be MAXALIGN()'d. This ensures that space
149 * accounting code consistently considers the alignment overhead that we
150 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
151 * already conservative about alignment, but we don't rely on that from
152 * this distance. Besides, preserving the "true" tuple size in index
153 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
154 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
155 */
156 insertstate.itup = itup;
157 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
158 insertstate.itup_key = itup_key;
159 insertstate.bounds_valid = false;
161 insertstate.postingoff = 0;
162
163search:
164
165 /*
166 * Find and lock the leaf page that the tuple should be added to by
167 * searching from the root page. insertstate.buf will hold a buffer that
168 * is locked in exclusive mode afterwards.
169 */
170 stack = _bt_search_insert(rel, heapRel, &insertstate);
171
172 /*
173 * checkingunique inserts are not allowed to go ahead when two tuples with
174 * equal key attribute values would be visible to new MVCC snapshots once
175 * the xact commits. Check for conflicts in the locked page/buffer (if
176 * needed) here.
177 *
178 * It might be necessary to check a page to the right in _bt_check_unique,
179 * though that should be very rare. In practice the first page the value
180 * could be on (with scantid omitted) is almost always also the only page
181 * that a matching tuple might be found on. This is due to the behavior
182 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
183 * only be allowed to cross a page boundary when there is no candidate
184 * leaf page split point that avoids it. Also, _bt_check_unique can use
185 * the leaf page high key to determine that there will be no duplicates on
186 * the right sibling without actually visiting it (it uses the high key in
187 * cases where the new item happens to belong at the far right of the leaf
188 * page).
189 *
190 * NOTE: obviously, _bt_check_unique can only detect keys that are already
191 * in the index; so it cannot defend against concurrent insertions of the
192 * same key. We protect against that by means of holding a write lock on
193 * the first page the value could be on, with omitted/-inf value for the
194 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
195 * the same key must acquire a write lock on the same page, so only one
196 * would-be inserter can be making the check at one time. Furthermore,
197 * once we are past the check we hold write locks continuously until we
198 * have performed our insertion, so no later inserter can fail to see our
199 * insertion. (This requires some care in _bt_findinsertloc.)
200 *
201 * If we must wait for another xact, we release the lock while waiting,
202 * and then must perform a new search.
203 *
204 * For a partial uniqueness check, we don't wait for the other xact. Just
205 * let the tuple in and return false for possibly non-unique, or true for
206 * definitely unique.
207 */
208 if (checkingunique)
209 {
211 uint32 speculativeToken;
212
214 &is_unique, &speculativeToken);
215
217 {
218 /* Have to wait for the other guy ... */
219 _bt_relbuf(rel, insertstate.buf);
221
222 /*
223 * If it's a speculative insertion, wait for it to finish (ie. to
224 * go ahead with the insertion, or kill the tuple). Otherwise
225 * wait for the transaction to finish as usual.
226 */
227 if (speculativeToken)
228 SpeculativeInsertionWait(xwait, speculativeToken);
229 else
231
232 /* start over... */
233 if (stack)
234 _bt_freestack(stack);
235 goto search;
236 }
237
238 /* Uniqueness is established -- restore heap tid as scantid */
239 if (itup_key->heapkeyspace)
240 itup_key->scantid = &itup->t_tid;
241 }
242
244 {
245 OffsetNumber newitemoff;
246
247 /*
248 * The only conflict predicate locking cares about for indexes is when
249 * an index tuple insert conflicts with an existing lock. We don't
250 * know the actual page we're going to insert on for sure just yet in
251 * checkingunique and !heapkeyspace cases, but it's okay to use the
252 * first page the value could be on (with scantid omitted) instead.
253 */
255
256 /*
257 * Do the insertion. Note that insertstate contains cached binary
258 * search bounds established within _bt_check_unique when insertion is
259 * checkingunique.
260 */
261 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
262 indexUnchanged, stack, heapRel);
263 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
264 stack, itup, insertstate.itemsz, newitemoff,
265 insertstate.postingoff, false);
266 }
267 else
268 {
269 /* just release the buffer */
270 _bt_relbuf(rel, insertstate.buf);
271 }
272
273 /* be tidy */
274 if (stack)
275 _bt_freestack(stack);
276 pfree(itup_key);
277
278 return is_unique;
279}
280
281/*
282 * _bt_search_insert() -- _bt_search() wrapper for inserts
283 *
284 * Search the tree for a particular scankey, or more precisely for the first
285 * leaf page it could be on. Try to make use of the fastpath optimization's
286 * rightmost leaf page cache before actually searching the tree from the root
287 * page, though.
288 *
289 * Return value is a stack of parent-page pointers (though see notes about
290 * fastpath optimization and page splits below). insertstate->buf is set to
291 * the address of the leaf-page buffer, which is write-locked and pinned in
292 * all cases (if necessary by creating a new empty root page for caller).
293 *
294 * The fastpath optimization avoids most of the work of searching the tree
295 * repeatedly when a single backend inserts successive new tuples on the
296 * rightmost leaf page of an index. A backend cache of the rightmost leaf
297 * page is maintained within _bt_insertonpg(), and used here. The cache is
298 * invalidated here when an insert of a non-pivot tuple must take place on a
299 * non-rightmost leaf page.
300 *
301 * The optimization helps with indexes on an auto-incremented field. It also
302 * helps with indexes on datetime columns, as well as indexes with lots of
303 * NULL values. (NULLs usually get inserted in the rightmost page for single
304 * column indexes, since they usually get treated as coming after everything
305 * else in the key space. Individual NULL tuples will generally be placed on
306 * the rightmost leaf page due to the influence of the heap TID column.)
307 *
308 * Note that we avoid applying the optimization when there is insufficient
309 * space on the rightmost page to fit caller's new item. This is necessary
310 * because we'll need to return a real descent stack when a page split is
311 * expected (actually, caller can cope with a leaf page split that uses a NULL
312 * stack, but that's very slow and so must be avoided). Note also that the
313 * fastpath optimization acquires the lock on the page conditionally as a way
314 * of reducing extra contention when there are concurrent insertions into the
315 * rightmost page (we give up if we'd have to wait for the lock). We assume
316 * that it isn't useful to apply the optimization when there is contention,
317 * since each per-backend cache won't stay valid for long.
318 */
319static BTStack
321{
323 Assert(!insertstate->bounds_valid);
324 Assert(insertstate->postingoff == 0);
325
327 {
328 /* Simulate a _bt_getbuf() call with conditional locking */
330 if (_bt_conditionallockbuf(rel, insertstate->buf))
331 {
332 Page page;
333 BTPageOpaque opaque;
334
335 _bt_checkpage(rel, insertstate->buf);
336 page = BufferGetPage(insertstate->buf);
337 opaque = BTPageGetOpaque(page);
338
339 /*
340 * Check if the page is still the rightmost leaf page and has
341 * enough free space to accommodate the new tuple. Also check
342 * that the insertion scan key is strictly greater than the first
343 * non-pivot tuple on the page. (Note that we expect itup_key's
344 * scantid to be unset when our caller is a checkingunique
345 * inserter.)
346 */
347 if (P_RIGHTMOST(opaque) &&
348 P_ISLEAF(opaque) &&
349 !P_IGNORE(opaque) &&
350 PageGetFreeSpace(page) > insertstate->itemsz &&
352 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
353 {
354 /*
355 * Caller can use the fastpath optimization because cached
356 * block is still rightmost leaf page, which can fit caller's
357 * new tuple without splitting. Keep block in local cache for
358 * next insert, and have caller use NULL stack.
359 *
360 * Note that _bt_insert_parent() has an assertion that catches
361 * leaf page splits that somehow follow from a fastpath insert
362 * (it should only be passed a NULL stack when it must deal
363 * with a concurrent root page split, and never because a NULL
364 * stack was returned here).
365 */
366 return NULL;
367 }
368
369 /* Page unsuitable for caller, drop lock and pin */
370 _bt_relbuf(rel, insertstate->buf);
371 }
372 else
373 {
374 /* Lock unavailable, drop pin */
376 }
377
378 /* Forget block, since cache doesn't appear to be useful */
380 }
381
382 /* Cannot use optimization -- descend tree, return proper descent stack */
383 return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
384 BT_WRITE, true);
385}
386
387/*
388 * _bt_check_unique() -- Check for violation of unique index constraint
389 *
390 * Returns InvalidTransactionId if there is no conflict, else an xact ID
391 * we must wait for to see if it commits a conflicting tuple. If an actual
392 * conflict is detected, no return --- just ereport(). If an xact ID is
393 * returned, and the conflicting tuple still has a speculative insertion in
394 * progress, *speculativeToken is set to non-zero, and the caller can wait for
395 * the verdict on the insertion using SpeculativeInsertionWait().
396 *
397 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
398 * InvalidTransactionId because we don't want to wait. In this case we
399 * set *is_unique to false if there is a potential conflict, and the
400 * core code must redo the uniqueness check later.
401 *
402 * As a side-effect, sets state in insertstate that can later be used by
403 * _bt_findinsertloc() to reuse most of the binary search work we do
404 * here.
405 *
406 * This code treats NULLs as equal, unlike the default semantics for unique
407 * indexes. So do not call here when there are NULL values in scan key and
408 * the index uses the default NULLS DISTINCT mode.
409 */
410static TransactionId
413 uint32 *speculativeToken)
414{
415 IndexTuple itup = insertstate->itup;
418 BTScanInsert itup_key = insertstate->itup_key;
420 OffsetNumber offset;
421 OffsetNumber maxoff;
422 Page page;
423 BTPageOpaque opaque;
425 bool found = false;
426 bool inposting = false;
427 bool prevalldead = true;
428 int curposti = 0;
429
430 /* Assume unique until we find a duplicate */
431 *is_unique = true;
432
434
435 page = BufferGetPage(insertstate->buf);
436 opaque = BTPageGetOpaque(page);
437 maxoff = PageGetMaxOffsetNumber(page);
438
439 /*
440 * Find the first tuple with the same key.
441 *
442 * This also saves the binary search bounds in insertstate. We use them
443 * in the fastpath below, but also in the _bt_findinsertloc() call later.
444 */
445 Assert(!insertstate->bounds_valid);
446 offset = _bt_binsrch_insert(rel, insertstate);
447
448 /*
449 * Scan over all equal tuples, looking for live conflicts.
450 */
451 Assert(!insertstate->bounds_valid || insertstate->low == offset);
452 Assert(!itup_key->anynullkeys);
453 Assert(itup_key->scantid == NULL);
454 for (;;)
455 {
456 /*
457 * Each iteration of the loop processes one heap TID, not one index
458 * tuple. Current offset number for page isn't usually advanced on
459 * iterations that process heap TIDs from posting list tuples.
460 *
461 * "inposting" state is set when _inside_ a posting list --- not when
462 * we're at the start (or end) of a posting list. We advance curposti
463 * at the end of the iteration when inside a posting list tuple. In
464 * general, every loop iteration either advances the page offset or
465 * advances curposti --- an iteration that handles the rightmost/max
466 * heap TID in a posting list finally advances the page offset (and
467 * unsets "inposting").
468 *
469 * Make sure the offset points to an actual index tuple before trying
470 * to examine it...
471 */
472 if (offset <= maxoff)
473 {
474 /*
475 * Fastpath: In most cases, we can use cached search bounds to
476 * limit our consideration to items that are definitely
477 * duplicates. This fastpath doesn't apply when the original page
478 * is empty, or when initial offset is past the end of the
479 * original page, which may indicate that we need to examine a
480 * second or subsequent page.
481 *
482 * Note that this optimization allows us to avoid calling
483 * _bt_compare() directly when there are no duplicates, as long as
484 * the offset where the key will go is not at the end of the page.
485 */
486 if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
487 {
488 Assert(insertstate->bounds_valid);
489 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
490 Assert(insertstate->low <= insertstate->stricthigh);
491 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
492 break;
493 }
494
495 /*
496 * We can skip items that are already marked killed.
497 *
498 * In the presence of heavy update activity an index may contain
499 * many killed items with the same key; running _bt_compare() on
500 * each killed item gets expensive. Just advance over killed
501 * items as quickly as we can. We only apply _bt_compare() when
502 * we get to a non-killed item. We could reuse the bounds to
503 * avoid _bt_compare() calls for known equal tuples, but it
504 * doesn't seem worth it.
505 */
506 if (!inposting)
507 curitemid = PageGetItemId(page, offset);
509 {
511 bool all_dead = false;
512
513 if (!inposting)
514 {
515 /* Plain tuple, or first TID in posting list tuple */
516 if (_bt_compare(rel, itup_key, page, offset) != 0)
517 break; /* we're past all the equal tuples */
518
519 /* Advanced curitup */
522 }
523
524 /* okay, we gotta fetch the heap tuple using htid ... */
526 {
527 /* ... htid is from simple non-pivot tuple */
529 htid = curitup->t_tid;
530 }
531 else if (!inposting)
532 {
533 /* ... htid is first TID in new posting list */
534 inposting = true;
535 prevalldead = true;
536 curposti = 0;
538 }
539 else
540 {
541 /* ... htid is second or subsequent TID in posting list */
542 Assert(curposti > 0);
544 }
545
546 /*
547 * If we are doing a recheck, we expect to find the tuple we
548 * are rechecking. It's not a duplicate, but we have to keep
549 * scanning.
550 */
552 ItemPointerCompare(&htid, &itup->t_tid) == 0)
553 {
554 found = true;
555 }
556
557 /*
558 * Check if there's any table tuples for this index entry
559 * satisfying SnapshotDirty. This is necessary because for AMs
560 * with optimizations like heap's HOT, we have just a single
561 * index entry for the entire chain.
562 */
563 else if (table_index_fetch_tuple_check(heapRel, &htid,
565 &all_dead))
566 {
568
569 /*
570 * It is a duplicate. If we are only doing a partial
571 * check, then don't bother checking if the tuple is being
572 * updated in another transaction. Just return the fact
573 * that it is a potential conflict and leave the full
574 * check till later. Don't invalidate binary search
575 * bounds.
576 */
578 {
579 if (nbuf != InvalidBuffer)
580 _bt_relbuf(rel, nbuf);
581 *is_unique = false;
583 }
584
585 /*
586 * If this tuple is being updated by other transaction
587 * then we have to wait for its commit/abort.
588 */
590 SnapshotDirty.xmin : SnapshotDirty.xmax;
591
593 {
594 if (nbuf != InvalidBuffer)
595 _bt_relbuf(rel, nbuf);
596 /* Tell _bt_doinsert to wait... */
597 *speculativeToken = SnapshotDirty.speculativeToken;
598 /* Caller releases lock on buf immediately */
599 insertstate->bounds_valid = false;
600 return xwait;
601 }
602
603 /*
604 * Otherwise we have a definite conflict. But before
605 * complaining, look to see if the tuple we want to insert
606 * is itself now committed dead --- if so, don't complain.
607 * This is a waste of time in normal scenarios but we must
608 * do it to support CREATE INDEX CONCURRENTLY.
609 *
610 * We must follow HOT-chains here because during
611 * concurrent index build, we insert the root TID though
612 * the actual tuple may be somewhere in the HOT-chain.
613 * While following the chain we might not stop at the
614 * exact tuple which triggered the insert, but that's OK
615 * because if we find a live tuple anywhere in this chain,
616 * we have a unique key conflict. The other live tuple is
617 * not part of this chain because it had a different index
618 * entry.
619 */
620 htid = itup->t_tid;
623 {
624 /* Normal case --- it's still live */
625 }
626 else
627 {
628 /*
629 * It's been deleted, so no error, and no need to
630 * continue searching
631 */
632 break;
633 }
634
635 /*
636 * Check for a conflict-in as we would if we were going to
637 * write to this page. We aren't actually going to write,
638 * but we want a chance to report SSI conflicts that would
639 * otherwise be masked by this unique constraint
640 * violation.
641 */
643
644 /*
645 * This is a definite conflict. Break the tuple down into
646 * datums and report the error. But first, make sure we
647 * release the buffer locks we're holding ---
648 * BuildIndexValueDescription could make catalog accesses,
649 * which in the worst case might touch this same index and
650 * cause deadlocks.
651 */
652 if (nbuf != InvalidBuffer)
653 _bt_relbuf(rel, nbuf);
654 _bt_relbuf(rel, insertstate->buf);
656 insertstate->bounds_valid = false;
657
658 {
660 bool isnull[INDEX_MAX_KEYS];
661 char *key_desc;
662
664 values, isnull);
665
667 isnull);
668
671 errmsg("duplicate key value violates unique constraint \"%s\"",
673 key_desc ? errdetail("Key %s already exists.",
674 key_desc) : 0,
675 errtableconstraint(heapRel,
677 }
678 }
679 else if (all_dead && (!inposting ||
680 (prevalldead &&
682 {
683 /*
684 * The conflicting tuple (or all HOT chains pointed to by
685 * all posting list TIDs) is dead to everyone, so try to
686 * mark the index entry killed. It's ok if we're not
687 * allowed to, this isn't required for correctness.
688 */
689 Buffer buf;
690
691 /* Be sure to operate on the proper buffer */
692 if (nbuf != InvalidBuffer)
693 buf = nbuf;
694 else
695 buf = insertstate->buf;
696
697 /*
698 * Use the hint bit infrastructure to check if we can
699 * update the page while just holding a share lock.
700 *
701 * Can't use BufferSetHintBits16() here as we update two
702 * different locations.
703 */
705 {
707 opaque->btpo_flags |= BTP_HAS_GARBAGE;
708 BufferFinishSetHintBits(buf, true, true);
709 }
710 }
711
712 /*
713 * Remember if posting list tuple has even a single HOT chain
714 * whose members are not all dead
715 */
716 if (!all_dead && inposting)
717 prevalldead = false;
718 }
719 }
720
722 {
723 /* Advance to next TID in same posting list */
724 curposti++;
725 continue;
726 }
727 else if (offset < maxoff)
728 {
729 /* Advance to next tuple */
730 curposti = 0;
731 inposting = false;
732 offset = OffsetNumberNext(offset);
733 }
734 else
735 {
736 int highkeycmp;
737
738 /* If scankey == hikey we gotta check the next page too */
739 if (P_RIGHTMOST(opaque))
740 break;
741 highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
742 Assert(highkeycmp <= 0);
743 if (highkeycmp != 0)
744 break;
745 /* Advance to next non-dead page --- there must be one */
746 for (;;)
747 {
748 BlockNumber nblkno = opaque->btpo_next;
749
751 page = BufferGetPage(nbuf);
752 opaque = BTPageGetOpaque(page);
753 if (!P_IGNORE(opaque))
754 break;
755 if (P_RIGHTMOST(opaque))
756 elog(ERROR, "fell off the end of index \"%s\"",
758 }
759 /* Will also advance to next tuple */
760 curposti = 0;
761 inposting = false;
762 maxoff = PageGetMaxOffsetNumber(page);
763 offset = P_FIRSTDATAKEY(opaque);
764 /* Don't invalidate binary search bounds */
765 }
766 }
767
768 /*
769 * If we are doing a recheck then we should have found the tuple we are
770 * checking. Otherwise there's something very wrong --- probably, the
771 * index is on a non-immutable expression.
772 */
773 if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
776 errmsg("failed to re-find tuple within index \"%s\"",
778 errhint("This may be because of a non-immutable index expression."),
779 errtableconstraint(heapRel,
781
782 if (nbuf != InvalidBuffer)
783 _bt_relbuf(rel, nbuf);
784
786}
787
788
789/*
790 * _bt_findinsertloc() -- Finds an insert location for a tuple
791 *
792 * On entry, insertstate buffer contains the page the new tuple belongs
793 * on. It is exclusive-locked and pinned by the caller.
794 *
795 * If 'checkingunique' is true, the buffer on entry is the first page
796 * that contains duplicates of the new key. If there are duplicates on
797 * multiple pages, the correct insertion position might be some page to
798 * the right, rather than the first page. In that case, this function
799 * moves right to the correct target page.
800 *
801 * (In a !heapkeyspace index, there can be multiple pages with the same
802 * high key, where the new tuple could legitimately be placed on. In
803 * that case, the caller passes the first page containing duplicates,
804 * just like when checkingunique=true. If that page doesn't have enough
805 * room for the new tuple, this function moves right, trying to find a
806 * legal page that does.)
807 *
808 * If 'indexUnchanged' is true, this is for an UPDATE that didn't
809 * logically change the indexed value, but must nevertheless have a new
810 * entry to point to a successor version. This hint from the executor
811 * will influence our behavior when the page might have to be split and
812 * we must consider our options. Bottom-up index deletion can avoid
813 * pathological version-driven page splits, but we only want to go to the
814 * trouble of trying it when we already have moderate confidence that
815 * it's appropriate. The hint should not significantly affect our
816 * behavior over time unless practically all inserts on to the leaf page
817 * get the hint.
818 *
819 * On exit, insertstate buffer contains the chosen insertion page, and
820 * the offset within that page is returned. If _bt_findinsertloc needed
821 * to move right, the lock and pin on the original page are released, and
822 * the new buffer is exclusively locked and pinned instead.
823 *
824 * If insertstate contains cached binary search bounds, we will take
825 * advantage of them. This avoids repeating comparisons that we made in
826 * _bt_check_unique() already.
827 */
828static OffsetNumber
831 bool checkingunique,
832 bool indexUnchanged,
833 BTStack stack,
834 Relation heapRel)
835{
836 BTScanInsert itup_key = insertstate->itup_key;
837 Page page = BufferGetPage(insertstate->buf);
838 BTPageOpaque opaque;
839 OffsetNumber newitemoff;
840
841 opaque = BTPageGetOpaque(page);
842
843 /* Check 1/3 of a page restriction */
844 if (unlikely(insertstate->itemsz > BTMaxItemSize))
845 _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
846 insertstate->itup);
847
848 Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
849 Assert(!insertstate->bounds_valid || checkingunique);
850 Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
851 Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
852 Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
853
854 if (itup_key->heapkeyspace)
855 {
856 /* Keep track of whether checkingunique duplicate seen */
858
859 /*
860 * If we're inserting into a unique index, we may have to walk right
861 * through leaf pages to find the one leaf page that we must insert on
862 * to.
863 *
864 * This is needed for checkingunique callers because a scantid was not
865 * used when we called _bt_search(). scantid can only be set after
866 * _bt_check_unique() has checked for duplicates. The buffer
867 * initially stored in insertstate->buf has the page where the first
868 * duplicate key might be found, which isn't always the page that new
869 * tuple belongs on. The heap TID attribute for new tuple (scantid)
870 * could force us to insert on a sibling page, though that should be
871 * very rare in practice.
872 */
873 if (checkingunique)
874 {
875 if (insertstate->low < insertstate->stricthigh)
876 {
877 /* Encountered a duplicate in _bt_check_unique() */
878 Assert(insertstate->bounds_valid);
879 uniquedup = true;
880 }
881
882 for (;;)
883 {
884 /*
885 * Does the new tuple belong on this page?
886 *
887 * The earlier _bt_check_unique() call may well have
888 * established a strict upper bound on the offset for the new
889 * item. If it's not the last item of the page (i.e. if there
890 * is at least one tuple on the page that goes after the tuple
891 * we're inserting) then we know that the tuple belongs on
892 * this page. We can skip the high key check.
893 */
894 if (insertstate->bounds_valid &&
895 insertstate->low <= insertstate->stricthigh &&
896 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
897 break;
898
899 /* Test '<=', not '!=', since scantid is set now */
900 if (P_RIGHTMOST(opaque) ||
901 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
902 break;
903
904 _bt_stepright(rel, heapRel, insertstate, stack);
905 /* Update local state after stepping right */
906 page = BufferGetPage(insertstate->buf);
907 opaque = BTPageGetOpaque(page);
908 /* Assume duplicates (if checkingunique) */
909 uniquedup = true;
910 }
911 }
912
913 /*
914 * If the target page cannot fit newitem, try to avoid splitting the
915 * page on insert by performing deletion or deduplication now
916 */
917 if (PageGetFreeSpace(page) < insertstate->itemsz)
918 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
921 }
922 else
923 {
924 /*----------
925 * This is a !heapkeyspace (version 2 or 3) index. The current page
926 * is the first page that we could insert the new tuple to, but there
927 * may be other pages to the right that we could opt to use instead.
928 *
929 * If the new key is equal to one or more existing keys, we can
930 * legitimately place it anywhere in the series of equal keys. In
931 * fact, if the new key is equal to the page's "high key" we can place
932 * it on the next page. If it is equal to the high key, and there's
933 * not room to insert the new tuple on the current page without
934 * splitting, then we move right hoping to find more free space and
935 * avoid a split.
936 *
937 * Keep scanning right until we
938 * (a) find a page with enough free space,
939 * (b) reach the last page where the tuple can legally go, or
940 * (c) get tired of searching.
941 * (c) is not flippant; it is important because if there are many
942 * pages' worth of equal keys, it's better to split one of the early
943 * pages than to scan all the way to the end of the run of equal keys
944 * on every insert. We implement "get tired" as a random choice,
945 * since stopping after scanning a fixed number of pages wouldn't work
946 * well (we'd never reach the right-hand side of previously split
947 * pages). The probability of moving right is set at 0.99, which may
948 * seem too high to change the behavior much, but it does an excellent
949 * job of preventing O(N^2) behavior with many equal keys.
950 *----------
951 */
952 while (PageGetFreeSpace(page) < insertstate->itemsz)
953 {
954 /*
955 * Before considering moving right, see if we can obtain enough
956 * space by erasing LP_DEAD items
957 */
958 if (P_HAS_GARBAGE(opaque))
959 {
960 /* Perform simple deletion */
961 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
962 false, false, false);
963
964 if (PageGetFreeSpace(page) >= insertstate->itemsz)
965 break; /* OK, now we have enough space */
966 }
967
968 /*
969 * Nope, so check conditions (b) and (c) enumerated above
970 *
971 * The earlier _bt_check_unique() call may well have established a
972 * strict upper bound on the offset for the new item. If it's not
973 * the last item of the page (i.e. if there is at least one tuple
974 * on the page that's greater than the tuple we're inserting to)
975 * then we know that the tuple belongs on this page. We can skip
976 * the high key check.
977 */
978 if (insertstate->bounds_valid &&
979 insertstate->low <= insertstate->stricthigh &&
980 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
981 break;
982
983 if (P_RIGHTMOST(opaque) ||
984 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
986 break;
987
988 _bt_stepright(rel, heapRel, insertstate, stack);
989 /* Update local state after stepping right */
990 page = BufferGetPage(insertstate->buf);
991 opaque = BTPageGetOpaque(page);
992 }
993 }
994
995 /*
996 * We should now be on the correct page. Find the offset within the page
997 * for the new tuple. (Possibly reusing earlier search bounds.)
998 */
999 Assert(P_RIGHTMOST(opaque) ||
1000 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
1001
1002 newitemoff = _bt_binsrch_insert(rel, insertstate);
1003
1004 if (insertstate->postingoff == -1)
1005 {
1006 /*
1007 * There is an overlapping posting list tuple with its LP_DEAD bit
1008 * set. We don't want to unnecessarily unset its LP_DEAD bit while
1009 * performing a posting list split, so perform simple index tuple
1010 * deletion early.
1011 */
1012 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1013 false, false, false);
1014
1015 /*
1016 * Do new binary search. New insert location cannot overlap with any
1017 * posting list now.
1018 */
1019 Assert(!insertstate->bounds_valid);
1020 insertstate->postingoff = 0;
1021 newitemoff = _bt_binsrch_insert(rel, insertstate);
1022 Assert(insertstate->postingoff == 0);
1023 }
1024
1025 return newitemoff;
1026}
1027
1028/*
1029 * Step right to next non-dead page, during insertion.
1030 *
1031 * This is a bit more complicated than moving right in a search. We must
1032 * write-lock the target page before releasing write lock on current page;
1033 * else someone else's _bt_check_unique scan could fail to see our insertion.
1034 * Write locks on intermediate dead pages won't do because we don't know when
1035 * they will get de-linked from the tree.
1036 *
1037 * This is more aggressive than it needs to be for non-unique !heapkeyspace
1038 * indexes.
1039 */
1040static void
1042 BTStack stack)
1043{
1044 Page page;
1045 BTPageOpaque opaque;
1046 Buffer rbuf;
1048
1049 Assert(heaprel != NULL);
1050 page = BufferGetPage(insertstate->buf);
1051 opaque = BTPageGetOpaque(page);
1052
1054 rblkno = opaque->btpo_next;
1055 for (;;)
1056 {
1058 page = BufferGetPage(rbuf);
1059 opaque = BTPageGetOpaque(page);
1060
1061 /*
1062 * If this page was incompletely split, finish the split now. We do
1063 * this while holding a lock on the left sibling, which is not good
1064 * because finishing the split could be a fairly lengthy operation.
1065 * But this should happen very seldom.
1066 */
1067 if (P_INCOMPLETE_SPLIT(opaque))
1068 {
1069 _bt_finish_split(rel, heaprel, rbuf, stack);
1071 continue;
1072 }
1073
1074 if (!P_IGNORE(opaque))
1075 break;
1076 if (P_RIGHTMOST(opaque))
1077 elog(ERROR, "fell off the end of index \"%s\"",
1079
1080 rblkno = opaque->btpo_next;
1081 }
1082 /* rbuf locked; unlock buf, update state for caller */
1083 _bt_relbuf(rel, insertstate->buf);
1084 insertstate->buf = rbuf;
1085 insertstate->bounds_valid = false;
1086}
1087
1088/*----------
1089 * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1090 *
1091 * This recursive procedure does the following things:
1092 *
1093 * + if postingoff != 0, splits existing posting list tuple
1094 * (since it overlaps with new 'itup' tuple).
1095 * + if necessary, splits the target page, using 'itup_key' for
1096 * suffix truncation on leaf pages (caller passes NULL for
1097 * non-leaf pages).
1098 * + inserts the new tuple (might be split from posting list).
1099 * + if the page was split, pops the parent stack, and finds the
1100 * right place to insert the new child pointer (by walking
1101 * right using information stored in the parent stack).
1102 * + invokes itself with the appropriate tuple for the right
1103 * child page on the parent.
1104 * + updates the metapage if a true root or fast root is split.
1105 *
1106 * On entry, we must have the correct buffer in which to do the
1107 * insertion, and the buffer must be pinned and write-locked. On return,
1108 * we will have dropped both the pin and the lock on the buffer.
1109 *
1110 * This routine only performs retail tuple insertions. 'itup' should
1111 * always be either a non-highkey leaf item, or a downlink (new high
1112 * key items are created indirectly, when a page is split). When
1113 * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1114 * we're inserting the downlink for. This function will clear the
1115 * INCOMPLETE_SPLIT flag on it, and release the buffer.
1116 *----------
1117 */
1118static void
1120 Relation heaprel,
1121 BTScanInsert itup_key,
1122 Buffer buf,
1123 Buffer cbuf,
1124 BTStack stack,
1125 IndexTuple itup,
1126 Size itemsz,
1127 OffsetNumber newitemoff,
1128 int postingoff,
1129 bool split_only_page)
1130{
1131 Page page;
1132 BTPageOpaque opaque;
1133 bool isleaf,
1134 isroot,
1136 isonly;
1141
1142 page = BufferGetPage(buf);
1143 opaque = BTPageGetOpaque(page);
1144 isleaf = P_ISLEAF(opaque);
1145 isroot = P_ISROOT(opaque);
1146 isrightmost = P_RIGHTMOST(opaque);
1147 isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1148
1149 /* child buffer must be given iff inserting on an internal page */
1151 /* tuple must have appropriate number of attributes */
1152 Assert(!isleaf ||
1153 BTreeTupleGetNAtts(itup, rel) ==
1155 Assert(isleaf ||
1156 BTreeTupleGetNAtts(itup, rel) <=
1159 Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1160 /* Caller must always finish incomplete split for us */
1161 Assert(!P_INCOMPLETE_SPLIT(opaque));
1162
1163 /*
1164 * Every internal page should have exactly one negative infinity item at
1165 * all times. Only _bt_split() and _bt_newlevel() should add items that
1166 * become negative infinity items through truncation, since they're the
1167 * only routines that allocate new internal pages.
1168 */
1169 Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1170
1171 /*
1172 * Do we need to split an existing posting list item?
1173 */
1174 if (postingoff != 0)
1175 {
1176 ItemId itemid = PageGetItemId(page, newitemoff);
1177
1178 /*
1179 * The new tuple is a duplicate with a heap TID that falls inside the
1180 * range of an existing posting list tuple on a leaf page. Prepare to
1181 * split an existing posting list. Overwriting the posting list with
1182 * its post-split version is treated as an extra step in either the
1183 * insert or page split critical section.
1184 */
1185 Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1186 oposting = (IndexTuple) PageGetItem(page, itemid);
1187
1188 /*
1189 * postingoff value comes from earlier call to _bt_binsrch_posting().
1190 * Its binary search might think that a plain tuple must be a posting
1191 * list tuple that needs to be split. This can happen with corruption
1192 * involving an existing plain tuple that is a duplicate of the new
1193 * item, up to and including its table TID. Check for that here in
1194 * passing.
1195 *
1196 * Also verify that our caller has made sure that the existing posting
1197 * list tuple does not have its LP_DEAD bit set.
1198 */
1199 if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1200 ereport(ERROR,
1202 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1205 newitemoff, BufferGetBlockNumber(buf),
1207
1208 /* use a mutable copy of itup as our itup from here on */
1209 origitup = itup;
1210 itup = CopyIndexTuple(origitup);
1211 nposting = _bt_swap_posting(itup, oposting, postingoff);
1212 /* itup now contains rightmost/max TID from oposting */
1213
1214 /* Alter offset so that newitem goes after posting list */
1215 newitemoff = OffsetNumberNext(newitemoff);
1216 }
1217
1218 /*
1219 * Do we need to split the page to fit the item on it?
1220 *
1221 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1222 * so this comparison is correct even though we appear to be accounting
1223 * only for the item and not for its line pointer.
1224 */
1225 if (PageGetFreeSpace(page) < itemsz)
1226 {
1227 Buffer rbuf;
1228
1230
1231 /* split the buffer into left and right halves */
1232 rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1233 itup, origitup, nposting, postingoff);
1237
1238 /*----------
1239 * By here,
1240 *
1241 * + our target page has been split;
1242 * + the original tuple has been inserted;
1243 * + we have write locks on both the old (left half)
1244 * and new (right half) buffers, after the split; and
1245 * + we know the key we want to insert into the parent
1246 * (it's the "high key" on the left child page).
1247 *
1248 * We're ready to do the parent insertion. We need to hold onto the
1249 * locks for the child pages until we locate the parent, but we can
1250 * at least release the lock on the right child before doing the
1251 * actual insertion. The lock on the left child will be released
1252 * last of all by parent insertion, where it is the 'cbuf' of parent
1253 * page.
1254 *----------
1255 */
1256#ifdef USE_INJECTION_POINTS
1257 if (P_ISLEAF(opaque))
1258 INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
1259 else
1260 INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
1261#endif
1262
1263 _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1264 }
1265 else
1266 {
1268 Page metapg = NULL;
1271
1272 /*
1273 * If we are doing this insert because we split a page that was the
1274 * only one on its tree level, but was not the root, it may have been
1275 * the "fast root". We need to ensure that the fast root link points
1276 * at or above the current page. We can safely acquire a lock on the
1277 * metapage here --- see comments for _bt_newlevel().
1278 */
1280 {
1281 Assert(!isleaf);
1283
1287
1288 if (metad->btm_fastlevel >= opaque->btpo_level)
1289 {
1290 /* no update wanted */
1291 _bt_relbuf(rel, metabuf);
1293 }
1294 }
1295
1296 /* Do the update. No ereport(ERROR) until changes are logged */
1298
1299 if (postingoff != 0)
1301
1302 if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
1303 elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1305
1307
1309 {
1310 /* upgrade meta-page if needed */
1311 if (metad->btm_version < BTREE_NOVAC_VERSION)
1313 metad->btm_fastroot = BufferGetBlockNumber(buf);
1314 metad->btm_fastlevel = opaque->btpo_level;
1316 }
1317
1318 /*
1319 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1320 * finishes a split
1321 */
1322 if (!isleaf)
1323 {
1326
1328 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1330 }
1331
1332 /* XLOG stuff */
1333 if (RelationNeedsWAL(rel))
1334 {
1337 uint8 xlinfo;
1339
1340 xlrec.offnum = newitemoff;
1341
1344
1345 if (isleaf && postingoff == 0)
1346 {
1347 /* Simple leaf insert */
1349 }
1350 else if (postingoff != 0)
1351 {
1352 /*
1353 * Leaf insert with posting list split. Must include
1354 * postingoff field before newitem/orignewitem.
1355 */
1356 Assert(isleaf);
1358 }
1359 else
1360 {
1361 /* Internal page insert, which finishes a split on cbuf */
1364
1366 {
1367 /* Actually, it's an internal page insert + meta update */
1369
1370 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1371 xlmeta.version = metad->btm_version;
1372 xlmeta.root = metad->btm_root;
1373 xlmeta.level = metad->btm_level;
1374 xlmeta.fastroot = metad->btm_fastroot;
1375 xlmeta.fastlevel = metad->btm_fastlevel;
1376 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1377 xlmeta.allequalimage = metad->btm_allequalimage;
1378
1382 sizeof(xl_btree_metadata));
1383 }
1384 }
1385
1387 if (postingoff == 0)
1388 {
1389 /* Just log itup from caller */
1390 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1391 }
1392 else
1393 {
1394 /*
1395 * Insert with posting list split (XLOG_BTREE_INSERT_POST
1396 * record) case.
1397 *
1398 * Log postingoff. Also log origitup, not itup. REDO routine
1399 * must reconstruct final itup (as well as nposting) using
1400 * _bt_swap_posting().
1401 */
1402 upostingoff = postingoff;
1403
1407 }
1408
1410 }
1411 else
1412 recptr = XLogGetFakeLSN(rel);
1413
1416 if (!isleaf)
1418
1419 PageSetLSN(page, recptr);
1420
1422
1423 /* Release subsidiary buffers */
1425 _bt_relbuf(rel, metabuf);
1426 if (!isleaf)
1427 _bt_relbuf(rel, cbuf);
1428
1429 /*
1430 * Cache the block number if this is the rightmost leaf page. Cache
1431 * may be used by a future inserter within _bt_search_insert().
1432 */
1434 if (isrightmost && isleaf && !isroot)
1436
1437 /* Release buffer for insertion target block */
1438 _bt_relbuf(rel, buf);
1439
1440 /*
1441 * If we decided to cache the insertion target block before releasing
1442 * its buffer lock, then cache it now. Check the height of the tree
1443 * first, though. We don't go for the optimization with small
1444 * indexes. Defer final check to this point to ensure that we don't
1445 * call _bt_getrootheight while holding a buffer lock.
1446 */
1450 }
1451
1452 /* be tidy */
1453 if (postingoff != 0)
1454 {
1455 /* itup is actually a modified copy of caller's original */
1456 pfree(nposting);
1457 pfree(itup);
1458 }
1459}
1460
1461/*
1462 * _bt_split() -- split a page in the btree.
1463 *
1464 * On entry, buf is the page to split, and is pinned and write-locked.
1465 * newitemoff etc. tell us about the new item that must be inserted
1466 * along with the data from the original page.
1467 *
1468 * itup_key is used for suffix truncation on leaf pages (internal
1469 * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1470 * is the left-sibling of the page we're inserting the downlink for.
1471 * This function will clear the INCOMPLETE_SPLIT flag on it, and
1472 * release the buffer.
1473 *
1474 * orignewitem, nposting, and postingoff are needed when an insert of
1475 * orignewitem results in both a posting list split and a page split.
1476 * These extra posting list split details are used here in the same
1477 * way as they are used in the more common case where a posting list
1478 * split does not coincide with a page split. We need to deal with
1479 * posting list splits directly in order to ensure that everything
1480 * that follows from the insert of orignewitem is handled as a single
1481 * atomic operation (though caller's insert of a new pivot/downlink
1482 * into parent page will still be a separate operation). See
1483 * nbtree/README for details on the design of posting list splits.
1484 *
1485 * Returns the new right sibling of buf, pinned and write-locked.
1486 * The pin and lock on buf are maintained.
1487 */
1488static Buffer
1490 Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1492{
1493 Buffer rbuf;
1494 Page origpage;
1495 Page leftpage,
1496 rightpage;
1502 lopaque,
1503 oopaque;
1505 Page spage = NULL;
1507 Size itemsz;
1508 ItemId itemid;
1511 OffsetNumber firstrightoff;
1516 OffsetNumber maxoff;
1518 bool newitemonleft,
1519 isleaf,
1522
1523 /*
1524 * origpage is the original page to be split. leftpage is a temporary
1525 * buffer that receives the left-sibling data, which will be copied back
1526 * into origpage on success. rightpage is the new page that will receive
1527 * the right-sibling data.
1528 *
1529 * leftpage is allocated after choosing a split point. rightpage's new
1530 * buffer isn't acquired until after leftpage is initialized and has new
1531 * high key, the last point where splitting the page may fail (barring
1532 * corruption). Failing before acquiring new buffer won't have lasting
1533 * consequences, since origpage won't have been modified and leftpage is
1534 * only workspace.
1535 */
1536 origpage = BufferGetPage(buf);
1537 oopaque = BTPageGetOpaque(origpage);
1540 maxoff = PageGetMaxOffsetNumber(origpage);
1542
1543 /*
1544 * Choose a point to split origpage at.
1545 *
1546 * A split point can be thought of as a point _between_ two existing data
1547 * items on origpage (the lastleft and firstright tuples), provided you
1548 * pretend that the new item that didn't fit is already on origpage.
1549 *
1550 * Since origpage does not actually contain newitem, the representation of
1551 * split points needs to work with two boundary cases: splits where
1552 * newitem is lastleft, and splits where newitem is firstright.
1553 * newitemonleft resolves the ambiguity that would otherwise exist when
1554 * newitemoff == firstrightoff. In all other cases it's clear which side
1555 * of the split every tuple goes on from context. newitemonleft is
1556 * usually (but not always) redundant information.
1557 *
1558 * firstrightoff is supposed to be an origpage offset number, but it's
1559 * possible that its value will be maxoff+1, which is "past the end" of
1560 * origpage. This happens in the rare case where newitem goes after all
1561 * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1562 * origpage at the point that leaves newitem alone on new right page. Any
1563 * "!newitemonleft && newitemoff == firstrightoff" split point makes
1564 * newitem the firstright tuple, though, so this case isn't a special
1565 * case.
1566 */
1567 firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1568 newitem, &newitemonleft);
1569
1570 /* Use temporary buffer for leftpage */
1571 leftpage = leftpage_buf.data;
1574
1575 /*
1576 * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1577 * and HAS_GARBAGE flags.
1578 */
1579 lopaque->btpo_flags = oopaque->btpo_flags;
1580 lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1581 /* set flag in leftpage indicating that rightpage has no downlink yet */
1582 lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1583 lopaque->btpo_prev = oopaque->btpo_prev;
1584 /* handle btpo_next after rightpage buffer acquired */
1585 lopaque->btpo_level = oopaque->btpo_level;
1586 /* handle btpo_cycleid after rightpage buffer acquired */
1587
1588 /*
1589 * Copy the original page's LSN into leftpage, which will become the
1590 * updated version of the page. We need this because XLogInsert will
1591 * examine the LSN and possibly dump it in a page image.
1592 */
1593 PageSetLSN(leftpage, PageGetLSN(origpage));
1594
1595 /*
1596 * Determine page offset number of existing overlapped-with-orignewitem
1597 * posting list when it is necessary to perform a posting list split in
1598 * passing. Note that newitem was already changed by caller (newitem no
1599 * longer has the orignewitem TID).
1600 *
1601 * This page offset number (origpagepostingoff) will be used to pretend
1602 * that the posting split has already taken place, even though the
1603 * required modifications to origpage won't occur until we reach the
1604 * critical section. The lastleft and firstright tuples of our page split
1605 * point should, in effect, come from an imaginary version of origpage
1606 * that has the nposting tuple instead of the original posting list tuple.
1607 *
1608 * Note: _bt_findsplitloc() should have compensated for coinciding posting
1609 * list splits in just the same way, at least in theory. It doesn't
1610 * bother with that, though. In practice it won't affect its choice of
1611 * split point.
1612 */
1614 if (postingoff != 0)
1615 {
1616 Assert(isleaf);
1618 &newitem->t_tid) < 0);
1621 }
1622
1623 /*
1624 * The high key for the new left page is a possibly-truncated copy of
1625 * firstright on the leaf level (it's "firstright itself" on internal
1626 * pages; see !isleaf comments below). This may seem to be contrary to
1627 * Lehman & Yao's approach of using a copy of lastleft as the new high key
1628 * when splitting on the leaf level. It isn't, though.
1629 *
1630 * Suffix truncation will leave the left page's high key fully equal to
1631 * lastleft when lastleft and firstright are equal prior to heap TID (that
1632 * is, the tiebreaker TID value comes from lastleft). It isn't actually
1633 * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1634 * "subtree" invariant to hold. It's sufficient to make sure that the new
1635 * leaf high key is strictly less than firstright, and greater than or
1636 * equal to (not necessarily equal to) lastleft. In other words, when
1637 * suffix truncation isn't possible during a leaf page split, we take
1638 * L&Y's exact approach to generating a new high key for the left page.
1639 * (Actually, that is slightly inaccurate. We don't just use a copy of
1640 * lastleft. A tuple with all the keys from firstright but the max heap
1641 * TID from lastleft is used, to avoid introducing a special case.)
1642 */
1643 if (!newitemonleft && newitemoff == firstrightoff)
1644 {
1645 /* incoming tuple becomes firstright */
1646 itemsz = newitemsz;
1647 firstright = newitem;
1648 }
1649 else
1650 {
1651 /* existing item at firstrightoff becomes firstright */
1652 itemid = PageGetItemId(origpage, firstrightoff);
1653 itemsz = ItemIdGetLength(itemid);
1654 firstright = (IndexTuple) PageGetItem(origpage, itemid);
1655 if (firstrightoff == origpagepostingoff)
1657 }
1658
1659 if (isleaf)
1660 {
1661 IndexTuple lastleft;
1662
1663 /* Attempt suffix truncation for leaf page splits */
1664 if (newitemonleft && newitemoff == firstrightoff)
1665 {
1666 /* incoming tuple becomes lastleft */
1667 lastleft = newitem;
1668 }
1669 else
1670 {
1672
1673 /* existing item before firstrightoff becomes lastleft */
1674 lastleftoff = OffsetNumberPrev(firstrightoff);
1676 itemid = PageGetItemId(origpage, lastleftoff);
1677 lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1679 lastleft = nposting;
1680 }
1681
1682 lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1683 itemsz = IndexTupleSize(lefthighkey);
1684 }
1685 else
1686 {
1687 /*
1688 * Don't perform suffix truncation on a copy of firstright to make
1689 * left page high key for internal page splits. Must use firstright
1690 * as new high key directly.
1691 *
1692 * Each distinct separator key value originates as a leaf level high
1693 * key; all other separator keys/pivot tuples are copied from one
1694 * level down. A separator key in a grandparent page must be
1695 * identical to high key in rightmost parent page of the subtree to
1696 * its left, which must itself be identical to high key in rightmost
1697 * child page of that same subtree (this even applies to separator
1698 * from grandparent's high key). There must always be an unbroken
1699 * "seam" of identical separator keys that guide index scans at every
1700 * level, starting from the grandparent. That's why suffix truncation
1701 * is unsafe here.
1702 *
1703 * Internal page splits will truncate firstright into a "negative
1704 * infinity" data item when it gets inserted on the new right page
1705 * below, though. This happens during the call to _bt_pgaddtup() for
1706 * the new first data item for right page. Do not confuse this
1707 * mechanism with suffix truncation. It is just a convenient way of
1708 * implementing page splits that split the internal page "inside"
1709 * firstright. The lefthighkey separator key cannot appear a second
1710 * time in the right page (only firstright's downlink goes in right
1711 * page).
1712 */
1714 }
1715
1716 /*
1717 * Add new high key to leftpage
1718 */
1720
1725 if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
1726 elog(ERROR, "failed to add high key to the left sibling"
1727 " while splitting block %u of index \"%s\"",
1730
1731 /*
1732 * Acquire a new right page to split into, now that left page has a new
1733 * high key.
1734 *
1735 * To not confuse future VACUUM operations, we zero the right page and
1736 * work on an in-memory copy of it before writing WAL, then copy its
1737 * contents back to the actual page once we start the critical section
1738 * work. This simplifies the split work, so as there is no need to zero
1739 * the right page before throwing an error.
1740 */
1741 rbuf = _bt_allocbuf(rel, heaprel);
1742 rightpage = rightpage_buf.data;
1743
1744 /*
1745 * Copy the contents of the right page into its temporary location, and
1746 * zero the original space.
1747 */
1751 /* rightpage was initialized by _bt_allocbuf */
1753
1754 /*
1755 * Finish off remaining leftpage special area fields. They cannot be set
1756 * before both origpage (leftpage) and rightpage buffers are acquired and
1757 * locked.
1758 *
1759 * btpo_cycleid is only used with leaf pages, though we set it here in all
1760 * cases just to be consistent.
1761 */
1762 lopaque->btpo_next = rightpagenumber;
1763 lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1764
1765 /*
1766 * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1767 * and HAS_GARBAGE flags.
1768 */
1769 ropaque->btpo_flags = oopaque->btpo_flags;
1770 ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1771 ropaque->btpo_prev = origpagenumber;
1772 ropaque->btpo_next = oopaque->btpo_next;
1773 ropaque->btpo_level = oopaque->btpo_level;
1774 ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1775
1776 /*
1777 * Add new high key to rightpage where necessary.
1778 *
1779 * If the page we're splitting is not the rightmost page at its level in
1780 * the tree, then the first entry on the page is the high key from
1781 * origpage.
1782 */
1784
1785 if (!isrightmost)
1786 {
1788
1789 itemid = PageGetItemId(origpage, P_HIKEY);
1790 itemsz = ItemIdGetLength(itemid);
1791 righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1795 if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
1796 {
1797 elog(ERROR, "failed to add high key to the right sibling"
1798 " while splitting block %u of index \"%s\"",
1800 }
1802 }
1803
1804 /*
1805 * Internal page splits truncate first data item on right page -- it
1806 * becomes "minus infinity" item for the page. Set this up here.
1807 */
1809 if (!isleaf)
1811
1812 /*
1813 * Now transfer all the data items (non-pivot tuples in isleaf case, or
1814 * additional pivot tuples in !isleaf case) to the appropriate page.
1815 *
1816 * Note: we *must* insert at least the right page's items in item-number
1817 * order, for the benefit of _bt_restore_page().
1818 */
1819 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1820 {
1822
1823 itemid = PageGetItemId(origpage, i);
1824 itemsz = ItemIdGetLength(itemid);
1825 dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1826
1827 /* replace original item with nposting due to posting split? */
1828 if (i == origpagepostingoff)
1829 {
1833 }
1834
1835 /* does new item belong before this one? */
1836 else if (i == newitemoff)
1837 {
1838 if (newitemonleft)
1839 {
1840 Assert(newitemoff <= firstrightoff);
1841 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1842 false))
1843 {
1844 elog(ERROR, "failed to add new item to the left sibling"
1845 " while splitting block %u of index \"%s\"",
1847 }
1849 }
1850 else
1851 {
1852 Assert(newitemoff >= firstrightoff);
1853 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1855 {
1856 elog(ERROR, "failed to add new item to the right sibling"
1857 " while splitting block %u of index \"%s\"",
1859 }
1861 }
1862 }
1863
1864 /* decide which page to put it on */
1865 if (i < firstrightoff)
1866 {
1867 if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1868 {
1869 elog(ERROR, "failed to add old item to the left sibling"
1870 " while splitting block %u of index \"%s\"",
1872 }
1874 }
1875 else
1876 {
1879 {
1880 elog(ERROR, "failed to add old item to the right sibling"
1881 " while splitting block %u of index \"%s\"",
1883 }
1885 }
1886 }
1887
1888 /* Handle case where newitem goes at the end of rightpage */
1889 if (i <= newitemoff)
1890 {
1891 /*
1892 * Can't have newitemonleft here; that would imply we were told to put
1893 * *everything* on the left page, which cannot fit (if it could, we'd
1894 * not be splitting the page).
1895 */
1896 Assert(!newitemonleft && newitemoff == maxoff + 1);
1897 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1899 {
1900 elog(ERROR, "failed to add new item to the right sibling"
1901 " while splitting block %u of index \"%s\"",
1903 }
1905 }
1906
1907 /*
1908 * We have to grab the original right sibling (if any) and update its prev
1909 * link. We are guaranteed that this is deadlock-free, since we couple
1910 * the locks in the standard order: left to right.
1911 */
1912 if (!isrightmost)
1913 {
1914 sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1917 if (sopaque->btpo_prev != origpagenumber)
1918 {
1919 ereport(ERROR,
1921 errmsg_internal("right sibling's left-link doesn't match: "
1922 "block %u links to %u instead of expected %u in index \"%s\"",
1923 oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1925 }
1926
1927 /*
1928 * Check to see if we can set the SPLIT_END flag in the right-hand
1929 * split page; this can save some I/O for vacuum since it need not
1930 * proceed to the right sibling. We can set the flag if the right
1931 * sibling has a different cycleid: that means it could not be part of
1932 * a group of pages that were all split off from the same ancestor
1933 * page. If you're confused, imagine that page A splits to A B and
1934 * then again, yielding A C B, while vacuum is in progress. Tuples
1935 * originally in A could now be in either B or C, hence vacuum must
1936 * examine both pages. But if D, our right sibling, has a different
1937 * cycleid then it could not contain any tuples that were in A when
1938 * the vacuum started.
1939 */
1940 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1941 ropaque->btpo_flags |= BTP_SPLIT_END;
1942 }
1943
1944 /*
1945 * Right sibling is locked, new siblings are prepared, but original page
1946 * is not updated yet.
1947 *
1948 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1949 * not starting the critical section till here because we haven't been
1950 * scribbling on the original page yet; see comments above.
1951 */
1953
1954 /*
1955 * By here, the original data page has been split into two new halves, and
1956 * these are correct. The algorithm requires that the left page never
1957 * move during a split, so we copy the new left page back on top of the
1958 * original. We need to do this before writing the WAL record, so that
1959 * XLogInsert can WAL log an image of the page if necessary.
1960 */
1961 memcpy(origpage, leftpage, BLCKSZ);
1962 /* leftpage, lopaque must not be used below here */
1963
1964 /*
1965 * Move the contents of the right page from its temporary location to the
1966 * destination buffer, before writing the WAL record. Unlike the left
1967 * page, the right page and its opaque area are still needed to complete
1968 * the update of the page, so reinitialize them.
1969 */
1973
1976
1977 if (!isrightmost)
1978 {
1979 sopaque->btpo_prev = rightpagenumber;
1981 }
1982
1983 /*
1984 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1985 * a split
1986 */
1987 if (!isleaf)
1988 {
1991
1992 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1994 }
1995
1996 /* XLOG stuff */
1997 if (RelationNeedsWAL(rel))
1998 {
2000 uint8 xlinfo;
2001
2002 xlrec.level = ropaque->btpo_level;
2003 /* See comments below on newitem, orignewitem, and posting lists */
2004 xlrec.firstrightoff = firstrightoff;
2005 xlrec.newitemoff = newitemoff;
2006 xlrec.postingoff = 0;
2007 if (postingoff != 0 && origpagepostingoff < firstrightoff)
2008 xlrec.postingoff = postingoff;
2009
2012
2015 /* Log original right sibling, since we've changed its prev-pointer */
2016 if (!isrightmost)
2018 if (!isleaf)
2020
2021 /*
2022 * Log the new item, if it was inserted on the left page. (If it was
2023 * put on the right page, we don't need to explicitly WAL log it
2024 * because it's included with all the other items on the right page.)
2025 * Show the new item as belonging to the left page buffer, so that it
2026 * is not stored if XLogInsert decides it needs a full-page image of
2027 * the left page. We always store newitemoff in the record, though.
2028 *
2029 * The details are sometimes slightly different for page splits that
2030 * coincide with a posting list split. If both the replacement
2031 * posting list and newitem go on the right page, then we don't need
2032 * to log anything extra, just like the simple !newitemonleft
2033 * no-posting-split case (postingoff is set to zero in the WAL record,
2034 * so recovery doesn't need to process a posting list split at all).
2035 * Otherwise, we set postingoff and log orignewitem instead of
2036 * newitem, despite having actually inserted newitem. REDO routine
2037 * must reconstruct nposting and newitem using _bt_swap_posting().
2038 *
2039 * Note: It's possible that our page split point is the point that
2040 * makes the posting list lastleft and newitem firstright. This is
2041 * the only case where we log orignewitem/newitem despite newitem
2042 * going on the right page. If XLogInsert decides that it can omit
2043 * orignewitem due to logging a full-page image of the left page,
2044 * everything still works out, since recovery only needs to log
2045 * orignewitem for items on the left page (just like the regular
2046 * newitem-logged case).
2047 */
2048 if (newitemonleft && xlrec.postingoff == 0)
2049 XLogRegisterBufData(0, newitem, newitemsz);
2050 else if (xlrec.postingoff != 0)
2051 {
2052 Assert(isleaf);
2053 Assert(newitemonleft || firstrightoff == newitemoff);
2054 Assert(newitemsz == IndexTupleSize(orignewitem));
2055 XLogRegisterBufData(0, orignewitem, newitemsz);
2056 }
2057
2058 /* Log the left page's new high key */
2059 if (!isleaf)
2060 {
2061 /* lefthighkey isn't local copy, get current pointer */
2062 itemid = PageGetItemId(origpage, P_HIKEY);
2063 lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2064 }
2067
2068 /*
2069 * Log the contents of the right page in the format understood by
2070 * _bt_restore_page(). The whole right page will be recreated.
2071 *
2072 * Direct access to page is not good but faster - we should implement
2073 * some new func in page API. Note we only store the tuples
2074 * themselves, knowing that they were inserted in item-number order
2075 * and so the line pointers can be reconstructed. See comments for
2076 * _bt_restore_page().
2077 */
2079 (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2080 ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2081
2082 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2084 }
2085 else
2086 recptr = XLogGetFakeLSN(rel);
2087
2088 PageSetLSN(origpage, recptr);
2090 if (!isrightmost)
2092 if (!isleaf)
2094
2096
2097 /* release the old right sibling */
2098 if (!isrightmost)
2099 _bt_relbuf(rel, sbuf);
2100
2101 /* release the child */
2102 if (!isleaf)
2103 _bt_relbuf(rel, cbuf);
2104
2105 /* be tidy */
2106 if (isleaf)
2108
2109 /* split's done */
2110 return rbuf;
2111}
2112
2113/*
2114 * _bt_insert_parent() -- Insert downlink into parent, completing split.
2115 *
2116 * On entry, buf and rbuf are the left and right split pages, which we
2117 * still hold write locks on. Both locks will be released here. We
2118 * release the rbuf lock once we have a write lock on the page that we
2119 * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2120 * The lock on buf is released at the same point as the lock on the parent
2121 * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2122 * atomic operation that completes the split by inserting a new downlink.
2123 *
2124 * stack - stack showing how we got here. Will be NULL when splitting true
2125 * root, or during concurrent root split, where we can be inefficient
2126 * isroot - we split the true root
2127 * isonly - we split a page alone on its level (might have been fast root)
2128 */
2129static void
2131 Relation heaprel,
2132 Buffer buf,
2133 Buffer rbuf,
2134 BTStack stack,
2135 bool isroot,
2136 bool isonly)
2137{
2138 Assert(heaprel != NULL);
2139
2140 /*
2141 * Here we have to do something Lehman and Yao don't talk about: deal with
2142 * a root split and construction of a new root. If our stack is empty
2143 * then we have just split a node on what had been the root level when we
2144 * descended the tree. If it was still the root then we perform a
2145 * new-root construction. If it *wasn't* the root anymore, search to find
2146 * the next higher level that someone constructed meanwhile, and find the
2147 * right place to insert as for the normal case.
2148 *
2149 * If we have to search for the parent level, we do so by re-descending
2150 * from the root. This is not super-efficient, but it's rare enough not
2151 * to matter.
2152 */
2153 if (isroot)
2154 {
2156
2157 Assert(stack == NULL);
2158 Assert(isonly);
2159 /* create a new root node one level up and update the metapage */
2160 rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2161 /* release the split buffers */
2162 _bt_relbuf(rel, rootbuf);
2163 _bt_relbuf(rel, rbuf);
2164 _bt_relbuf(rel, buf);
2165 }
2166 else
2167 {
2170 Page page = BufferGetPage(buf);
2174 Buffer pbuf;
2175
2176 if (stack == NULL)
2177 {
2178 BTPageOpaque opaque;
2179
2180 elog(DEBUG2, "concurrent ROOT page split");
2181 opaque = BTPageGetOpaque(page);
2182
2183 /*
2184 * We should never reach here when a leaf page split takes place
2185 * despite the insert of newitem being able to apply the fastpath
2186 * optimization. Make sure of that with an assertion.
2187 *
2188 * This is more of a performance issue than a correctness issue.
2189 * The fastpath won't have a descent stack. Using a phony stack
2190 * here works, but never rely on that. The fastpath should be
2191 * rejected within _bt_search_insert() when the rightmost leaf
2192 * page will split, since it's faster to go through _bt_search()
2193 * and get a stack in the usual way.
2194 */
2195 Assert(!(P_ISLEAF(opaque) &&
2197
2198 /* Find the leftmost page at the next level up */
2199 pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2200 /* Set up a phony stack entry pointing there */
2201 stack = &fakestack;
2204 stack->bts_parent = NULL;
2205 _bt_relbuf(rel, pbuf);
2206 }
2207
2208 /* get high key from left, a strict lower bound for new right page */
2209 ritem = (IndexTuple) PageGetItem(page,
2210 PageGetItemId(page, P_HIKEY));
2211
2212 /* form an index tuple that points at the new right page */
2215
2216 /*
2217 * Re-find and write lock the parent of buf.
2218 *
2219 * It's possible that the location of buf's downlink has changed since
2220 * our initial _bt_search() descent. _bt_getstackbuf() will detect
2221 * and recover from this, updating the stack, which ensures that the
2222 * new downlink will be inserted at the correct offset. Even buf's
2223 * parent may have changed.
2224 */
2225 pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2226
2227 /*
2228 * Unlock the right child. The left child will be unlocked in
2229 * _bt_insertonpg().
2230 *
2231 * Unlocking the right child must be delayed until here to ensure that
2232 * no concurrent VACUUM operation can become confused. Page deletion
2233 * cannot be allowed to fail to re-find a downlink for the rbuf page.
2234 * (Actually, this is just a vestige of how things used to work. The
2235 * page deletion code is expected to check for the INCOMPLETE_SPLIT
2236 * flag on the left child. It won't attempt deletion of the right
2237 * child until the split is complete. Despite all this, we opt to
2238 * conservatively delay unlocking the right child until here.)
2239 */
2240 _bt_relbuf(rel, rbuf);
2241
2242 if (pbuf == InvalidBuffer)
2243 ereport(ERROR,
2245 errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2247
2248 /* Recursively insert into the parent */
2249 _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2251 stack->bts_offset + 1, 0, isonly);
2252
2253 /* be tidy */
2254 pfree(new_item);
2255 }
2256}
2257
2258/*
2259 * _bt_finish_split() -- Finish an incomplete split
2260 *
2261 * A crash or other failure can leave a split incomplete. The insertion
2262 * routines won't allow to insert on a page that is incompletely split.
2263 * Before inserting on such a page, call _bt_finish_split().
2264 *
2265 * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2266 * and unpinned.
2267 *
2268 * Caller must provide a valid heaprel, since finishing a page split requires
2269 * allocating a new page if and when the parent page splits in turn.
2270 */
2271void
2273{
2276 Buffer rbuf;
2277 Page rpage;
2279 bool wasroot;
2280 bool wasonly;
2281
2283 Assert(heaprel != NULL);
2284
2285 /* Lock right sibling, the one missing the downlink */
2286 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2289
2290 /* Could this be a root split? */
2291 if (!stack)
2292 {
2294 Page metapg;
2296
2297 /* acquire lock on the metapage */
2301
2302 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2303
2304 _bt_relbuf(rel, metabuf);
2305 }
2306 else
2307 wasroot = false;
2308
2309 /* Was this the only page on the level before split? */
2311
2312 INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
2313 elog(DEBUG1, "finishing incomplete split of %u/%u",
2315
2316 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2317}
2318
2319/*
2320 * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2321 * tuple whose downlink points to child page.
2322 *
2323 * Caller passes child's block number, which is used to identify
2324 * associated pivot tuple in parent page using a linear search that
2325 * matches on pivot's downlink/block number. The expected location of
2326 * the pivot tuple is taken from the stack one level above the child
2327 * page. This is used as a starting point. Insertions into the
2328 * parent level could cause the pivot tuple to move right; deletions
2329 * could cause it to move left, but not left of the page we previously
2330 * found it on.
2331 *
2332 * Caller can use its stack to relocate the pivot tuple/downlink for
2333 * any same-level page to the right of the page found by its initial
2334 * descent. This is necessary because of the possibility that caller
2335 * moved right to recover from a concurrent page split. It's also
2336 * convenient for certain callers to be able to step right when there
2337 * wasn't a concurrent page split, while still using their original
2338 * stack. For example, the checkingunique _bt_doinsert() case may
2339 * have to step right when there are many physical duplicates, and its
2340 * scantid forces an insertion to the right of the "first page the
2341 * value could be on". (This is also relied on by all of our callers
2342 * when dealing with !heapkeyspace indexes.)
2343 *
2344 * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2345 * tuple not found (should not happen). Adjusts bts_blkno &
2346 * bts_offset if changed. Page split caller should insert its new
2347 * pivot tuple for its new right sibling page on parent page, at the
2348 * offset number bts_offset + 1.
2349 */
2350Buffer
2352{
2353 BlockNumber blkno;
2355
2356 blkno = stack->bts_blkno;
2357 start = stack->bts_offset;
2358
2359 for (;;)
2360 {
2361 Buffer buf;
2362 Page page;
2363 BTPageOpaque opaque;
2364
2365 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2366 page = BufferGetPage(buf);
2367 opaque = BTPageGetOpaque(page);
2368
2369 Assert(heaprel != NULL);
2370 if (P_INCOMPLETE_SPLIT(opaque))
2371 {
2372 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2373 continue;
2374 }
2375
2376 if (!P_IGNORE(opaque))
2377 {
2378 OffsetNumber offnum,
2379 minoff,
2380 maxoff;
2381 ItemId itemid;
2382 IndexTuple item;
2383
2384 minoff = P_FIRSTDATAKEY(opaque);
2385 maxoff = PageGetMaxOffsetNumber(page);
2386
2387 /*
2388 * start = InvalidOffsetNumber means "search the whole page". We
2389 * need this test anyway due to possibility that page has a high
2390 * key now when it didn't before.
2391 */
2392 if (start < minoff)
2393 start = minoff;
2394
2395 /*
2396 * Need this check too, to guard against possibility that page
2397 * split since we visited it originally.
2398 */
2399 if (start > maxoff)
2400 start = OffsetNumberNext(maxoff);
2401
2402 /*
2403 * These loops will check every item on the page --- but in an
2404 * order that's attuned to the probability of where it actually
2405 * is. Scan to the right first, then to the left.
2406 */
2407 for (offnum = start;
2408 offnum <= maxoff;
2409 offnum = OffsetNumberNext(offnum))
2410 {
2411 itemid = PageGetItemId(page, offnum);
2412 item = (IndexTuple) PageGetItem(page, itemid);
2413
2414 if (BTreeTupleGetDownLink(item) == child)
2415 {
2416 /* Return accurate pointer to where link is now */
2417 stack->bts_blkno = blkno;
2418 stack->bts_offset = offnum;
2419 return buf;
2420 }
2421 }
2422
2423 for (offnum = OffsetNumberPrev(start);
2424 offnum >= minoff;
2425 offnum = OffsetNumberPrev(offnum))
2426 {
2427 itemid = PageGetItemId(page, offnum);
2428 item = (IndexTuple) PageGetItem(page, itemid);
2429
2430 if (BTreeTupleGetDownLink(item) == child)
2431 {
2432 /* Return accurate pointer to where link is now */
2433 stack->bts_blkno = blkno;
2434 stack->bts_offset = offnum;
2435 return buf;
2436 }
2437 }
2438 }
2439
2440 /*
2441 * The item we're looking for moved right at least one page.
2442 *
2443 * Lehman and Yao couple/chain locks when moving right here, which we
2444 * can avoid. See nbtree/README.
2445 */
2446 if (P_RIGHTMOST(opaque))
2447 {
2448 _bt_relbuf(rel, buf);
2449 return InvalidBuffer;
2450 }
2451 blkno = opaque->btpo_next;
2453 _bt_relbuf(rel, buf);
2454 }
2455}
2456
2457/*
2458 * _bt_freestack() -- free a retracement stack made by _bt_search_insert.
2459 */
2460static void
2462{
2464
2465 while (stack != NULL)
2466 {
2467 ostack = stack;
2468 stack = stack->bts_parent;
2469 pfree(ostack);
2470 }
2471}
2472
2473/*
2474 * _bt_newlevel() -- Create a new level above root page.
2475 *
2476 * We've just split the old root page and need to create a new one.
2477 * In order to do this, we add a new root page to the file, then lock
2478 * the metadata page and update it. This is guaranteed to be deadlock-
2479 * free, because all readers release their locks on the metadata page
2480 * before trying to lock the root, and all writers lock the root before
2481 * trying to lock the metadata page. We have a write lock on the old
2482 * root page, so we have not introduced any cycles into the waits-for
2483 * graph.
2484 *
2485 * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2486 * locked. On exit, a new root page exists with entries for the
2487 * two new children, metapage is updated and unlocked/unpinned.
2488 * The new root buffer is returned to caller which has to unlock/unpin
2489 * lbuf, rbuf & rootbuf.
2490 */
2491static Buffer
2493{
2495 Page lpage,
2496 rootpage;
2498 rbkno;
2502 ItemId itemid;
2503 IndexTuple item;
2509 Page metapg;
2512
2517
2518 /* get a new root page */
2519 rootbuf = _bt_allocbuf(rel, heaprel);
2522
2523 /* acquire lock on the metapage */
2527
2528 /*
2529 * Create downlink item for left page (old root). The key value used is
2530 * "minus infinity", a sentinel value that's reliably less than any real
2531 * key value that could appear in the left page.
2532 */
2533 left_item_sz = sizeof(IndexTupleData);
2535 left_item->t_info = left_item_sz;
2537 BTreeTupleSetNAtts(left_item, 0, false);
2538
2539 /*
2540 * Create downlink item for right page. The key for it is obtained from
2541 * the "high key" position in the left page.
2542 */
2543 itemid = PageGetItemId(lpage, P_HIKEY);
2545 item = (IndexTuple) PageGetItem(lpage, itemid);
2546 right_item = CopyIndexTuple(item);
2548
2549 /* NO EREPORT(ERROR) from here till newroot op is logged */
2551
2552 /* upgrade metapage if needed */
2553 if (metad->btm_version < BTREE_NOVAC_VERSION)
2555
2556 /* set btree special data */
2558 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2559 rootopaque->btpo_flags = BTP_ROOT;
2560 rootopaque->btpo_level =
2561 (BTPageGetOpaque(lpage))->btpo_level + 1;
2562 rootopaque->btpo_cycleid = 0;
2563
2564 /* update metapage data */
2565 metad->btm_root = rootblknum;
2566 metad->btm_level = rootopaque->btpo_level;
2567 metad->btm_fastroot = rootblknum;
2568 metad->btm_fastlevel = rootopaque->btpo_level;
2569
2570 /*
2571 * Insert the left page pointer into the new root page. The root page is
2572 * the rightmost page on its level so there is no "high key" in it; the
2573 * two items will go into positions P_HIKEY and P_FIRSTKEY.
2574 *
2575 * Note: we *must* insert the two items in item-number order, for the
2576 * benefit of _bt_restore_page().
2577 */
2580 elog(PANIC, "failed to add leftkey to new root page"
2581 " while splitting block %u of index \"%s\"",
2583
2584 /*
2585 * insert the right page pointer into the new root page.
2586 */
2591 elog(PANIC, "failed to add rightkey to new root page"
2592 " while splitting block %u of index \"%s\"",
2594
2595 /* Clear the incomplete-split flag in the left child */
2597 lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2599
2602
2603 /* XLOG stuff */
2604 if (RelationNeedsWAL(rel))
2605 {
2608
2609 xlrec.rootblk = rootblknum;
2610 xlrec.level = metad->btm_level;
2611
2614
2618
2619 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2620 md.version = metad->btm_version;
2621 md.root = rootblknum;
2622 md.level = metad->btm_level;
2623 md.fastroot = rootblknum;
2624 md.fastlevel = metad->btm_level;
2625 md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2626 md.allequalimage = metad->btm_allequalimage;
2627
2628 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2629
2630 /*
2631 * Direct access to page is not good but faster - we should implement
2632 * some new func in page API.
2633 */
2635 (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2636 ((PageHeader) rootpage)->pd_special -
2637 ((PageHeader) rootpage)->pd_upper);
2638
2640 }
2641 else
2642 recptr = XLogGetFakeLSN(rel);
2643
2647
2649
2650 /* done with metapage */
2651 _bt_relbuf(rel, metabuf);
2652
2655
2656 return rootbuf;
2657}
2658
2659/*
2660 * _bt_pgaddtup() -- add a data item to a particular page during split.
2661 *
2662 * The difference between this routine and a bare PageAddItem call is
2663 * that this code can deal with the first data item on an internal btree
2664 * page in passing. This data item (which is called "firstright" within
2665 * _bt_split()) has a key that must be treated as minus infinity after
2666 * the split. Therefore, we truncate away all attributes when caller
2667 * specifies it's the first data item on page (downlink is not changed,
2668 * though). This extra step is only needed for the right page of an
2669 * internal page split. There is no need to do this for the first data
2670 * item on the existing/left page, since that will already have been
2671 * truncated during an earlier page split.
2672 *
2673 * See _bt_split() for a high level explanation of why we truncate here.
2674 * Note that this routine has nothing to do with suffix truncation,
2675 * despite using some of the same infrastructure.
2676 */
2677static inline bool
2679 Size itemsize,
2680 const IndexTupleData *itup,
2682 bool newfirstdataitem)
2683{
2685
2686 if (newfirstdataitem)
2687 {
2688 trunctuple = *itup;
2690 BTreeTupleSetNAtts(&trunctuple, 0, false);
2691 itup = &trunctuple;
2692 itemsize = sizeof(IndexTupleData);
2693 }
2694
2695 if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
2696 return false;
2697
2698 return true;
2699}
2700
2701/*
2702 * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2703 *
2704 * There are three operations performed here: simple index deletion, bottom-up
2705 * index deletion, and deduplication. If all three operations fail to free
2706 * enough space for the incoming item then caller will go on to split the
2707 * page. We always consider simple deletion first. If that doesn't work out
2708 * we consider alternatives. Callers that only want us to consider simple
2709 * deletion (without any fallback) ask for that using the 'simpleonly'
2710 * argument.
2711 *
2712 * We usually pick only one alternative "complex" operation when simple
2713 * deletion alone won't prevent a page split. The 'checkingunique',
2714 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2715 *
2716 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2717 * level flag was found set. The flag was useful back when there wasn't
2718 * necessarily one single page for a duplicate tuple to go on (before heap TID
2719 * became a part of the key space in version 4 indexes). But we don't
2720 * actually look at the flag anymore (it's not a gating condition for our
2721 * caller). That would cause us to miss tuples that are safe to delete,
2722 * without getting any benefit in return. We know that the alternative is to
2723 * split the page; scanning the line pointer array in passing won't have
2724 * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2725 * all this because !heapkeyspace indexes must still do a "getting tired"
2726 * linear search, and so are likely to get some benefit from using it as a
2727 * gating condition.)
2728 */
2729static void
2732 bool simpleonly, bool checkingunique,
2733 bool uniquedup, bool indexUnchanged)
2734{
2736 int ndeletable = 0;
2737 OffsetNumber offnum,
2738 minoff,
2739 maxoff;
2740 Buffer buffer = insertstate->buf;
2741 BTScanInsert itup_key = insertstate->itup_key;
2742 Page page = BufferGetPage(buffer);
2743 BTPageOpaque opaque = BTPageGetOpaque(page);
2744
2745 Assert(P_ISLEAF(opaque));
2746 Assert(simpleonly || itup_key->heapkeyspace);
2748
2749 /*
2750 * Scan over all items to see which ones need to be deleted according to
2751 * LP_DEAD flags. We'll usually manage to delete a few extra items that
2752 * are not marked LP_DEAD in passing. Often the extra items that actually
2753 * end up getting deleted are items that would have had their LP_DEAD bit
2754 * set before long anyway (if we opted not to include them as extras).
2755 */
2756 minoff = P_FIRSTDATAKEY(opaque);
2757 maxoff = PageGetMaxOffsetNumber(page);
2758 for (offnum = minoff;
2759 offnum <= maxoff;
2760 offnum = OffsetNumberNext(offnum))
2761 {
2762 ItemId itemId = PageGetItemId(page, offnum);
2763
2764 if (ItemIdIsDead(itemId))
2765 deletable[ndeletable++] = offnum;
2766 }
2767
2768 if (ndeletable > 0)
2769 {
2770 _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2771 insertstate->itup, minoff, maxoff);
2772 insertstate->bounds_valid = false;
2773
2774 /* Return when a page split has already been avoided */
2775 if (PageGetFreeSpace(page) >= insertstate->itemsz)
2776 return;
2777
2778 /* Might as well assume duplicates (if checkingunique) */
2779 uniquedup = true;
2780 }
2781
2782 /*
2783 * We're done with simple deletion. Return early with callers that only
2784 * call here so that simple deletion can be considered. This includes
2785 * callers that explicitly ask for this and checkingunique callers that
2786 * probably don't have any version churn duplicates on the page.
2787 *
2788 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2789 * return at this point (or when we go on the try either or both of our
2790 * other strategies and they also fail). We do not bother expending a
2791 * separate write to clear it, however. Caller will definitely clear it
2792 * when it goes on to split the page (note also that the deduplication
2793 * process will clear the flag in passing, just to keep things tidy).
2794 */
2795 if (simpleonly || (checkingunique && !uniquedup))
2796 {
2798 return;
2799 }
2800
2801 /* Assume bounds about to be invalidated (this is almost certain now) */
2802 insertstate->bounds_valid = false;
2803
2804 /*
2805 * Perform bottom-up index deletion pass when executor hint indicated that
2806 * incoming item is logically unchanged, or for a unique index that is
2807 * known to have physical duplicates for some other reason. (There is a
2808 * large overlap between these two cases for a unique index. It's worth
2809 * having both triggering conditions in order to apply the optimization in
2810 * the event of successive related INSERT and DELETE statements.)
2811 *
2812 * We'll go on to do a deduplication pass when a bottom-up pass fails to
2813 * delete an acceptable amount of free space (a significant fraction of
2814 * the page, or space for the new item, whichever is greater).
2815 *
2816 * Note: Bottom-up index deletion uses the same equality/equivalence
2817 * routines as deduplication internally. However, it does not merge
2818 * together index tuples, so the same correctness considerations do not
2819 * apply. We deliberately omit an index-is-allequalimage test here.
2820 */
2821 if ((indexUnchanged || uniquedup) &&
2822 _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2823 return;
2824
2825 /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2826 if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2827 _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2829}
2830
2831/*
2832 * _bt_simpledel_pass - Simple index tuple deletion pass.
2833 *
2834 * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2835 * of all such tuples are determined by caller (caller passes these to us as
2836 * its 'deletable' argument).
2837 *
2838 * We might also delete extra index tuples that turn out to be safe to delete
2839 * in passing (though they must be cheap to check in passing to begin with).
2840 * There is no certainty that any extra tuples will be deleted, though. The
2841 * high level goal of the approach we take is to get the most out of each call
2842 * here (without noticeably increasing the per-call overhead compared to what
2843 * we need to do just to be able to delete the page's LP_DEAD-marked index
2844 * tuples).
2845 *
2846 * The number of extra index tuples that turn out to be deletable might
2847 * greatly exceed the number of LP_DEAD-marked index tuples due to various
2848 * locality related effects. For example, it's possible that the total number
2849 * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2850 * quite low, in which case we might end up checking if it's possible to
2851 * delete _most_ index tuples on the page (without the tableam needing to
2852 * access additional table blocks). The tableam will sometimes stumble upon
2853 * _many_ extra deletable index tuples in indexes where this pattern is
2854 * common.
2855 *
2856 * See nbtree/README for further details on simple index tuple deletion.
2857 */
2858static void
2861 OffsetNumber minoff, OffsetNumber maxoff)
2862{
2863 Page page = BufferGetPage(buffer);
2865 int ndeadblocks;
2867 OffsetNumber offnum;
2868
2869 /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2871 &ndeadblocks);
2872
2873 /* Initialize tableam state that describes index deletion operation */
2874 delstate.irel = rel;
2875 delstate.iblknum = BufferGetBlockNumber(buffer);
2876 delstate.bottomup = false;
2877 delstate.bottomupfreespace = 0;
2878 delstate.ndeltids = 0;
2881
2882 for (offnum = minoff;
2883 offnum <= maxoff;
2884 offnum = OffsetNumberNext(offnum))
2885 {
2886 ItemId itemid = PageGetItemId(page, offnum);
2887 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2888 TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2889 TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2891 void *match;
2892
2893 if (!BTreeTupleIsPosting(itup))
2894 {
2897 sizeof(BlockNumber), _bt_blk_cmp);
2898
2899 if (!match)
2900 {
2901 Assert(!ItemIdIsDead(itemid));
2902 continue;
2903 }
2904
2905 /*
2906 * TID's table block is among those pointed to by the TIDs from
2907 * LP_DEAD-bit set tuples on page -- add TID to deltids
2908 */
2909 odeltid->tid = itup->t_tid;
2910 odeltid->id = delstate.ndeltids;
2911 ostatus->idxoffnum = offnum;
2912 ostatus->knowndeletable = ItemIdIsDead(itemid);
2913 ostatus->promising = false; /* unused */
2914 ostatus->freespace = 0; /* unused */
2915
2916 delstate.ndeltids++;
2917 }
2918 else
2919 {
2920 int nitem = BTreeTupleGetNPosting(itup);
2921
2922 for (int p = 0; p < nitem; p++)
2923 {
2924 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2925
2928 sizeof(BlockNumber), _bt_blk_cmp);
2929
2930 if (!match)
2931 {
2932 Assert(!ItemIdIsDead(itemid));
2933 continue;
2934 }
2935
2936 /*
2937 * TID's table block is among those pointed to by the TIDs
2938 * from LP_DEAD-bit set tuples on page -- add TID to deltids
2939 */
2940 odeltid->tid = *tid;
2941 odeltid->id = delstate.ndeltids;
2942 ostatus->idxoffnum = offnum;
2943 ostatus->knowndeletable = ItemIdIsDead(itemid);
2944 ostatus->promising = false; /* unused */
2945 ostatus->freespace = 0; /* unused */
2946
2947 odeltid++;
2948 ostatus++;
2949 delstate.ndeltids++;
2950 }
2951 }
2952 }
2953
2955
2956 Assert(delstate.ndeltids >= ndeletable);
2957
2958 /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2959 _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2960
2961 pfree(delstate.deltids);
2962 pfree(delstate.status);
2963}
2964
2965/*
2966 * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2967 *
2968 * Builds sorted and unique-ified array of table block numbers from index
2969 * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2970 * block from incoming newitem just in case it isn't among the LP_DEAD-related
2971 * table blocks.
2972 *
2973 * Always counting the newitem's table block as an LP_DEAD related block makes
2974 * sense because the cost is consistently low; it is practically certain that
2975 * the table block will not incur a buffer miss in tableam. On the other hand
2976 * the benefit is often quite high. There is a decent chance that there will
2977 * be some deletable items from this block, since in general most garbage
2978 * tuples became garbage in the recent past (in many cases this won't be the
2979 * first logical row that core code added to/modified in table block
2980 * recently).
2981 *
2982 * Returns final array, and sets *nblocks to its final size for caller.
2983 */
2984static BlockNumber *
2986 IndexTuple newitem, int *nblocks)
2987{
2988 int spacentids,
2989 ntids;
2991
2992 /*
2993 * Accumulate each TID's block in array whose initial size has space for
2994 * one table block per LP_DEAD-set tuple (plus space for the newitem table
2995 * block). Array will only need to grow when there are LP_DEAD-marked
2996 * posting list tuples (which is not that common).
2997 */
2998 spacentids = ndeletable + 1;
2999 ntids = 0;
3001
3002 /*
3003 * First add the table block for the incoming newitem. This is the one
3004 * case where simple deletion can visit a table block that doesn't have
3005 * any known deletable items.
3006 */
3007 Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
3008 tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
3009
3010 for (int i = 0; i < ndeletable; i++)
3011 {
3012 ItemId itemid = PageGetItemId(page, deletable[i]);
3013 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
3014
3015 Assert(ItemIdIsDead(itemid));
3016
3017 if (!BTreeTupleIsPosting(itup))
3018 {
3019 if (ntids + 1 > spacentids)
3020 {
3021 spacentids *= 2;
3024 }
3025
3026 tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
3027 }
3028 else
3029 {
3030 int nposting = BTreeTupleGetNPosting(itup);
3031
3032 if (ntids + nposting > spacentids)
3033 {
3034 spacentids = Max(spacentids * 2, ntids + nposting);
3037 }
3038
3039 for (int j = 0; j < nposting; j++)
3040 {
3041 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
3042
3043 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3044 }
3045 }
3046 }
3047
3048 qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3049 *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3050
3051 return tidblocks;
3052}
3053
3054/*
3055 * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3056 */
3057static inline int
3058_bt_blk_cmp(const void *arg1, const void *arg2)
3059{
3060 BlockNumber b1 = *((const BlockNumber *) arg1);
3061 BlockNumber b2 = *((const BlockNumber *) arg2);
3062
3063 return pg_cmp_u32(b1, b2);
3064}
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static Datum values[MAXATTR]
Definition bootstrap.c:188
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
Definition bufmgr.c:6937
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5505
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3063
bool BufferBeginSetHintBits(Buffer buffer)
Definition bufmgr.c:6909
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:874
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
static Size BufferGetPageSize(Buffer buffer)
Definition bufmgr.h:459
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
Size PageGetFreeSpace(const PageData *page)
Definition bufpage.c:906
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:379
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:417
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:411
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition bufpage.h:504
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:397
#define MAXALIGN(LEN)
Definition c.h:898
uint8_t uint8
Definition c.h:616
#define PG_UINT32_MAX
Definition c.h:676
#define Max(x, y)
Definition c.h:1087
#define Assert(condition)
Definition c.h:945
uint16_t uint16
Definition c.h:617
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
uint32 TransactionId
Definition c.h:738
size_t Size
Definition c.h:691
int errcode(int sqlerrcode)
Definition elog.c:874
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_array(type, count)
Definition fe_memutils.h:76
char * BuildIndexValueDescription(Relation indexRelation, const Datum *values, const bool *isnull)
Definition genam.c:178
IndexUniqueCheck
Definition genam.h:124
@ UNIQUE_CHECK_NO
Definition genam.h:125
@ UNIQUE_CHECK_EXISTING
Definition genam.h:128
@ UNIQUE_CHECK_PARTIAL
Definition genam.h:127
return str start
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition indextuple.c:364
IndexTuple CopyIndexTuple(IndexTuple source)
Definition indextuple.c:479
#define INJECTION_POINT(name, arg)
static int pg_cmp_u32(uint32 a, uint32 b)
Definition int.h:719
int j
Definition isn.c:78
int i
Definition isn.c:77
#define ItemIdMarkDead(itemId)
Definition itemid.h:179
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define ItemIdIsDead(itemId)
Definition itemid.h:113
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
IndexTupleData * IndexTuple
Definition itup.h:53
static Size IndexTupleSize(const IndexTupleData *itup)
Definition itup.h:71
#define MaxIndexTuplesPerPage
Definition itup.h:181
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition lmgr.c:828
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition lmgr.c:663
@ XLTW_InsertIndex
Definition lmgr.h:31
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define END_CRIT_SECTION()
Definition miscadmin.h:152
IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
Definition nbtdedup.c:1022
void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
Definition nbtdedup.c:59
bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
Definition nbtdedup.c:309
static void _bt_freestack(BTStack stack)
Definition nbtinsert.c:2461
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition nbtinsert.c:320
#define BTREE_FASTPATH_MIN_LEVEL
Definition nbtinsert.c:32
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition nbtinsert.c:829
Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
Definition nbtinsert.c:2351
static bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition nbtinsert.c:2678
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
Definition nbtinsert.c:2859
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition nbtinsert.c:105
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
Definition nbtinsert.c:2492
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition nbtinsert.c:2130
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition nbtinsert.c:2730
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition nbtinsert.c:2272
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition nbtinsert.c:1489
static int _bt_blk_cmp(const void *arg1, const void *arg2)
Definition nbtinsert.c:3058
static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
Definition nbtinsert.c:1041
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition nbtinsert.c:1119
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition nbtinsert.c:411
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition nbtinsert.c:2985
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition nbtpage.c:1008
void _bt_upgrademetapage(Page page)
Definition nbtpage.c:108
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1028
int _bt_getrootheight(Relation rel)
Definition nbtpage.c:680
void _bt_pageinit(Page page, Size size)
Definition nbtpage.c:1134
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition nbtpage.c:874
void _bt_checkpage(Relation rel, Buffer buf)
Definition nbtpage.c:802
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1098
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition nbtpage.c:850
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition nbtpage.c:1520
#define BTGetDeduplicateItems(relation)
Definition nbtree.h:1135
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
#define BTPageGetMeta(p)
Definition nbtree.h:122
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define P_HIKEY
Definition nbtree.h:368
#define P_HAS_GARBAGE(opaque)
Definition nbtree.h:227
#define BTP_HAS_GARBAGE
Definition nbtree.h:83
#define P_LEFTMOST(opaque)
Definition nbtree.h:219
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define BTP_ROOT
Definition nbtree.h:78
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition nbtree.h:563
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
#define P_ISROOT(opaque)
Definition nbtree.h:222
#define P_NONE
Definition nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define P_INCOMPLETE_SPLIT(opaque)
Definition nbtree.h:228
#define BTREE_METAPAGE
Definition nbtree.h:149
#define BTP_INCOMPLETE_SPLIT
Definition nbtree.h:84
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition nbtree.h:545
#define BT_READ
Definition nbtree.h:730
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition nbtree.h:557
#define P_IGNORE(opaque)
Definition nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
#define BTREE_NOVAC_VERSION
Definition nbtree.h:153
#define BT_WRITE
Definition nbtree.h:731
#define P_FIRSTKEY
Definition nbtree.h:369
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition nbtree.h:596
#define BTMaxItemSize
Definition nbtree.h:165
#define BTP_SPLIT_END
Definition nbtree.h:82
#define BTreeTupleGetNAtts(itup, rel)
Definition nbtree.h:578
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition nbtsearch.c:2092
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition nbtsearch.c:475
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:689
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access, bool returnstack)
Definition nbtsearch.c:100
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition nbtutils.c:1119
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition nbtutils.c:430
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition nbtutils.c:59
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition nbtutils.c:693
#define XLOG_BTREE_INSERT_POST
Definition nbtxlog.h:32
#define SizeOfBtreeInsert
Definition nbtxlog.h:84
#define XLOG_BTREE_SPLIT_R
Definition nbtxlog.h:31
#define XLOG_BTREE_INSERT_LEAF
Definition nbtxlog.h:27
#define XLOG_BTREE_INSERT_UPPER
Definition nbtxlog.h:28
#define SizeOfBtreeSplit
Definition nbtxlog.h:158
#define SizeOfBtreeNewroot
Definition nbtxlog.h:347
#define XLOG_BTREE_INSERT_META
Definition nbtxlog.h:29
#define XLOG_BTREE_SPLIT_L
Definition nbtxlog.h:30
#define XLOG_BTREE_NEWROOT
Definition nbtxlog.h:37
static char * errmsg
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
#define INDEX_MAX_KEYS
uint32 pg_prng_uint32(pg_prng_state *state)
Definition pg_prng.c:227
pg_prng_state pg_global_prng_state
Definition pg_prng.c:34
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define qsort(a, b, c, d)
Definition port.h:495
uint64_t Datum
Definition postgres.h:70
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition predicate.c:3153
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4345
static int fb(int x)
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition qunique.h:21
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationGetTargetBlock(relation)
Definition rel.h:610
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define RelationSetTargetBlock(relation, targblock)
Definition rel.h:617
#define IndexRelationGetNumberOfAttributes(relation)
Definition rel.h:526
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
int errtableconstraint(Relation rel, const char *conname)
Definition relcache.c:6116
#define SnapshotSelf
Definition snapmgr.h:32
#define InitDirtySnapshot(snapshotdata)
Definition snapmgr.h:42
BlockNumber btpo_next
Definition nbtree.h:66
uint16 btpo_flags
Definition nbtree.h:68
uint32 btpo_level
Definition nbtree.h:67
ItemPointer scantid
Definition nbtree.h:802
bool allequalimage
Definition nbtree.h:798
bool heapkeyspace
Definition nbtree.h:797
BlockNumber bts_blkno
Definition nbtree.h:745
struct BTStackData * bts_parent
Definition nbtree.h:747
OffsetNumber bts_offset
Definition nbtree.h:746
ItemPointerData t_tid
Definition itup.h:37
unsigned short t_info
Definition itup.h:49
BlockNumber fastroot
Definition nbtxlog.h:51
uint32 fastlevel
Definition nbtxlog.h:52
BlockNumber root
Definition nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition nbtxlog.h:53
uint32 level
Definition nbtxlog.h:152
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition tableam.c:240
#define InvalidTransactionId
Definition transam.h:31
#define TransactionIdIsValid(xid)
Definition transam.h:41
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:479
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:410
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:369
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:246
void XLogBeginInsert(void)
Definition xloginsert.c:153
XLogRecPtr XLogGetFakeLSN(Relation rel)
Definition xloginsert.c:559
#define REGBUF_STANDARD
Definition xloginsert.h:35
#define REGBUF_WILL_INIT
Definition xloginsert.h:34