1/*-------------------------------------------------------------------------
2 *
3 * nbtinsert.c
4 * Item insertion in Lehman and Yao btrees for Postgres.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtinsert.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "access/nbtree.h"
19#include "access/nbtxlog.h"
20#include "access/tableam.h"
21#include "access/transam.h"
22#include "access/xloginsert.h"
23#include "common/int.h"
24#include "common/pg_prng.h"
25#include "lib/qunique.h"
26#include "miscadmin.h"
27#include "storage/lmgr.h"
28#include "storage/predicate.h"
29#include "utils/injection_point.h"
30
31/* Minimum tree height for application of fastpath optimization */
32#define BTREE_FASTPATH_MIN_LEVEL 2
33
34
35static BTStack _bt_search_insert(Relation rel, Relation heaprel,
36 BTInsertState insertstate);
37static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
38 Relation heapRel,
39 IndexUniqueCheck checkUnique, bool *is_unique,
40 uint32 *speculativeToken);
41static OffsetNumber _bt_findinsertloc(Relation rel,
42 BTInsertState insertstate,
43 bool checkingunique,
44 bool indexUnchanged,
45 BTStack stack,
46 Relation heapRel);
47static void _bt_stepright(Relation rel, Relation heaprel,
48 BTInsertState insertstate, BTStack stack);
49static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
50 Buffer buf,
51 Buffer cbuf,
52 BTStack stack,
53 IndexTuple itup,
54 Size itemsz,
55 OffsetNumber newitemoff,
56 int postingoff,
57 bool split_only_page);
58static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
59 Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
60 Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
61 IndexTuple nposting, uint16 postingoff);
62static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
63 Buffer rbuf, BTStack stack, bool isroot, bool isonly);
64static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
65static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
66 OffsetNumber itup_off, bool newfirstdataitem);
67static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
68 BTInsertState insertstate,
69 bool simpleonly, bool checkingunique,
70 bool uniquedup, bool indexUnchanged);
71static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
72 OffsetNumber *deletable, int ndeletable,
73 IndexTuple newitem, OffsetNumber minoff,
74 OffsetNumber maxoff);
75static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
76 int ndeletable, IndexTuple newitem,
77 int *nblocks);
78static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
79
80/*
81 * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
82 *
83 * This routine is called by the public interface routine, btinsert.
84 * By here, itup is filled in, including the TID.
85 *
86 * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
87 * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
88 * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
89 * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
90 * don't actually insert.
91 *
92 * indexUnchanged executor hint indicates if itup is from an
93 * UPDATE that didn't logically change the indexed value, but
94 * must nevertheless have a new entry to point to a successor
95 * version.
96 *
97 * The result value is only significant for UNIQUE_CHECK_PARTIAL:
98 * it must be true if the entry is known unique, else false.
99 * (In the current implementation we'll also return true after a
100 * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
101 * that's just a coding artifact.)
102 */
103bool
104_bt_doinsert(Relation rel, IndexTuple itup,
105 IndexUniqueCheck checkUnique, bool indexUnchanged,
106 Relation heapRel)
107{
108 bool is_unique = false;
109 BTInsertStateData insertstate;
110 BTScanInsert itup_key;
111 BTStack stack;
112 bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
113
114 /* we need an insertion scan key to do our search, so build one */
115 itup_key = _bt_mkscankey(rel, itup);
116
117 if (checkingunique)
118 {
119 if (!itup_key->anynullkeys)
120 {
121 /* No (heapkeyspace) scantid until uniqueness established */
122 itup_key->scantid = NULL;
123 }
124 else
125 {
126 /*
127 * Scan key for new tuple contains NULL key values. Bypass
128 * checkingunique steps. They are unnecessary because core code
129 * considers NULL unequal to every value, including NULL.
130 *
131 * This optimization avoids O(N^2) behavior within the
132 * _bt_findinsertloc() heapkeyspace path when a unique index has a
133 * large number of "duplicates" with NULL key values.
134 */
135 checkingunique = false;
136 /* Tuple is unique in the sense that core code cares about */
137 Assert(checkUnique != UNIQUE_CHECK_EXISTING);
138 is_unique = true;
139 }
140 }
141
142 /*
143 * Fill in the BTInsertState working area, to track the current page and
144 * position within the page to insert on.
145 *
146 * Note that itemsz is passed down to lower level code that deals with
147 * inserting the item. It must be MAXALIGN()'d. This ensures that space
148 * accounting code consistently considers the alignment overhead that we
149 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
150 * already conservative about alignment, but we don't rely on that from
151 * this distance. Besides, preserving the "true" tuple size in index
152 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
153 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
154 */
155 insertstate.itup = itup;
156 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
157 insertstate.itup_key = itup_key;
158 insertstate.bounds_valid = false;
159 insertstate.buf = InvalidBuffer;
160 insertstate.postingoff = 0;
161
162search:
163
164 /*
165 * Find and lock the leaf page that the tuple should be added to by
166 * searching from the root page. insertstate.buf will hold a buffer that
167 * is locked in exclusive mode afterwards.
168 */
169 stack = _bt_search_insert(rel, heapRel, &insertstate);
170
171 /*
172 * checkingunique inserts are not allowed to go ahead when two tuples with
173 * equal key attribute values would be visible to new MVCC snapshots once
174 * the xact commits. Check for conflicts in the locked page/buffer (if
175 * needed) here.
176 *
177 * It might be necessary to check a page to the right in _bt_check_unique,
178 * though that should be very rare. In practice the first page the value
179 * could be on (with scantid omitted) is almost always also the only page
180 * that a matching tuple might be found on. This is due to the behavior
181 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
182 * only be allowed to cross a page boundary when there is no candidate
183 * leaf page split point that avoids it. Also, _bt_check_unique can use
184 * the leaf page high key to determine that there will be no duplicates on
185 * the right sibling without actually visiting it (it uses the high key in
186 * cases where the new item happens to belong at the far right of the leaf
187 * page).
188 *
189 * NOTE: obviously, _bt_check_unique can only detect keys that are already
190 * in the index; so it cannot defend against concurrent insertions of the
191 * same key. We protect against that by means of holding a write lock on
192 * the first page the value could be on, with omitted/-inf value for the
193 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
194 * the same key must acquire a write lock on the same page, so only one
195 * would-be inserter can be making the check at one time. Furthermore,
196 * once we are past the check we hold write locks continuously until we
197 * have performed our insertion, so no later inserter can fail to see our
198 * insertion. (This requires some care in _bt_findinsertloc.)
199 *
200 * If we must wait for another xact, we release the lock while waiting,
201 * and then must perform a new search.
202 *
203 * For a partial uniqueness check, we don't wait for the other xact. Just
204 * let the tuple in and return false for possibly non-unique, or true for
205 * definitely unique.
206 */
207 if (checkingunique)
208 {
209 TransactionId xwait;
210 uint32 speculativeToken;
211
212 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
213 &is_unique, &speculativeToken);
214
215 if (unlikely(TransactionIdIsValid(xwait)))
216 {
217 /* Have to wait for the other guy ... */
218 _bt_relbuf(rel, insertstate.buf);
219 insertstate.buf = InvalidBuffer;
220
221 /*
222 * If it's a speculative insertion, wait for it to finish (ie. to
223 * go ahead with the insertion, or kill the tuple). Otherwise
224 * wait for the transaction to finish as usual.
225 */
226 if (speculativeToken)
227 SpeculativeInsertionWait(xwait, speculativeToken);
228 else
229 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
230
231 /* start over... */
232 if (stack)
233 _bt_freestack(stack);
234 goto search;
235 }
236
237 /* Uniqueness is established -- restore heap tid as scantid */
238 if (itup_key->heapkeyspace)
239 itup_key->scantid = &itup->t_tid;
240 }
241
242 if (checkUnique != UNIQUE_CHECK_EXISTING)
243 {
244 OffsetNumber newitemoff;
245
246 /*
247 * The only conflict predicate locking cares about for indexes is when
248 * an index tuple insert conflicts with an existing lock. We don't
249 * know the actual page we're going to insert on for sure just yet in
250 * checkingunique and !heapkeyspace cases, but it's okay to use the
251 * first page the value could be on (with scantid omitted) instead.
252 */
253 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
254
255 /*
256 * Do the insertion. Note that insertstate contains cached binary
257 * search bounds established within _bt_check_unique when insertion is
258 * checkingunique.
259 */
260 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
261 indexUnchanged, stack, heapRel);
262 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
263 stack, itup, insertstate.itemsz, newitemoff,
264 insertstate.postingoff, false);
265 }
266 else
267 {
268 /* just release the buffer */
269 _bt_relbuf(rel, insertstate.buf);
270 }
271
272 /* be tidy */
273 if (stack)
274 _bt_freestack(stack);
275 pfree(itup_key);
276
277 return is_unique;
278}
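/*
 * [Editor's note -- not part of nbtinsert.c] As context for the comment above
 * ("called by the public interface routine, btinsert"), a minimal sketch of
 * the expected call pattern from nbtree.c, paraphrased from memory rather
 * than quoted from the tree:
 *
 *	IndexTuple	itup;
 *	bool		result;
 *
 *	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 *	itup->t_tid = *ht_ctid;		// caller fills in the heap TID
 *	result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
 *	pfree(itup);
 */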
279
280/*
281 * _bt_search_insert() -- _bt_search() wrapper for inserts
282 *
283 * Search the tree for a particular scankey, or more precisely for the first
284 * leaf page it could be on. Try to make use of the fastpath optimization's
285 * rightmost leaf page cache before actually searching the tree from the root
286 * page, though.
287 *
288 * Return value is a stack of parent-page pointers (though see notes about
289 * fastpath optimization and page splits below). insertstate->buf is set to
290 * the address of the leaf-page buffer, which is write-locked and pinned in
291 * all cases (if necessary by creating a new empty root page for caller).
292 *
293 * The fastpath optimization avoids most of the work of searching the tree
294 * repeatedly when a single backend inserts successive new tuples on the
295 * rightmost leaf page of an index. A backend cache of the rightmost leaf
296 * page is maintained within _bt_insertonpg(), and used here. The cache is
297 * invalidated here when an insert of a non-pivot tuple must take place on a
298 * non-rightmost leaf page.
299 *
300 * The optimization helps with indexes on an auto-incremented field. It also
301 * helps with indexes on datetime columns, as well as indexes with lots of
302 * NULL values. (NULLs usually get inserted in the rightmost page for single
303 * column indexes, since they usually get treated as coming after everything
304 * else in the key space. Individual NULL tuples will generally be placed on
305 * the rightmost leaf page due to the influence of the heap TID column.)
306 *
307 * Note that we avoid applying the optimization when there is insufficient
308 * space on the rightmost page to fit caller's new item. This is necessary
309 * because we'll need to return a real descent stack when a page split is
310 * expected (actually, caller can cope with a leaf page split that uses a NULL
311 * stack, but that's very slow and so must be avoided). Note also that the
312 * fastpath optimization acquires the lock on the page conditionally as a way
313 * of reducing extra contention when there are concurrent insertions into the
314 * rightmost page (we give up if we'd have to wait for the lock). We assume
315 * that it isn't useful to apply the optimization when there is contention,
316 * since each per-backend cache won't stay valid for long.
317 */
318static BTStack
319_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
320{
321 Assert(insertstate->buf == InvalidBuffer);
322 Assert(!insertstate->bounds_valid);
323 Assert(insertstate->postingoff == 0);
324
325 if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
326 {
327 /* Simulate a _bt_getbuf() call with conditional locking */
328 insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
329 if (_bt_conditionallockbuf(rel, insertstate->buf))
330 {
331 Page page;
332 BTPageOpaque opaque;
333
334 _bt_checkpage(rel, insertstate->buf);
335 page = BufferGetPage(insertstate->buf);
336 opaque = BTPageGetOpaque(page);
337
338 /*
339 * Check if the page is still the rightmost leaf page and has
340 * enough free space to accommodate the new tuple. Also check
341 * that the insertion scan key is strictly greater than the first
342 * non-pivot tuple on the page. (Note that we expect itup_key's
343 * scantid to be unset when our caller is a checkingunique
344 * inserter.)
345 */
346 if (P_RIGHTMOST(opaque) &&
347 P_ISLEAF(opaque) &&
348 !P_IGNORE(opaque) &&
349 PageGetFreeSpace(page) > insertstate->itemsz &&
350 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
351 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
352 {
353 /*
354 * Caller can use the fastpath optimization because cached
355 * block is still rightmost leaf page, which can fit caller's
356 * new tuple without splitting. Keep block in local cache for
357 * next insert, and have caller use NULL stack.
358 *
359 * Note that _bt_insert_parent() has an assertion that catches
360 * leaf page splits that somehow follow from a fastpath insert
361 * (it should only be passed a NULL stack when it must deal
362 * with a concurrent root page split, and never because a NULL
363 * stack was returned here).
364 */
365 return NULL;
366 }
367
368 /* Page unsuitable for caller, drop lock and pin */
369 _bt_relbuf(rel, insertstate->buf);
370 }
371 else
372 {
373 /* Lock unavailable, drop pin */
374 ReleaseBuffer(insertstate->buf);
375 }
376
377 /* Forget block, since cache doesn't appear to be useful */
378 RelationSetTargetBlock(rel, InvalidBlockNumber);
379 }
380
381 /* Cannot use optimization -- descend tree, return proper descent stack */
382 return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
383 BT_WRITE);
384}
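/*
 * [Editor's note -- not part of nbtinsert.c] Condensed pseudocode for the
 * fastpath decision implemented above (a sketch of the control flow only,
 * not upstream code):
 *
 *	blkno = RelationGetTargetBlock(rel);		// per-backend cached block
 *	if (blkno != InvalidBlockNumber &&
 *		conditional lock on blkno succeeds &&
 *		page is still the rightmost, live leaf page &&
 *		new tuple fits and compares strictly greater than existing tuples)
 *		return NULL;				// insert locally, no descent stack
 *	RelationSetTargetBlock(rel, InvalidBlockNumber);	// cache looks stale
 *	return _bt_search(rel, heaprel, itup_key, &buf, BT_WRITE);
 */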
385
386/*
387 * _bt_check_unique() -- Check for violation of unique index constraint
388 *
389 * Returns InvalidTransactionId if there is no conflict, else an xact ID
390 * we must wait for to see if it commits a conflicting tuple. If an actual
391 * conflict is detected, no return --- just ereport(). If an xact ID is
392 * returned, and the conflicting tuple still has a speculative insertion in
393 * progress, *speculativeToken is set to non-zero, and the caller can wait for
394 * the verdict on the insertion using SpeculativeInsertionWait().
395 *
396 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
397 * InvalidTransactionId because we don't want to wait. In this case we
398 * set *is_unique to false if there is a potential conflict, and the
399 * core code must redo the uniqueness check later.
400 *
401 * As a side-effect, sets state in insertstate that can later be used by
402 * _bt_findinsertloc() to reuse most of the binary search work we do
403 * here.
404 *
405 * This code treats NULLs as equal, unlike the default semantics for unique
406 * indexes. So do not call here when there are NULL values in scan key and
407 * the index uses the default NULLS DISTINCT mode.
408 */
409static TransactionId
410_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
411 IndexUniqueCheck checkUnique, bool *is_unique,
412 uint32 *speculativeToken)
413{
414 IndexTuple itup = insertstate->itup;
415 IndexTuple curitup = NULL;
416 ItemId curitemid = NULL;
417 BTScanInsert itup_key = insertstate->itup_key;
418 SnapshotData SnapshotDirty;
419 OffsetNumber offset;
420 OffsetNumber maxoff;
421 Page page;
422 BTPageOpaque opaque;
423 Buffer nbuf = InvalidBuffer;
424 bool found = false;
425 bool inposting = false;
426 bool prevalldead = true;
427 int curposti = 0;
428
429 /* Assume unique until we find a duplicate */
430 *is_unique = true;
431
432 InitDirtySnapshot(SnapshotDirty);
433
434 page = BufferGetPage(insertstate->buf);
435 opaque = BTPageGetOpaque(page);
436 maxoff = PageGetMaxOffsetNumber(page);
437
438 /*
439 * Find the first tuple with the same key.
440 *
441 * This also saves the binary search bounds in insertstate. We use them
442 * in the fastpath below, but also in the _bt_findinsertloc() call later.
443 */
444 Assert(!insertstate->bounds_valid);
445 offset = _bt_binsrch_insert(rel, insertstate);
446
447 /*
448 * Scan over all equal tuples, looking for live conflicts.
449 */
450 Assert(!insertstate->bounds_valid || insertstate->low == offset);
451 Assert(!itup_key->anynullkeys);
452 Assert(itup_key->scantid == NULL);
453 for (;;)
454 {
455 /*
456 * Each iteration of the loop processes one heap TID, not one index
457 * tuple. Current offset number for page isn't usually advanced on
458 * iterations that process heap TIDs from posting list tuples.
459 *
460 * "inposting" state is set when _inside_ a posting list --- not when
461 * we're at the start (or end) of a posting list. We advance curposti
462 * at the end of the iteration when inside a posting list tuple. In
463 * general, every loop iteration either advances the page offset or
464 * advances curposti --- an iteration that handles the rightmost/max
465 * heap TID in a posting list finally advances the page offset (and
466 * unsets "inposting").
467 *
468 * Make sure the offset points to an actual index tuple before trying
469 * to examine it...
470 */
471 if (offset <= maxoff)
472 {
473 /*
474 * Fastpath: In most cases, we can use cached search bounds to
475 * limit our consideration to items that are definitely
476 * duplicates. This fastpath doesn't apply when the original page
477 * is empty, or when initial offset is past the end of the
478 * original page, which may indicate that we need to examine a
479 * second or subsequent page.
480 *
481 * Note that this optimization allows us to avoid calling
482 * _bt_compare() directly when there are no duplicates, as long as
483 * the offset where the key will go is not at the end of the page.
484 */
485 if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
486 {
487 Assert(insertstate->bounds_valid);
488 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
489 Assert(insertstate->low <= insertstate->stricthigh);
490 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
491 break;
492 }
493
494 /*
495 * We can skip items that are already marked killed.
496 *
497 * In the presence of heavy update activity an index may contain
498 * many killed items with the same key; running _bt_compare() on
499 * each killed item gets expensive. Just advance over killed
500 * items as quickly as we can. We only apply _bt_compare() when
501 * we get to a non-killed item. We could reuse the bounds to
502 * avoid _bt_compare() calls for known equal tuples, but it
503 * doesn't seem worth it.
504 */
505 if (!inposting)
506 curitemid = PageGetItemId(page, offset);
507 if (inposting || !ItemIdIsDead(curitemid))
508 {
509 ItemPointerData htid;
510 bool all_dead = false;
511
512 if (!inposting)
513 {
514 /* Plain tuple, or first TID in posting list tuple */
515 if (_bt_compare(rel, itup_key, page, offset) != 0)
516 break; /* we're past all the equal tuples */
517
518 /* Advanced curitup */
519 curitup = (IndexTuple) PageGetItem(page, curitemid);
520 Assert(!BTreeTupleIsPivot(curitup));
521 }
522
523 /* okay, we gotta fetch the heap tuple using htid ... */
524 if (!BTreeTupleIsPosting(curitup))
525 {
526 /* ... htid is from simple non-pivot tuple */
527 Assert(!inposting);
528 htid = curitup->t_tid;
529 }
530 else if (!inposting)
531 {
532 /* ... htid is first TID in new posting list */
533 inposting = true;
534 prevalldead = true;
535 curposti = 0;
536 htid = *BTreeTupleGetPostingN(curitup, 0);
537 }
538 else
539 {
540 /* ... htid is second or subsequent TID in posting list */
541 Assert(curposti > 0);
542 htid = *BTreeTupleGetPostingN(curitup, curposti);
543 }
544
545 /*
546 * If we are doing a recheck, we expect to find the tuple we
547 * are rechecking. It's not a duplicate, but we have to keep
548 * scanning.
549 */
550 if (checkUnique == UNIQUE_CHECK_EXISTING &&
551 ItemPointerCompare(&htid, &itup->t_tid) == 0)
552 {
553 found = true;
554 }
555
556 /*
557 * Check if there's any table tuples for this index entry
558 * satisfying SnapshotDirty. This is necessary because for AMs
559 * with optimizations like heap's HOT, we have just a single
560 * index entry for the entire chain.
561 */
562 else if (table_index_fetch_tuple_check(heapRel, &htid,
563 &SnapshotDirty,
564 &all_dead))
565 {
566 TransactionId xwait;
567
568 /*
569 * It is a duplicate. If we are only doing a partial
570 * check, then don't bother checking if the tuple is being
571 * updated in another transaction. Just return the fact
572 * that it is a potential conflict and leave the full
573 * check till later. Don't invalidate binary search
574 * bounds.
575 */
576 if (checkUnique == UNIQUE_CHECK_PARTIAL)
577 {
578 if (nbuf != InvalidBuffer)
579 _bt_relbuf(rel, nbuf);
580 *is_unique = false;
581 return InvalidTransactionId;
582 }
583
584 /*
585 * If this tuple is being updated by other transaction
586 * then we have to wait for its commit/abort.
587 */
588 xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
589 SnapshotDirty.xmin : SnapshotDirty.xmax;
590
591 if (TransactionIdIsValid(xwait))
592 {
593 if (nbuf != InvalidBuffer)
594 _bt_relbuf(rel, nbuf);
595 /* Tell _bt_doinsert to wait... */
596 *speculativeToken = SnapshotDirty.speculativeToken;
597 /* Caller releases lock on buf immediately */
598 insertstate->bounds_valid = false;
599 return xwait;
600 }
601
602 /*
603 * Otherwise we have a definite conflict. But before
604 * complaining, look to see if the tuple we want to insert
605 * is itself now committed dead --- if so, don't complain.
606 * This is a waste of time in normal scenarios but we must
607 * do it to support CREATE INDEX CONCURRENTLY.
608 *
609 * We must follow HOT-chains here because during
610 * concurrent index build, we insert the root TID though
611 * the actual tuple may be somewhere in the HOT-chain.
612 * While following the chain we might not stop at the
613 * exact tuple which triggered the insert, but that's OK
614 * because if we find a live tuple anywhere in this chain,
615 * we have a unique key conflict. The other live tuple is
616 * not part of this chain because it had a different index
617 * entry.
618 */
619 htid = itup->t_tid;
620 if (table_index_fetch_tuple_check(heapRel, &htid,
621 SnapshotSelf, NULL))
622 {
623 /* Normal case --- it's still live */
624 }
625 else
626 {
627 /*
628 * It's been deleted, so no error, and no need to
629 * continue searching
630 */
631 break;
632 }
633
634 /*
635 * Check for a conflict-in as we would if we were going to
636 * write to this page. We aren't actually going to write,
637 * but we want a chance to report SSI conflicts that would
638 * otherwise be masked by this unique constraint
639 * violation.
640 */
641 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
642
643 /*
644 * This is a definite conflict. Break the tuple down into
645 * datums and report the error. But first, make sure we
646 * release the buffer locks we're holding ---
647 * BuildIndexValueDescription could make catalog accesses,
648 * which in the worst case might touch this same index and
649 * cause deadlocks.
650 */
651 if (nbuf != InvalidBuffer)
652 _bt_relbuf(rel, nbuf);
653 _bt_relbuf(rel, insertstate->buf);
654 insertstate->buf = InvalidBuffer;
655 insertstate->bounds_valid = false;
656
657 {
658 Datum values[INDEX_MAX_KEYS];
659 bool isnull[INDEX_MAX_KEYS];
660 char *key_desc;
661
662 index_deform_tuple(itup, RelationGetDescr(rel),
663 values, isnull);
664
665 key_desc = BuildIndexValueDescription(rel, values,
666 isnull);
667
668 ereport(ERROR,
669 (errcode(ERRCODE_UNIQUE_VIOLATION),
670 errmsg("duplicate key value violates unique constraint \"%s\"",
671 RelationGetRelationName(rel)),
672 key_desc ? errdetail("Key %s already exists.",
673 key_desc) : 0,
674 errtableconstraint(heapRel,
675 RelationGetRelationName(rel))));
676 }
677 }
678 else if (all_dead && (!inposting ||
679 (prevalldead &&
680 curposti == BTreeTupleGetNPosting(curitup) - 1)))
681 {
682 /*
683 * The conflicting tuple (or all HOT chains pointed to by
684 * all posting list TIDs) is dead to everyone, so mark the
685 * index entry killed.
686 */
687 ItemIdMarkDead(curitemid);
688 opaque->btpo_flags |= BTP_HAS_GARBAGE;
689
690 /*
691 * Mark buffer with a dirty hint, since state is not
692 * crucial. Be sure to mark the proper buffer dirty.
693 */
694 if (nbuf != InvalidBuffer)
695 MarkBufferDirtyHint(nbuf, true);
696 else
697 MarkBufferDirtyHint(insertstate->buf, true);
698 }
699
700 /*
701 * Remember if posting list tuple has even a single HOT chain
702 * whose members are not all dead
703 */
704 if (!all_dead && inposting)
705 prevalldead = false;
706 }
707 }
708
709 if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
710 {
711 /* Advance to next TID in same posting list */
712 curposti++;
713 continue;
714 }
715 else if (offset < maxoff)
716 {
717 /* Advance to next tuple */
718 curposti = 0;
719 inposting = false;
720 offset = OffsetNumberNext(offset);
721 }
722 else
723 {
724 int highkeycmp;
725
726 /* If scankey == hikey we gotta check the next page too */
727 if (P_RIGHTMOST(opaque))
728 break;
729 highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
730 Assert(highkeycmp <= 0);
731 if (highkeycmp != 0)
732 break;
733 /* Advance to next non-dead page --- there must be one */
734 for (;;)
735 {
736 BlockNumber nblkno = opaque->btpo_next;
737
738 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
739 page = BufferGetPage(nbuf);
740 opaque = BTPageGetOpaque(page);
741 if (!P_IGNORE(opaque))
742 break;
743 if (P_RIGHTMOST(opaque))
744 elog(ERROR, "fell off the end of index \"%s\"",
745 RelationGetRelationName(rel));
746 }
747 /* Will also advance to next tuple */
748 curposti = 0;
749 inposting = false;
750 maxoff = PageGetMaxOffsetNumber(page);
751 offset = P_FIRSTDATAKEY(opaque);
752 /* Don't invalidate binary search bounds */
753 }
754 }
755
756 /*
757 * If we are doing a recheck then we should have found the tuple we are
758 * checking. Otherwise there's something very wrong --- probably, the
759 * index is on a non-immutable expression.
760 */
761 if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
762 ereport(ERROR,
763 (errcode(ERRCODE_INTERNAL_ERROR),
764 errmsg("failed to re-find tuple within index \"%s\"",
765 RelationGetRelationName(rel)),
766 errhint("This may be because of a non-immutable index expression."),
767 errtableconstraint(heapRel,
768 RelationGetRelationName(rel))));
769
770 if (nbuf != InvalidBuffer)
771 _bt_relbuf(rel, nbuf);
772
773 return InvalidTransactionId;
774}
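/*
 * [Editor's note -- not part of nbtinsert.c] The heart of the conflict check
 * above is the dirty-snapshot probe into the table AM.  A minimal sketch of
 * that pattern (illustration only; variable names are local to this note):
 *
 *	SnapshotData SnapshotDirty;
 *	bool		all_dead;
 *
 *	InitDirtySnapshot(SnapshotDirty);
 *	if (table_index_fetch_tuple_check(heapRel, &htid, &SnapshotDirty,
 *									  &all_dead))
 *	{
 *		// some (possibly uncommitted) heap tuple matches: either report a
 *		// unique violation, or wait on SnapshotDirty.xmin/xmax first
 *	}
 *	else if (all_dead)
 *	{
 *		// every heap tuple behind this index entry is dead to all
 *		// transactions, so the entry can be marked LP_DEAD
 *	}
 */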
775
776
777/*
778 * _bt_findinsertloc() -- Finds an insert location for a tuple
779 *
780 * On entry, insertstate buffer contains the page the new tuple belongs
781 * on. It is exclusive-locked and pinned by the caller.
782 *
783 * If 'checkingunique' is true, the buffer on entry is the first page
784 * that contains duplicates of the new key. If there are duplicates on
785 * multiple pages, the correct insertion position might be some page to
786 * the right, rather than the first page. In that case, this function
787 * moves right to the correct target page.
788 *
789 * (In a !heapkeyspace index, there can be multiple pages with the same
790 * high key, where the new tuple could legitimately be placed on. In
791 * that case, the caller passes the first page containing duplicates,
792 * just like when checkingunique=true. If that page doesn't have enough
793 * room for the new tuple, this function moves right, trying to find a
794 * legal page that does.)
795 *
796 * If 'indexUnchanged' is true, this is for an UPDATE that didn't
797 * logically change the indexed value, but must nevertheless have a new
798 * entry to point to a successor version. This hint from the executor
799 * will influence our behavior when the page might have to be split and
800 * we must consider our options. Bottom-up index deletion can avoid
801 * pathological version-driven page splits, but we only want to go to the
802 * trouble of trying it when we already have moderate confidence that
803 * it's appropriate. The hint should not significantly affect our
804 * behavior over time unless practically all inserts on to the leaf page
805 * get the hint.
806 *
807 * On exit, insertstate buffer contains the chosen insertion page, and
808 * the offset within that page is returned. If _bt_findinsertloc needed
809 * to move right, the lock and pin on the original page are released, and
810 * the new buffer is exclusively locked and pinned instead.
811 *
812 * If insertstate contains cached binary search bounds, we will take
813 * advantage of them. This avoids repeating comparisons that we made in
814 * _bt_check_unique() already.
815 */
816static OffsetNumber
817_bt_findinsertloc(Relation rel,
818 BTInsertState insertstate,
819 bool checkingunique,
820 bool indexUnchanged,
821 BTStack stack,
822 Relation heapRel)
823{
824 BTScanInsert itup_key = insertstate->itup_key;
825 Page page = BufferGetPage(insertstate->buf);
826 BTPageOpaque opaque;
827 OffsetNumber newitemoff;
828
829 opaque = BTPageGetOpaque(page);
830
831 /* Check 1/3 of a page restriction */
832 if (unlikely(insertstate->itemsz > BTMaxItemSize))
833 _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
834 insertstate->itup);
835
836 Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
837 Assert(!insertstate->bounds_valid || checkingunique);
838 Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
839 Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
840 Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
841
842 if (itup_key->heapkeyspace)
843 {
844 /* Keep track of whether checkingunique duplicate seen */
845 bool uniquedup = indexUnchanged;
846
847 /*
848 * If we're inserting into a unique index, we may have to walk right
849 * through leaf pages to find the one leaf page that we must insert on
850 * to.
851 *
852 * This is needed for checkingunique callers because a scantid was not
853 * used when we called _bt_search(). scantid can only be set after
854 * _bt_check_unique() has checked for duplicates. The buffer
855 * initially stored in insertstate->buf has the page where the first
856 * duplicate key might be found, which isn't always the page that new
857 * tuple belongs on. The heap TID attribute for new tuple (scantid)
858 * could force us to insert on a sibling page, though that should be
859 * very rare in practice.
860 */
861 if (checkingunique)
862 {
863 if (insertstate->low < insertstate->stricthigh)
864 {
865 /* Encountered a duplicate in _bt_check_unique() */
866 Assert(insertstate->bounds_valid);
867 uniquedup = true;
868 }
869
870 for (;;)
871 {
872 /*
873 * Does the new tuple belong on this page?
874 *
875 * The earlier _bt_check_unique() call may well have
876 * established a strict upper bound on the offset for the new
877 * item. If it's not the last item of the page (i.e. if there
878 * is at least one tuple on the page that goes after the tuple
879 * we're inserting) then we know that the tuple belongs on
880 * this page. We can skip the high key check.
881 */
882 if (insertstate->bounds_valid &&
883 insertstate->low <= insertstate->stricthigh &&
884 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
885 break;
886
887 /* Test '<=', not '!=', since scantid is set now */
888 if (P_RIGHTMOST(opaque) ||
889 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
890 break;
891
892 _bt_stepright(rel, heapRel, insertstate, stack);
893 /* Update local state after stepping right */
894 page = BufferGetPage(insertstate->buf);
895 opaque = BTPageGetOpaque(page);
896 /* Assume duplicates (if checkingunique) */
897 uniquedup = true;
898 }
899 }
900
901 /*
902 * If the target page cannot fit newitem, try to avoid splitting the
903 * page on insert by performing deletion or deduplication now
904 */
905 if (PageGetFreeSpace(page) < insertstate->itemsz)
906 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
907 checkingunique, uniquedup,
908 indexUnchanged);
909 }
910 else
911 {
912 /*----------
913 * This is a !heapkeyspace (version 2 or 3) index. The current page
914 * is the first page that we could insert the new tuple to, but there
915 * may be other pages to the right that we could opt to use instead.
916 *
917 * If the new key is equal to one or more existing keys, we can
918 * legitimately place it anywhere in the series of equal keys. In
919 * fact, if the new key is equal to the page's "high key" we can place
920 * it on the next page. If it is equal to the high key, and there's
921 * not room to insert the new tuple on the current page without
922 * splitting, then we move right hoping to find more free space and
923 * avoid a split.
924 *
925 * Keep scanning right until we
926 * (a) find a page with enough free space,
927 * (b) reach the last page where the tuple can legally go, or
928 * (c) get tired of searching.
929 * (c) is not flippant; it is important because if there are many
930 * pages' worth of equal keys, it's better to split one of the early
931 * pages than to scan all the way to the end of the run of equal keys
932 * on every insert. We implement "get tired" as a random choice,
933 * since stopping after scanning a fixed number of pages wouldn't work
934 * well (we'd never reach the right-hand side of previously split
935 * pages). The probability of moving right is set at 0.99, which may
936 * seem too high to change the behavior much, but it does an excellent
937 * job of preventing O(N^2) behavior with many equal keys.
938 *----------
939 */
940 while (PageGetFreeSpace(page) < insertstate->itemsz)
941 {
942 /*
943 * Before considering moving right, see if we can obtain enough
944 * space by erasing LP_DEAD items
945 */
946 if (P_HAS_GARBAGE(opaque))
947 {
948 /* Perform simple deletion */
949 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
950 false, false, false);
951
952 if (PageGetFreeSpace(page) >= insertstate->itemsz)
953 break; /* OK, now we have enough space */
954 }
955
956 /*
957 * Nope, so check conditions (b) and (c) enumerated above
958 *
959 * The earlier _bt_check_unique() call may well have established a
960 * strict upper bound on the offset for the new item. If it's not
961 * the last item of the page (i.e. if there is at least one tuple
962 * on the page that's greater than the tuple we're inserting to)
963 * then we know that the tuple belongs on this page. We can skip
964 * the high key check.
965 */
966 if (insertstate->bounds_valid &&
967 insertstate->low <= insertstate->stricthigh &&
968 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
969 break;
970
971 if (P_RIGHTMOST(opaque) ||
972 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
973 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
974 break;
975
976 _bt_stepright(rel, heapRel, insertstate, stack);
977 /* Update local state after stepping right */
978 page = BufferGetPage(insertstate->buf);
979 opaque = BTPageGetOpaque(page);
980 }
981 }
982
983 /*
984 * We should now be on the correct page. Find the offset within the page
985 * for the new tuple. (Possibly reusing earlier search bounds.)
986 */
987 Assert(P_RIGHTMOST(opaque) ||
988 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
989
990 newitemoff = _bt_binsrch_insert(rel, insertstate);
991
992 if (insertstate->postingoff == -1)
993 {
994 /*
995 * There is an overlapping posting list tuple with its LP_DEAD bit
996 * set. We don't want to unnecessarily unset its LP_DEAD bit while
997 * performing a posting list split, so perform simple index tuple
998 * deletion early.
999 */
1000 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1001 false, false, false);
1002
1003 /*
1004 * Do new binary search. New insert location cannot overlap with any
1005 * posting list now.
1006 */
1007 Assert(!insertstate->bounds_valid);
1008 insertstate->postingoff = 0;
1009 newitemoff = _bt_binsrch_insert(rel, insertstate);
1010 Assert(insertstate->postingoff == 0);
1011 }
1012
1013 return newitemoff;
1014}
1015
1016/*
1017 * Step right to next non-dead page, during insertion.
1018 *
1019 * This is a bit more complicated than moving right in a search. We must
1020 * write-lock the target page before releasing write lock on current page;
1021 * else someone else's _bt_check_unique scan could fail to see our insertion.
1022 * Write locks on intermediate dead pages won't do because we don't know when
1023 * they will get de-linked from the tree.
1024 *
1025 * This is more aggressive than it needs to be for non-unique !heapkeyspace
1026 * indexes.
1027 */
1028static void
1029_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1030 BTStack stack)
1031{
1032 Page page;
1033 BTPageOpaque opaque;
1034 Buffer rbuf;
1035 BlockNumber rblkno;
1036
1037 Assert(heaprel != NULL);
1038 page = BufferGetPage(insertstate->buf);
1039 opaque = BTPageGetOpaque(page);
1040
1041 rbuf = InvalidBuffer;
1042 rblkno = opaque->btpo_next;
1043 for (;;)
1044 {
1045 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1046 page = BufferGetPage(rbuf);
1047 opaque = BTPageGetOpaque(page);
1048
1049 /*
1050 * If this page was incompletely split, finish the split now. We do
1051 * this while holding a lock on the left sibling, which is not good
1052 * because finishing the split could be a fairly lengthy operation.
1053 * But this should happen very seldom.
1054 */
1055 if (P_INCOMPLETE_SPLIT(opaque))
1056 {
1057 _bt_finish_split(rel, heaprel, rbuf, stack);
1058 rbuf = InvalidBuffer;
1059 continue;
1060 }
1061
1062 if (!P_IGNORE(opaque))
1063 break;
1064 if (P_RIGHTMOST(opaque))
1065 elog(ERROR, "fell off the end of index \"%s\"",
1066 RelationGetRelationName(rel));
1067
1068 rblkno = opaque->btpo_next;
1069 }
1070 /* rbuf locked; unlock buf, update state for caller */
1071 _bt_relbuf(rel, insertstate->buf);
1072 insertstate->buf = rbuf;
1073 insertstate->bounds_valid = false;
1074}
1075
1076/*----------
1077 * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1078 *
1079 * This recursive procedure does the following things:
1080 *
1081 * + if postingoff != 0, splits existing posting list tuple
1082 * (since it overlaps with new 'itup' tuple).
1083 * + if necessary, splits the target page, using 'itup_key' for
1084 * suffix truncation on leaf pages (caller passes NULL for
1085 * non-leaf pages).
1086 * + inserts the new tuple (might be split from posting list).
1087 * + if the page was split, pops the parent stack, and finds the
1088 * right place to insert the new child pointer (by walking
1089 * right using information stored in the parent stack).
1090 * + invokes itself with the appropriate tuple for the right
1091 * child page on the parent.
1092 * + updates the metapage if a true root or fast root is split.
1093 *
1094 * On entry, we must have the correct buffer in which to do the
1095 * insertion, and the buffer must be pinned and write-locked. On return,
1096 * we will have dropped both the pin and the lock on the buffer.
1097 *
1098 * This routine only performs retail tuple insertions. 'itup' should
1099 * always be either a non-highkey leaf item, or a downlink (new high
1100 * key items are created indirectly, when a page is split). When
1101 * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1102 * we're inserting the downlink for. This function will clear the
1103 * INCOMPLETE_SPLIT flag on it, and release the buffer.
1104 *----------
1105 */
1106static void
1107_bt_insertonpg(Relation rel,
1108 Relation heaprel,
1109 BTScanInsert itup_key,
1110 Buffer buf,
1111 Buffer cbuf,
1112 BTStack stack,
1113 IndexTuple itup,
1114 Size itemsz,
1115 OffsetNumber newitemoff,
1116 int postingoff,
1117 bool split_only_page)
1118{
1119 Page page;
1120 BTPageOpaque opaque;
1121 bool isleaf,
1122 isroot,
1123 isrightmost,
1124 isonly;
1125 IndexTuple oposting = NULL;
1126 IndexTuple origitup = NULL;
1127 IndexTuple nposting = NULL;
1128
1129 page = BufferGetPage(buf);
1130 opaque = BTPageGetOpaque(page);
1131 isleaf = P_ISLEAF(opaque);
1132 isroot = P_ISROOT(opaque);
1133 isrightmost = P_RIGHTMOST(opaque);
1134 isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1135
1136 /* child buffer must be given iff inserting on an internal page */
1137 Assert(isleaf == !BufferIsValid(cbuf));
1138 /* tuple must have appropriate number of attributes */
1139 Assert(!isleaf ||
1140 BTreeTupleGetNAtts(itup, rel) ==
1141 IndexRelationGetNumberOfAttributes(rel));
1142 Assert(isleaf ||
1143 BTreeTupleGetNAtts(itup, rel) <=
1144 IndexRelationGetNumberOfKeyAttributes(rel));
1145 Assert(!BTreeTupleIsPosting(itup));
1146 Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1147 /* Caller must always finish incomplete split for us */
1148 Assert(!P_INCOMPLETE_SPLIT(opaque));
1149
1150 /*
1151 * Every internal page should have exactly one negative infinity item at
1152 * all times. Only _bt_split() and _bt_newlevel() should add items that
1153 * become negative infinity items through truncation, since they're the
1154 * only routines that allocate new internal pages.
1155 */
1156 Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1157
1158 /*
1159 * Do we need to split an existing posting list item?
1160 */
1161 if (postingoff != 0)
1162 {
1163 ItemId itemid = PageGetItemId(page, newitemoff);
1164
1165 /*
1166 * The new tuple is a duplicate with a heap TID that falls inside the
1167 * range of an existing posting list tuple on a leaf page. Prepare to
1168 * split an existing posting list. Overwriting the posting list with
1169 * its post-split version is treated as an extra step in either the
1170 * insert or page split critical section.
1171 */
1172 Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1173 oposting = (IndexTuple) PageGetItem(page, itemid);
1174
1175 /*
1176 * postingoff value comes from earlier call to _bt_binsrch_posting().
1177 * Its binary search might think that a plain tuple must be a posting
1178 * list tuple that needs to be split. This can happen with corruption
1179 * involving an existing plain tuple that is a duplicate of the new
1180 * item, up to and including its table TID. Check for that here in
1181 * passing.
1182 *
1183 * Also verify that our caller has made sure that the existing posting
1184 * list tuple does not have its LP_DEAD bit set.
1185 */
1186 if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1187 ereport(ERROR,
1188 (errcode(ERRCODE_INDEX_CORRUPTED),
1189 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1190 ItemPointerGetBlockNumber(&itup->t_tid),
1191 ItemPointerGetOffsetNumber(&itup->t_tid),
1192 newitemoff, BufferGetBlockNumber(buf),
1193 RelationGetRelationName(rel))));
1194
1195 /* use a mutable copy of itup as our itup from here on */
1196 origitup = itup;
1197 itup = CopyIndexTuple(origitup);
1198 nposting = _bt_swap_posting(itup, oposting, postingoff);
1199 /* itup now contains rightmost/max TID from oposting */
1200
1201 /* Alter offset so that newitem goes after posting list */
1202 newitemoff = OffsetNumberNext(newitemoff);
1203 }
1204
1205 /*
1206 * Do we need to split the page to fit the item on it?
1207 *
1208 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1209 * so this comparison is correct even though we appear to be accounting
1210 * only for the item and not for its line pointer.
1211 */
1212 if (PageGetFreeSpace(page) < itemsz)
1213 {
1214 Buffer rbuf;
1215
1216 Assert(!split_only_page);
1217
1218 /* split the buffer into left and right halves */
1219 rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1220 itup, origitup, nposting, postingoff);
1221 PredicateLockPageSplit(rel,
1222 BufferGetBlockNumber(buf),
1223 BufferGetBlockNumber(rbuf));
1224
1225 /*----------
1226 * By here,
1227 *
1228 * + our target page has been split;
1229 * + the original tuple has been inserted;
1230 * + we have write locks on both the old (left half)
1231 * and new (right half) buffers, after the split; and
1232 * + we know the key we want to insert into the parent
1233 * (it's the "high key" on the left child page).
1234 *
1235 * We're ready to do the parent insertion. We need to hold onto the
1236 * locks for the child pages until we locate the parent, but we can
1237 * at least release the lock on the right child before doing the
1238 * actual insertion. The lock on the left child will be released
1239 * last of all by parent insertion, where it is the 'cbuf' of parent
1240 * page.
1241 *----------
1242 */
1243#ifdef USE_INJECTION_POINTS
1244 if (P_ISLEAF(opaque))
1245 INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
1246 else
1247 INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
1248#endif
1249
1250 _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1251 }
1252 else
1253 {
1254 Buffer metabuf = InvalidBuffer;
1255 Page metapg = NULL;
1256 BTMetaPageData *metad = NULL;
1257 BlockNumber blockcache;
1258
1259 /*
1260 * If we are doing this insert because we split a page that was the
1261 * only one on its tree level, but was not the root, it may have been
1262 * the "fast root". We need to ensure that the fast root link points
1263 * at or above the current page. We can safely acquire a lock on the
1264 * metapage here --- see comments for _bt_newlevel().
1265 */
1266 if (unlikely(split_only_page))
1267 {
1268 Assert(!isleaf);
1269 Assert(BufferIsValid(cbuf));
1270
1271 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1272 metapg = BufferGetPage(metabuf);
1273 metad = BTPageGetMeta(metapg);
1274
1275 if (metad->btm_fastlevel >= opaque->btpo_level)
1276 {
1277 /* no update wanted */
1278 _bt_relbuf(rel, metabuf);
1279 metabuf = InvalidBuffer;
1280 }
1281 }
1282
1283 /* Do the update. No ereport(ERROR) until changes are logged */
1284 START_CRIT_SECTION();
1285
1286 if (postingoff != 0)
1287 memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1288
1289 if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
1290 elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1291 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1292
1293 MarkBufferDirty(buf);
1294
1295 if (BufferIsValid(metabuf))
1296 {
1297 /* upgrade meta-page if needed */
1298 if (metad->btm_version < BTREE_NOVAC_VERSION)
1299 _bt_upgrademetapage(metapg);
1300 metad->btm_fastroot = BufferGetBlockNumber(buf);
1301 metad->btm_fastlevel = opaque->btpo_level;
1302 MarkBufferDirty(metabuf);
1303 }
1304
1305 /*
1306 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1307 * finishes a split
1308 */
1309 if (!isleaf)
1310 {
1311 Page cpage = BufferGetPage(cbuf);
1312 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1313
1314 Assert(P_INCOMPLETE_SPLIT(cpageop));
1315 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1316 MarkBufferDirty(cbuf);
1317 }
1318
1319 /* XLOG stuff */
1320 if (RelationNeedsWAL(rel))
1321 {
1322 xl_btree_insert xlrec;
1323 xl_btree_metadata xlmeta;
1324 uint8 xlinfo;
1325 XLogRecPtr recptr;
1326 uint16 upostingoff;
1327
1328 xlrec.offnum = newitemoff;
1329
1330 XLogBeginInsert();
1331 XLogRegisterData(&xlrec, SizeOfBtreeInsert);
1332
1333 if (isleaf && postingoff == 0)
1334 {
1335 /* Simple leaf insert */
1336 xlinfo = XLOG_BTREE_INSERT_LEAF;
1337 }
1338 else if (postingoff != 0)
1339 {
1340 /*
1341 * Leaf insert with posting list split. Must include
1342 * postingoff field before newitem/orignewitem.
1343 */
1344 Assert(isleaf);
1345 xlinfo = XLOG_BTREE_INSERT_POST;
1346 }
1347 else
1348 {
1349 /* Internal page insert, which finishes a split on cbuf */
1350 xlinfo = XLOG_BTREE_INSERT_UPPER;
1351 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1352
1353 if (BufferIsValid(metabuf))
1354 {
1355 /* Actually, it's an internal page insert + meta update */
1356 xlinfo = XLOG_BTREE_INSERT_META;
1357
1358 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1359 xlmeta.version = metad->btm_version;
1360 xlmeta.root = metad->btm_root;
1361 xlmeta.level = metad->btm_level;
1362 xlmeta.fastroot = metad->btm_fastroot;
1363 xlmeta.fastlevel = metad->btm_fastlevel;
1364 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1365 xlmeta.allequalimage = metad->btm_allequalimage;
1366
1367 XLogRegisterBuffer(2, metabuf,
1368 REGBUF_WILL_INIT | REGBUF_STANDARD);
1369 XLogRegisterBufData(2, &xlmeta,
1370 sizeof(xl_btree_metadata));
1371 }
1372 }
1373
1374 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1375 if (postingoff == 0)
1376 {
1377 /* Just log itup from caller */
1378 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1379 }
1380 else
1381 {
1382 /*
1383 * Insert with posting list split (XLOG_BTREE_INSERT_POST
1384 * record) case.
1385 *
1386 * Log postingoff. Also log origitup, not itup. REDO routine
1387 * must reconstruct final itup (as well as nposting) using
1388 * _bt_swap_posting().
1389 */
1390 upostingoff = postingoff;
1391
1392 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
1393 XLogRegisterBufData(0, origitup,
1394 IndexTupleSize(origitup));
1395 }
1396
1397 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1398
1399 if (BufferIsValid(metabuf))
1400 PageSetLSN(metapg, recptr);
1401 if (!isleaf)
1402 PageSetLSN(BufferGetPage(cbuf), recptr);
1403
1404 PageSetLSN(page, recptr);
1405 }
1406
1407 END_CRIT_SECTION();
1408
1409 /* Release subsidiary buffers */
1410 if (BufferIsValid(metabuf))
1411 _bt_relbuf(rel, metabuf);
1412 if (!isleaf)
1413 _bt_relbuf(rel, cbuf);
1414
1415 /*
1416 * Cache the block number if this is the rightmost leaf page. Cache
1417 * may be used by a future inserter within _bt_search_insert().
1418 */
1419 blockcache = InvalidBlockNumber;
1420 if (isrightmost && isleaf && !isroot)
1421 blockcache = BufferGetBlockNumber(buf);
1422
1423 /* Release buffer for insertion target block */
1424 _bt_relbuf(rel, buf);
1425
1426 /*
1427 * If we decided to cache the insertion target block before releasing
1428 * its buffer lock, then cache it now. Check the height of the tree
1429 * first, though. We don't go for the optimization with small
1430 * indexes. Defer final check to this point to ensure that we don't
1431 * call _bt_getrootheight while holding a buffer lock.
1432 */
1433 if (BlockNumberIsValid(blockcache) &&
1434 _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1435 RelationSetTargetBlock(rel, blockcache);
1436 }
1437
1438 /* be tidy */
1439 if (postingoff != 0)
1440 {
1441 /* itup is actually a modified copy of caller's original */
1442 pfree(nposting);
1443 pfree(itup);
1444 }
1445}
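/*
 * [Editor's note -- not part of nbtinsert.c] Summary of the WAL record built
 * by the non-split path above, restated from the code rather than from the
 * WAL format documentation:
 *
 *	main data:	xl_btree_insert, i.e. just the insert offset (offnum)
 *	block 0:	target page; registered data is itup, or a uint16 postingoff
 *				followed by origitup when a posting list split piggybacks
 *	block 1:	child page whose INCOMPLETE_SPLIT flag this insert clears
 *				(internal-page inserts only)
 *	block 2:	metapage plus xl_btree_metadata, when the fast root moves
 *	info:		XLOG_BTREE_INSERT_LEAF / _POST / _UPPER / _META
 */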
1446
1447/*
1448 * _bt_split() -- split a page in the btree.
1449 *
1450 * On entry, buf is the page to split, and is pinned and write-locked.
1451 * newitemoff etc. tell us about the new item that must be inserted
1452 * along with the data from the original page.
1453 *
1454 * itup_key is used for suffix truncation on leaf pages (internal
1455 * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1456 * is the left-sibling of the page we're inserting the downlink for.
1457 * This function will clear the INCOMPLETE_SPLIT flag on it, and
1458 * release the buffer.
1459 *
1460 * orignewitem, nposting, and postingoff are needed when an insert of
1461 * orignewitem results in both a posting list split and a page split.
1462 * These extra posting list split details are used here in the same
1463 * way as they are used in the more common case where a posting list
1464 * split does not coincide with a page split. We need to deal with
1465 * posting list splits directly in order to ensure that everything
1466 * that follows from the insert of orignewitem is handled as a single
1467 * atomic operation (though caller's insert of a new pivot/downlink
1468 * into parent page will still be a separate operation). See
1469 * nbtree/README for details on the design of posting list splits.
1470 *
1471 * Returns the new right sibling of buf, pinned and write-locked.
1472 * The pin and lock on buf are maintained.
1473 */
1474static Buffer
1475_bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1476 Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1477 IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1478{
1479 Buffer rbuf;
1480 Page origpage;
1481 Page leftpage,
1482 rightpage;
1483 PGAlignedBlock leftpage_buf,
1484 rightpage_buf;
1485 BlockNumber origpagenumber,
1486 rightpagenumber;
1487 BTPageOpaque ropaque,
1488 lopaque,
1489 oopaque;
1490 Buffer sbuf = InvalidBuffer;
1491 Page spage = NULL;
1492 BTPageOpaque sopaque = NULL;
1493 Size itemsz;
1494 ItemId itemid;
1495 IndexTuple firstright,
1496 lefthighkey;
1497 OffsetNumber firstrightoff;
1498 OffsetNumber afterleftoff,
1499 afterrightoff,
1500 minusinfoff;
1501 OffsetNumber origpagepostingoff;
1502 OffsetNumber maxoff;
1503 OffsetNumber i;
1504 bool newitemonleft,
1505 isleaf,
1506 isrightmost;
1507
1508 /*
1509 * origpage is the original page to be split. leftpage is a temporary
1510 * buffer that receives the left-sibling data, which will be copied back
1511 * into origpage on success. rightpage is the new page that will receive
1512 * the right-sibling data.
1513 *
1514 * leftpage is allocated after choosing a split point. rightpage's new
1515 * buffer isn't acquired until after leftpage is initialized and has new
1516 * high key, the last point where splitting the page may fail (barring
1517 * corruption). Failing before acquiring new buffer won't have lasting
1518 * consequences, since origpage won't have been modified and leftpage is
1519 * only workspace.
1520 */
1521 origpage = BufferGetPage(buf);
1522 oopaque = BTPageGetOpaque(origpage);
1523 isleaf = P_ISLEAF(oopaque);
1524 isrightmost = P_RIGHTMOST(oopaque);
1525 maxoff = PageGetMaxOffsetNumber(origpage);
1526 origpagenumber = BufferGetBlockNumber(buf);
1527
1528 /*
1529 * Choose a point to split origpage at.
1530 *
1531 * A split point can be thought of as a point _between_ two existing data
1532 * items on origpage (the lastleft and firstright tuples), provided you
1533 * pretend that the new item that didn't fit is already on origpage.
1534 *
1535 * Since origpage does not actually contain newitem, the representation of
1536 * split points needs to work with two boundary cases: splits where
1537 * newitem is lastleft, and splits where newitem is firstright.
1538 * newitemonleft resolves the ambiguity that would otherwise exist when
1539 * newitemoff == firstrightoff. In all other cases it's clear which side
1540 * of the split every tuple goes on from context. newitemonleft is
1541 * usually (but not always) redundant information.
1542 *
1543 * firstrightoff is supposed to be an origpage offset number, but it's
1544 * possible that its value will be maxoff+1, which is "past the end" of
1545 * origpage. This happens in the rare case where newitem goes after all
1546 * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1547 * origpage at the point that leaves newitem alone on new right page. Any
1548 * "!newitemonleft && newitemoff == firstrightoff" split point makes
1549 * newitem the firstright tuple, though, so this case isn't a special
1550 * case.
1551 */
1552 firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1553 newitem, &newitemonleft);
1554
1555 /* Use temporary buffer for leftpage */
1556 leftpage = leftpage_buf.data;
1557 _bt_pageinit(leftpage, BufferGetPageSize(buf));
1558 lopaque = BTPageGetOpaque(leftpage);
1559
1560 /*
1561 * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1562 * and HAS_GARBAGE flags.
1563 */
1564 lopaque->btpo_flags = oopaque->btpo_flags;
1565 lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1566 /* set flag in leftpage indicating that rightpage has no downlink yet */
1567 lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1568 lopaque->btpo_prev = oopaque->btpo_prev;
1569 /* handle btpo_next after rightpage buffer acquired */
1570 lopaque->btpo_level = oopaque->btpo_level;
1571 /* handle btpo_cycleid after rightpage buffer acquired */
1572
1573 /*
1574 * Copy the original page's LSN into leftpage, which will become the
1575 * updated version of the page. We need this because XLogInsert will
1576 * examine the LSN and possibly dump it in a page image.
1577 */
1578 PageSetLSN(leftpage, PageGetLSN(origpage));
1579
1580 /*
1581 * Determine page offset number of existing overlapped-with-orignewitem
1582 * posting list when it is necessary to perform a posting list split in
1583 * passing. Note that newitem was already changed by caller (newitem no
1584 * longer has the orignewitem TID).
1585 *
1586 * This page offset number (origpagepostingoff) will be used to pretend
1587 * that the posting split has already taken place, even though the
1588 * required modifications to origpage won't occur until we reach the
1589 * critical section. The lastleft and firstright tuples of our page split
1590 * point should, in effect, come from an imaginary version of origpage
1591 * that has the nposting tuple instead of the original posting list tuple.
1592 *
1593 * Note: _bt_findsplitloc() should have compensated for coinciding posting
1594 * list splits in just the same way, at least in theory. It doesn't
1595 * bother with that, though. In practice it won't affect its choice of
1596 * split point.
1597 */
1598 origpagepostingoff = InvalidOffsetNumber;
1599 if (postingoff != 0)
1600 {
1601 Assert(isleaf);
1602 Assert(ItemPointerCompare(&orignewitem->t_tid,
1603 &newitem->t_tid) < 0);
1604 Assert(BTreeTupleIsPosting(nposting));
1605 origpagepostingoff = OffsetNumberPrev(newitemoff);
1606 }
1607
1608 /*
1609 * The high key for the new left page is a possibly-truncated copy of
1610 * firstright on the leaf level (it's "firstright itself" on internal
1611 * pages; see !isleaf comments below). This may seem to be contrary to
1612 * Lehman & Yao's approach of using a copy of lastleft as the new high key
1613 * when splitting on the leaf level. It isn't, though.
1614 *
1615 * Suffix truncation will leave the left page's high key fully equal to
1616 * lastleft when lastleft and firstright are equal prior to heap TID (that
1617 * is, the tiebreaker TID value comes from lastleft). It isn't actually
1618 * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1619 * "subtree" invariant to hold. It's sufficient to make sure that the new
1620 * leaf high key is strictly less than firstright, and greater than or
1621 * equal to (not necessarily equal to) lastleft. In other words, when
1622 * suffix truncation isn't possible during a leaf page split, we take
1623 * L&Y's exact approach to generating a new high key for the left page.
1624 * (Actually, that is slightly inaccurate. We don't just use a copy of
1625 * lastleft. A tuple with all the keys from firstright but the max heap
1626 * TID from lastleft is used, to avoid introducing a special case.)
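 *
 * Concretely, for an index on (country, city, street): with
 * lastleft = ('USA', 'Chicago', 'Z Ave') and
 * firstright = ('USA', 'Dallas', 'A Ave'), the new left high key is
 * ('USA', 'Dallas'), with 'street' and the heap TID truncated away.
 * When lastleft and firstright are equal on all three attributes,
 * nothing can be truncated, and the high key keeps every attribute plus
 * lastleft's heap TID as an explicit tiebreaker.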
1627 */
1628 if (!newitemonleft && newitemoff == firstrightoff)
1629 {
1630 /* incoming tuple becomes firstright */
1631 itemsz = newitemsz;
1632 firstright = newitem;
1633 }
1634 else
1635 {
1636 /* existing item at firstrightoff becomes firstright */
1637 itemid = PageGetItemId(origpage, firstrightoff);
1638 itemsz = ItemIdGetLength(itemid);
1639 firstright = (IndexTuple) PageGetItem(origpage, itemid);
1640 if (firstrightoff == origpagepostingoff)
1641 firstright = nposting;
1642 }
1643
1644 if (isleaf)
1645 {
1646 IndexTuple lastleft;
1647
1648 /* Attempt suffix truncation for leaf page splits */
1649 if (newitemonleft && newitemoff == firstrightoff)
1650 {
1651 /* incoming tuple becomes lastleft */
1652 lastleft = newitem;
1653 }
1654 else
1655 {
1656 OffsetNumber lastleftoff;
1657
1658 /* existing item before firstrightoff becomes lastleft */
1659 lastleftoff = OffsetNumberPrev(firstrightoff);
1660 Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1661 itemid = PageGetItemId(origpage, lastleftoff);
1662 lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1663 if (lastleftoff == origpagepostingoff)
1664 lastleft = nposting;
1665 }
1666
1667 lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1668 itemsz = IndexTupleSize(lefthighkey);
1669 }
1670 else
1671 {
1672 /*
1673 * Don't perform suffix truncation on a copy of firstright to make
1674 * left page high key for internal page splits. Must use firstright
1675 * as new high key directly.
1676 *
1677 * Each distinct separator key value originates as a leaf level high
1678 * key; all other separator keys/pivot tuples are copied from one
1679 * level down. A separator key in a grandparent page must be
1680 * identical to high key in rightmost parent page of the subtree to
1681 * its left, which must itself be identical to high key in rightmost
1682 * child page of that same subtree (this even applies to separator
1683 * from grandparent's high key). There must always be an unbroken
1684 * "seam" of identical separator keys that guide index scans at every
1685 * level, starting from the grandparent. That's why suffix truncation
1686 * is unsafe here.
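 *
 * (For example, a separator value that first appeared as some leaf
 * page's high key may later be copied, unchanged, one level at a time:
 * into a parent page as a separator or high key, and from there into
 * the grandparent.  Every copy along that seam must remain
 * byte-for-byte identical.)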
1687 *
1688 * Internal page splits will truncate firstright into a "negative
1689 * infinity" data item when it gets inserted on the new right page
1690 * below, though. This happens during the call to _bt_pgaddtup() for
1691 * the new first data item for right page. Do not confuse this
1692 * mechanism with suffix truncation. It is just a convenient way of
1693 * implementing page splits that split the internal page "inside"
1694 * firstright. The lefthighkey separator key cannot appear a second
1695 * time in the right page (only firstright's downlink goes in right
1696 * page).
1697 */
1698 lefthighkey = firstright;
1699 }
1700
1701 /*
1702 * Add new high key to leftpage
1703 */
1704 afterleftoff = P_HIKEY;
1705
1706 Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1707 Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1708 IndexRelationGetNumberOfKeyAttributes(rel));
1709 Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1710 if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
1711 elog(ERROR, "failed to add high key to the left sibling"
1712 " while splitting block %u of index \"%s\"",
1713 origpagenumber, RelationGetRelationName(rel));
1714 afterleftoff = OffsetNumberNext(afterleftoff);
1715
1716 /*
1717 * Acquire a new right page to split into, now that left page has a new
1718 * high key.
1719 *
1720 * To not confuse future VACUUM operations, we zero the right page and
1721 * work on an in-memory copy of it before writing WAL, then copy its
1722 * contents back to the actual page once we start the critical section
1723 * work. This simplifies the split work, since there is no need to zero
1724 * the right page before throwing an error.
1725 */
1726 rbuf = _bt_allocbuf(rel, heaprel);
1727 rightpage = rightpage_buf.data;
1728
1729 /*
1730 * Copy the contents of the right page into its temporary location, and
1731 * zero the original space.
1732 */
1733 memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1734 memset(BufferGetPage(rbuf), 0, BLCKSZ);
1735 rightpagenumber = BufferGetBlockNumber(rbuf);
1736 /* rightpage was initialized by _bt_allocbuf */
1737 ropaque = BTPageGetOpaque(rightpage);
1738
1739 /*
1740 * Finish off remaining leftpage special area fields. They cannot be set
1741 * before both origpage (leftpage) and rightpage buffers are acquired and
1742 * locked.
1743 *
1744 * btpo_cycleid is only used with leaf pages, though we set it here in all
1745 * cases just to be consistent.
1746 */
1747 lopaque->btpo_next = rightpagenumber;
1748 lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1749
1750 /*
1751 * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1752 * and HAS_GARBAGE flags.
1753 */
1754 ropaque->btpo_flags = oopaque->btpo_flags;
1755 ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1756 ropaque->btpo_prev = origpagenumber;
1757 ropaque->btpo_next = oopaque->btpo_next;
1758 ropaque->btpo_level = oopaque->btpo_level;
1759 ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1760
1761 /*
1762 * Add new high key to rightpage where necessary.
1763 *
1764 * If the page we're splitting is not the rightmost page at its level in
1765 * the tree, then the first entry on the page is the high key from
1766 * origpage.
1767 */
1768 afterrightoff = P_HIKEY;
1769
1770 if (!isrightmost)
1771 {
1772 IndexTuple righthighkey;
1773
1774 itemid = PageGetItemId(origpage, P_HIKEY);
1775 itemsz = ItemIdGetLength(itemid);
1776 righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1777 Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1778 Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1779 IndexRelationGetNumberOfKeyAttributes(rel));
1780 if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
1781 {
1782 elog(ERROR, "failed to add high key to the right sibling"
1783 " while splitting block %u of index \"%s\"",
1784 origpagenumber, RelationGetRelationName(rel));
1785 }
1786 afterrightoff = OffsetNumberNext(afterrightoff);
1787 }
1788
1789 /*
1790 * Internal page splits truncate first data item on right page -- it
1791 * becomes "minus infinity" item for the page. Set this up here.
1792 */
1793 minusinfoff = InvalidOffsetNumber;
1794 if (!isleaf)
1795 minusinfoff = afterrightoff;
1796
1797 /*
1798 * Now transfer all the data items (non-pivot tuples in isleaf case, or
1799 * additional pivot tuples in !isleaf case) to the appropriate page.
1800 *
1801 * Note: we *must* insert at least the right page's items in item-number
1802 * order, for the benefit of _bt_restore_page().
1803 */
1804 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1805 {
1806 IndexTuple dataitem;
1807
1808 itemid = PageGetItemId(origpage, i);
1809 itemsz = ItemIdGetLength(itemid);
1810 dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1811
1812 /* replace original item with nposting due to posting split? */
1813 if (i == origpagepostingoff)
1814 {
1815 Assert(BTreeTupleIsPosting(dataitem));
1816 Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1817 dataitem = nposting;
1818 }
1819
1820 /* does new item belong before this one? */
1821 else if (i == newitemoff)
1822 {
1823 if (newitemonleft)
1824 {
1825 Assert(newitemoff <= firstrightoff);
1826 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1827 false))
1828 {
1829 elog(ERROR, "failed to add new item to the left sibling"
1830 " while splitting block %u of index \"%s\"",
1831 origpagenumber, RelationGetRelationName(rel));
1832 }
1833 afterleftoff = OffsetNumberNext(afterleftoff);
1834 }
1835 else
1836 {
1837 Assert(newitemoff >= firstrightoff);
1838 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1839 afterrightoff == minusinfoff))
1840 {
1841 elog(ERROR, "failed to add new item to the right sibling"
1842 " while splitting block %u of index \"%s\"",
1843 origpagenumber, RelationGetRelationName(rel));
1844 }
1845 afterrightoff = OffsetNumberNext(afterrightoff);
1846 }
1847 }
1848
1849 /* decide which page to put it on */
1850 if (i < firstrightoff)
1851 {
1852 if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1853 {
1854 elog(ERROR, "failed to add old item to the left sibling"
1855 " while splitting block %u of index \"%s\"",
1856 origpagenumber, RelationGetRelationName(rel));
1857 }
1858 afterleftoff = OffsetNumberNext(afterleftoff);
1859 }
1860 else
1861 {
1862 if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1863 afterrightoff == minusinfoff))
1864 {
1865 elog(ERROR, "failed to add old item to the right sibling"
1866 " while splitting block %u of index \"%s\"",
1867 origpagenumber, RelationGetRelationName(rel));
1868 }
1869 afterrightoff = OffsetNumberNext(afterrightoff);
1870 }
1871 }
1872
1873 /* Handle case where newitem goes at the end of rightpage */
1874 if (i <= newitemoff)
1875 {
1876 /*
1877 * Can't have newitemonleft here; that would imply we were told to put
1878 * *everything* on the left page, which cannot fit (if it could, we'd
1879 * not be splitting the page).
1880 */
1881 Assert(!newitemonleft && newitemoff == maxoff + 1);
1882 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1883 afterrightoff == minusinfoff))
1884 {
1885 elog(ERROR, "failed to add new item to the right sibling"
1886 " while splitting block %u of index \"%s\"",
1887 origpagenumber, RelationGetRelationName(rel));
1888 }
1889 afterrightoff = OffsetNumberNext(afterrightoff);
1890 }
1891
1892 /*
1893 * We have to grab the original right sibling (if any) and update its prev
1894 * link. We are guaranteed that this is deadlock-free, since we couple
1895 * the locks in the standard order: left to right.
1896 */
1897 if (!isrightmost)
1898 {
1899 sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1900 spage = BufferGetPage(sbuf);
1901 sopaque = BTPageGetOpaque(spage);
1902 if (sopaque->btpo_prev != origpagenumber)
1903 {
1904 ereport(ERROR,
1905 (errcode(ERRCODE_INDEX_CORRUPTED),
1906 errmsg_internal("right sibling's left-link doesn't match: "
1907 "block %u links to %u instead of expected %u in index \"%s\"",
1908 oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1909 RelationGetRelationName(rel))));
1910 }
1911
1912 /*
1913 * Check to see if we can set the SPLIT_END flag in the right-hand
1914 * split page; this can save some I/O for vacuum since it need not
1915 * proceed to the right sibling. We can set the flag if the right
1916 * sibling has a different cycleid: that means it could not be part of
1917 * a group of pages that were all split off from the same ancestor
1918 * page. If you're confused, imagine that page A splits to A B and
1919 * then again, yielding A C B, while vacuum is in progress. Tuples
1920 * originally in A could now be in either B or C, hence vacuum must
1921 * examine both pages. But if D, our right sibling, has a different
1922 * cycleid then it could not contain any tuples that were in A when
1923 * the vacuum started.
1924 */
1925 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1926 ropaque->btpo_flags |= BTP_SPLIT_END;
1927 }
1928
1929 /*
1930 * Right sibling is locked, new siblings are prepared, but original page
1931 * is not updated yet.
1932 *
1933 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1934 * not starting the critical section till here because we haven't been
1935 * scribbling on the original page yet; see comments above.
1936 */
1937 START_CRIT_SECTION();
1938
1939 /*
1940 * By here, the original data page has been split into two new halves, and
1941 * these are correct. The algorithm requires that the left page never
1942 * move during a split, so we copy the new left page back on top of the
1943 * original. We need to do this before writing the WAL record, so that
1944 * XLogInsert can WAL log an image of the page if necessary.
1945 */
1946 memcpy(origpage, leftpage, BLCKSZ);
1947 /* leftpage, lopaque must not be used below here */
1948
1949 /*
1950 * Move the contents of the right page from its temporary location to the
1951 * destination buffer, before writing the WAL record. Unlike the left
1952 * page, the right page and its opaque area are still needed to complete
1953 * the update of the page, so reinitialize them.
1954 */
1955 rightpage = BufferGetPage(rbuf);
1956 memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1957 ropaque = BTPageGetOpaque(rightpage);
1958
1959 MarkBufferDirty(buf);
1960 MarkBufferDirty(rbuf);
1961
1962 if (!isrightmost)
1963 {
1964 sopaque->btpo_prev = rightpagenumber;
1965 MarkBufferDirty(sbuf);
1966 }
1967
1968 /*
1969 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1970 * a split
1971 */
1972 if (!isleaf)
1973 {
1974 Page cpage = BufferGetPage(cbuf);
1975 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1976
1977 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1978 MarkBufferDirty(cbuf);
1979 }
1980
1981 /* XLOG stuff */
1982 if (RelationNeedsWAL(rel))
1983 {
1984 xl_btree_split xlrec;
1985 uint8 xlinfo;
1986 XLogRecPtr recptr;
1987
1988 xlrec.level = ropaque->btpo_level;
1989 /* See comments below on newitem, orignewitem, and posting lists */
1990 xlrec.firstrightoff = firstrightoff;
1991 xlrec.newitemoff = newitemoff;
1992 xlrec.postingoff = 0;
1993 if (postingoff != 0 && origpagepostingoff < firstrightoff)
1994 xlrec.postingoff = postingoff;
1995
1996 XLogBeginInsert();
1997 XLogRegisterData(&xlrec, SizeOfBtreeSplit);
1998
1999 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
2000 XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
2001 /* Log original right sibling, since we've changed its prev-pointer */
2002 if (!isrightmost)
2003 XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
2004 if (!isleaf)
2005 XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
2006
2007 /*
2008 * Log the new item, if it was inserted on the left page. (If it was
2009 * put on the right page, we don't need to explicitly WAL log it
2010 * because it's included with all the other items on the right page.)
2011 * Show the new item as belonging to the left page buffer, so that it
2012 * is not stored if XLogInsert decides it needs a full-page image of
2013 * the left page. We always store newitemoff in the record, though.
2014 *
2015 * The details are sometimes slightly different for page splits that
2016 * coincide with a posting list split. If both the replacement
2017 * posting list and newitem go on the right page, then we don't need
2018 * to log anything extra, just like the simple !newitemonleft
2019 * no-posting-split case (postingoff is set to zero in the WAL record,
2020 * so recovery doesn't need to process a posting list split at all).
2021 * Otherwise, we set postingoff and log orignewitem instead of
2022 * newitem, despite having actually inserted newitem. REDO routine
2023 * must reconstruct nposting and newitem using _bt_swap_posting().
2024 *
2025 * Note: It's possible that our page split point is the point that
2026 * makes the posting list lastleft and newitem firstright. This is
2027 * the only case where we log orignewitem/newitem despite newitem
2028 * going on the right page. If XLogInsert decides that it can omit
2029 * orignewitem due to logging a full-page image of the left page,
2030 * everything still works out, since recovery only needs to log
2031 * orignewitem for items on the left page (just like the regular
2032 * newitem-logged case).
2033 */
2034 if (newitemonleft && xlrec.postingoff == 0)
2035 XLogRegisterBufData(0, newitem, newitemsz);
2036 else if (xlrec.postingoff != 0)
2037 {
2038 Assert(isleaf);
2039 Assert(newitemonleft || firstrightoff == newitemoff);
2040 Assert(newitemsz == IndexTupleSize(orignewitem));
2041 XLogRegisterBufData(0, orignewitem, newitemsz);
2042 }
2043
2044 /* Log the left page's new high key */
2045 if (!isleaf)
2046 {
2047 /* lefthighkey isn't local copy, get current pointer */
2048 itemid = PageGetItemId(origpage, P_HIKEY);
2049 lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2050 }
2051 XLogRegisterBufData(0, lefthighkey,
2052 MAXALIGN(IndexTupleSize(lefthighkey)));
2053
2054 /*
2055 * Log the contents of the right page in the format understood by
2056 * _bt_restore_page(). The whole right page will be recreated.
2057 *
2058 * Direct access to page is not good but faster - we should implement
2059 * some new func in page API. Note we only store the tuples
2060 * themselves, knowing that they were inserted in item-number order
2061 * and so the line pointers can be reconstructed. See comments for
2062 * _bt_restore_page().
2063 */
2064 XLogRegisterBufData(1,
2065 (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2066 ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2067
2068 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2069 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2070
2071 PageSetLSN(origpage, recptr);
2072 PageSetLSN(rightpage, recptr);
2073 if (!isrightmost)
2074 PageSetLSN(spage, recptr);
2075 if (!isleaf)
2076 PageSetLSN(BufferGetPage(cbuf), recptr);
2077 }
2078
2079 END_CRIT_SECTION();
2080
2081 /* release the old right sibling */
2082 if (!isrightmost)
2083 _bt_relbuf(rel, sbuf);
2084
2085 /* release the child */
2086 if (!isleaf)
2087 _bt_relbuf(rel, cbuf);
2088
2089 /* be tidy */
2090 if (isleaf)
2091 pfree(lefthighkey);
2092
2093 /* split's done */
2094 return rbuf;
2095}
2096
2097/*
2098 * _bt_insert_parent() -- Insert downlink into parent, completing split.
2099 *
2100 * On entry, buf and rbuf are the left and right split pages, which we
2101 * still hold write locks on. Both locks will be released here. We
2102 * release the rbuf lock once we have a write lock on the page that we
2103 * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2104 * The lock on buf is released at the same point as the lock on the parent
2105 * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2106 * atomic operation that completes the split by inserting a new downlink.
2107 *
2108 * stack - stack showing how we got here. Will be NULL when splitting true
2109 * root, or during concurrent root split, where we can be inefficient
2110 * isroot - we split the true root
2111 * isonly - we split a page alone on its level (might have been fast root)
2112 */
2113static void
2114 _bt_insert_parent(Relation rel,
2115 Relation heaprel,
2116 Buffer buf,
2117 Buffer rbuf,
2118 BTStack stack,
2119 bool isroot,
2120 bool isonly)
2121{
2122 Assert(heaprel != NULL);
2123
2124 /*
2125 * Here we have to do something Lehman and Yao don't talk about: deal with
2126 * a root split and construction of a new root. If our stack is empty
2127 * then we have just split a node on what had been the root level when we
2128 * descended the tree. If it was still the root then we perform a
2129 * new-root construction. If it *wasn't* the root anymore, search to find
2130 * the next higher level that someone constructed meanwhile, and find the
2131 * right place to insert as for the normal case.
2132 *
2133 * If we have to search for the parent level, we do so by re-descending
2134 * from the root. This is not super-efficient, but it's rare enough not
2135 * to matter.
2136 */
2137 if (isroot)
2138 {
2139 Buffer rootbuf;
2140
2141 Assert(stack == NULL);
2142 Assert(isonly);
2143 /* create a new root node one level up and update the metapage */
2144 rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2145 /* release the split buffers */
2146 _bt_relbuf(rel, rootbuf);
2147 _bt_relbuf(rel, rbuf);
2148 _bt_relbuf(rel, buf);
2149 }
2150 else
2151 {
2152 BlockNumber bknum = BufferGetBlockNumber(buf);
2153 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2154 Page page = BufferGetPage(buf);
2155 IndexTuple new_item;
2156 BTStackData fakestack;
2157 IndexTuple ritem;
2158 Buffer pbuf;
2159
2160 if (stack == NULL)
2161 {
2162 BTPageOpaque opaque;
2163
2164 elog(DEBUG2, "concurrent ROOT page split");
2165 opaque = BTPageGetOpaque(page);
2166
2167 /*
2168 * We should never reach here when a leaf page split takes place
2169 * despite the insert of newitem being able to apply the fastpath
2170 * optimization. Make sure of that with an assertion.
2171 *
2172 * This is more of a performance issue than a correctness issue.
2173 * The fastpath won't have a descent stack. Using a phony stack
2174 * here works, but never rely on that. The fastpath should be
2175 * rejected within _bt_search_insert() when the rightmost leaf
2176 * page will split, since it's faster to go through _bt_search()
2177 * and get a stack in the usual way.
2178 */
2179 Assert(!(P_ISLEAF(opaque) &&
2180 BlockNumberIsValid(RelationGetTargetBlock(rel))));
2181
2182 /* Find the leftmost page at the next level up */
2183 pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2184 /* Set up a phony stack entry pointing there */
2185 stack = &fakestack;
2186 stack->bts_blkno = BufferGetBlockNumber(pbuf);
2187 stack->bts_offset = InvalidOffsetNumber;
2188 stack->bts_parent = NULL;
2189 _bt_relbuf(rel, pbuf);
2190 }
2191
2192 /* get high key from left, a strict lower bound for new right page */
2193 ritem = (IndexTuple) PageGetItem(page,
2194 PageGetItemId(page, P_HIKEY));
2195
2196 /* form an index tuple that points at the new right page */
2197 new_item = CopyIndexTuple(ritem);
2198 BTreeTupleSetDownLink(new_item, rbknum);
2199
2200 /*
2201 * Re-find and write lock the parent of buf.
2202 *
2203 * It's possible that the location of buf's downlink has changed since
2204 * our initial _bt_search() descent. _bt_getstackbuf() will detect
2205 * and recover from this, updating the stack, which ensures that the
2206 * new downlink will be inserted at the correct offset. Even buf's
2207 * parent may have changed.
2208 */
2209 pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2210
2211 /*
2212 * Unlock the right child. The left child will be unlocked in
2213 * _bt_insertonpg().
2214 *
2215 * Unlocking the right child must be delayed until here to ensure that
2216 * no concurrent VACUUM operation can become confused. Page deletion
2217 * cannot be allowed to fail to re-find a downlink for the rbuf page.
2218 * (Actually, this is just a vestige of how things used to work. The
2219 * page deletion code is expected to check for the INCOMPLETE_SPLIT
2220 * flag on the left child. It won't attempt deletion of the right
2221 * child until the split is complete. Despite all this, we opt to
2222 * conservatively delay unlocking the right child until here.)
2223 */
2224 _bt_relbuf(rel, rbuf);
2225
2226 if (pbuf == InvalidBuffer)
2227 ereport(ERROR,
2228 (errcode(ERRCODE_INDEX_CORRUPTED),
2229 errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2230 RelationGetRelationName(rel), bknum, rbknum)));
2231
2232 /* Recursively insert into the parent */
2233 _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2234 new_item, MAXALIGN(IndexTupleSize(new_item)),
2235 stack->bts_offset + 1, 0, isonly);
2236
2237 /* be tidy */
2238 pfree(new_item);
2239 }
2240}
2241
2242/*
2243 * _bt_finish_split() -- Finish an incomplete split
2244 *
2245 * A crash or other failure can leave a split incomplete. The insertion
2246 * routines won't allow insertion on a page that is incompletely split.
2247 * Before inserting on such a page, call _bt_finish_split().
2248 *
2249 * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2250 * and unpinned.
2251 *
2252 * Caller must provide a valid heaprel, since finishing a page split requires
2253 * allocating a new page if and when the parent page splits in turn.
2254 */
2255void
2256 _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2257{
2258 Page lpage = BufferGetPage(lbuf);
2259 BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2260 Buffer rbuf;
2261 Page rpage;
2262 BTPageOpaque rpageop;
2263 bool wasroot;
2264 bool wasonly;
2265
2266 Assert(P_INCOMPLETE_SPLIT(lpageop));
2267 Assert(heaprel != NULL);
2268
2269 /* Lock right sibling, the one missing the downlink */
2270 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2271 rpage = BufferGetPage(rbuf);
2272 rpageop = BTPageGetOpaque(rpage);
2273
2274 /* Could this be a root split? */
2275 if (!stack)
2276 {
2277 Buffer metabuf;
2278 Page metapg;
2279 BTMetaPageData *metad;
2280
2281 /* acquire lock on the metapage */
2282 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2283 metapg = BufferGetPage(metabuf);
2284 metad = BTPageGetMeta(metapg);
2285
2286 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2287
2288 _bt_relbuf(rel, metabuf);
2289 }
2290 else
2291 wasroot = false;
2292
2293 /* Was this the only page on the level before split? */
2294 wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2295
2296 INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
2297 elog(DEBUG1, "finishing incomplete split of %u/%u",
2298 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2299
2300 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2301}
2302
2303/*
2304 * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2305 * tuple whose downlink points to child page.
2306 *
2307 * Caller passes child's block number, which is used to identify
2308 * associated pivot tuple in parent page using a linear search that
2309 * matches on pivot's downlink/block number. The expected location of
2310 * the pivot tuple is taken from the stack one level above the child
2311 * page. This is used as a starting point. Insertions into the
2312 * parent level could cause the pivot tuple to move right; deletions
2313 * could cause it to move left, but not left of the page we previously
2314 * found it on.
2315 *
2316 * Caller can use its stack to relocate the pivot tuple/downlink for
2317 * any same-level page to the right of the page found by its initial
2318 * descent. This is necessary because of the possibility that caller
2319 * moved right to recover from a concurrent page split. It's also
2320 * convenient for certain callers to be able to step right when there
2321 * wasn't a concurrent page split, while still using their original
2322 * stack. For example, the checkingunique _bt_doinsert() case may
2323 * have to step right when there are many physical duplicates, and its
2324 * scantid forces an insertion to the right of the "first page the
2325 * value could be on". (This is also relied on by all of our callers
2326 * when dealing with !heapkeyspace indexes.)
2327 *
2328 * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2329 * tuple not found (should not happen). Adjusts bts_blkno &
2330 * bts_offset if changed. Page split caller should insert its new
2331 * pivot tuple for its new right sibling page on parent page, at the
2332 * offset number bts_offset + 1.
2333 */
2334Buffer
2335 _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2336{
2337 BlockNumber blkno;
2338 OffsetNumber start;
2339
2340 blkno = stack->bts_blkno;
2341 start = stack->bts_offset;
2342
2343 for (;;)
2344 {
2345 Buffer buf;
2346 Page page;
2347 BTPageOpaque opaque;
2348
2349 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2350 page = BufferGetPage(buf);
2351 opaque = BTPageGetOpaque(page);
2352
2353 Assert(heaprel != NULL);
2354 if (P_INCOMPLETE_SPLIT(opaque))
2355 {
2356 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2357 continue;
2358 }
2359
2360 if (!P_IGNORE(opaque))
2361 {
2362 OffsetNumber offnum,
2363 minoff,
2364 maxoff;
2365 ItemId itemid;
2366 IndexTuple item;
2367
2368 minoff = P_FIRSTDATAKEY(opaque);
2369 maxoff = PageGetMaxOffsetNumber(page);
2370
2371 /*
2372 * start = InvalidOffsetNumber means "search the whole page". We
2373 * need this test anyway due to possibility that page has a high
2374 * key now when it didn't before.
2375 */
2376 if (start < minoff)
2377 start = minoff;
2378
2379 /*
2380 * Need this check too, to guard against possibility that page
2381 * split since we visited it originally.
2382 */
2383 if (start > maxoff)
2384 start = OffsetNumberNext(maxoff);
2385
2386 /*
2387 * These loops will check every item on the page --- but in an
2388 * order that's attuned to the probability of where it actually
2389 * is. Scan to the right first, then to the left.
2390 */
2391 for (offnum = start;
2392 offnum <= maxoff;
2393 offnum = OffsetNumberNext(offnum))
2394 {
2395 itemid = PageGetItemId(page, offnum);
2396 item = (IndexTuple) PageGetItem(page, itemid);
2397
2398 if (BTreeTupleGetDownLink(item) == child)
2399 {
2400 /* Return accurate pointer to where link is now */
2401 stack->bts_blkno = blkno;
2402 stack->bts_offset = offnum;
2403 return buf;
2404 }
2405 }
2406
2407 for (offnum = OffsetNumberPrev(start);
2408 offnum >= minoff;
2409 offnum = OffsetNumberPrev(offnum))
2410 {
2411 itemid = PageGetItemId(page, offnum);
2412 item = (IndexTuple) PageGetItem(page, itemid);
2413
2414 if (BTreeTupleGetDownLink(item) == child)
2415 {
2416 /* Return accurate pointer to where link is now */
2417 stack->bts_blkno = blkno;
2418 stack->bts_offset = offnum;
2419 return buf;
2420 }
2421 }
2422 }
2423
2424 /*
2425 * The item we're looking for moved right at least one page.
2426 *
2427 * Lehman and Yao couple/chain locks when moving right here, which we
2428 * can avoid. See nbtree/README.
2429 */
2430 if (P_RIGHTMOST(opaque))
2431 {
2432 _bt_relbuf(rel, buf);
2433 return InvalidBuffer;
2434 }
2435 blkno = opaque->btpo_next;
2436 start = InvalidOffsetNumber;
2437 _bt_relbuf(rel, buf);
2438 }
2439}
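/*
 * A minimal standalone sketch (plain C, not nbtree code) of the per-page
 * search pattern _bt_getstackbuf() uses: probe to the right of the
 * remembered position first, then to the left.  The plain array, the
 * clamping of the hint, and the function name are illustrative
 * assumptions only.
 */
static int
search_from_hint(const unsigned int *downlinks, int ndownlinks, int hint,
				 unsigned int child)
{
	/* clamp the remembered position, as the contents may have changed */
	if (hint < 0)
		hint = 0;
	if (hint > ndownlinks)
		hint = ndownlinks;

	/* scan right from the remembered position (the more likely direction) */
	for (int i = hint; i < ndownlinks; i++)
		if (downlinks[i] == child)
			return i;

	/* then scan left, back toward the start of the array */
	for (int i = hint - 1; i >= 0; i--)
		if (downlinks[i] == child)
			return i;

	return -1;					/* not here; caller would move right */
}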
2440
2441/*
2442 * _bt_newlevel() -- Create a new level above root page.
2443 *
2444 * We've just split the old root page and need to create a new one.
2445 * In order to do this, we add a new root page to the file, then lock
2446 * the metadata page and update it. This is guaranteed to be deadlock-
2447 * free, because all readers release their locks on the metadata page
2448 * before trying to lock the root, and all writers lock the root before
2449 * trying to lock the metadata page. We have a write lock on the old
2450 * root page, so we have not introduced any cycles into the waits-for
2451 * graph.
2452 *
2453 * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2454 * locked. On exit, a new root page exists with entries for the
2455 * two new children, metapage is updated and unlocked/unpinned.
2456 * The new root buffer is returned to caller which has to unlock/unpin
2457 * lbuf, rbuf & rootbuf.
2458 */
2459static Buffer
2460_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2461{
2462 Buffer rootbuf;
2463 Page lpage,
2464 rootpage;
2465 BlockNumber lbkno,
2466 rbkno;
2467 BlockNumber rootblknum;
2468 BTPageOpaque rootopaque;
2469 BTPageOpaque lopaque;
2470 ItemId itemid;
2471 IndexTuple item;
2472 IndexTuple left_item;
2473 Size left_item_sz;
2474 IndexTuple right_item;
2475 Size right_item_sz;
2476 Buffer metabuf;
2477 Page metapg;
2478 BTMetaPageData *metad;
2479
2480 lbkno = BufferGetBlockNumber(lbuf);
2481 rbkno = BufferGetBlockNumber(rbuf);
2482 lpage = BufferGetPage(lbuf);
2483 lopaque = BTPageGetOpaque(lpage);
2484
2485 /* get a new root page */
2486 rootbuf = _bt_allocbuf(rel, heaprel);
2487 rootpage = BufferGetPage(rootbuf);
2488 rootblknum = BufferGetBlockNumber(rootbuf);
2489
2490 /* acquire lock on the metapage */
2491 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2492 metapg = BufferGetPage(metabuf);
2493 metad = BTPageGetMeta(metapg);
2494
2495 /*
2496 * Create downlink item for left page (old root). The key value used is
2497 * "minus infinity", a sentinel value that's reliably less than any real
2498 * key value that could appear in the left page.
2499 */
2500 left_item_sz = sizeof(IndexTupleData);
2501 left_item = (IndexTuple) palloc(left_item_sz);
2502 left_item->t_info = left_item_sz;
2503 BTreeTupleSetDownLink(left_item, lbkno);
2504 BTreeTupleSetNAtts(left_item, 0, false);
2505
2506 /*
2507 * Create downlink item for right page. The key for it is obtained from
2508 * the "high key" position in the left page.
2509 */
2510 itemid = PageGetItemId(lpage, P_HIKEY);
2511 right_item_sz = ItemIdGetLength(itemid);
2512 item = (IndexTuple) PageGetItem(lpage, itemid);
2513 right_item = CopyIndexTuple(item);
2514 BTreeTupleSetDownLink(right_item, rbkno);
2515
2516 /* NO EREPORT(ERROR) from here till newroot op is logged */
2517 START_CRIT_SECTION();
2518
2519 /* upgrade metapage if needed */
2520 if (metad->btm_version < BTREE_NOVAC_VERSION)
2521 _bt_upgrademetapage(metapg);
2522
2523 /* set btree special data */
2524 rootopaque = BTPageGetOpaque(rootpage);
2525 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2526 rootopaque->btpo_flags = BTP_ROOT;
2527 rootopaque->btpo_level =
2528 (BTPageGetOpaque(lpage))->btpo_level + 1;
2529 rootopaque->btpo_cycleid = 0;
2530
2531 /* update metapage data */
2532 metad->btm_root = rootblknum;
2533 metad->btm_level = rootopaque->btpo_level;
2534 metad->btm_fastroot = rootblknum;
2535 metad->btm_fastlevel = rootopaque->btpo_level;
2536
2537 /*
2538 * Insert the left page pointer into the new root page. The root page is
2539 * the rightmost page on its level so there is no "high key" in it; the
2540 * two items will go into positions P_HIKEY and P_FIRSTKEY.
2541 *
2542 * Note: we *must* insert the two items in item-number order, for the
2543 * benefit of _bt_restore_page().
2544 */
2545 Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2546 if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
2547 elog(PANIC, "failed to add leftkey to new root page"
2548 " while splitting block %u of index \"%s\"",
2549 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2550
2551 /*
2552 * insert the right page pointer into the new root page.
2553 */
2554 Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2555 Assert(BTreeTupleGetNAtts(right_item, rel) <=
2556 IndexRelationGetNumberOfKeyAttributes(rel));
2557 if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
2558 elog(PANIC, "failed to add rightkey to new root page"
2559 " while splitting block %u of index \"%s\"",
2560 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2561
2562 /* Clear the incomplete-split flag in the left child */
2563 Assert(P_INCOMPLETE_SPLIT(lopaque));
2564 lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2565 MarkBufferDirty(lbuf);
2566
2567 MarkBufferDirty(rootbuf);
2568 MarkBufferDirty(metabuf);
2569
2570 /* XLOG stuff */
2571 if (RelationNeedsWAL(rel))
2572 {
2573 xl_btree_newroot xlrec;
2574 XLogRecPtr recptr;
2575 xl_btree_metadata md;
2576
2577 xlrec.rootblk = rootblknum;
2578 xlrec.level = metad->btm_level;
2579
2580 XLogBeginInsert();
2581 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
2582
2583 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2584 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2585 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2586
2587 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2588 md.version = metad->btm_version;
2589 md.root = rootblknum;
2590 md.level = metad->btm_level;
2591 md.fastroot = rootblknum;
2592 md.fastlevel = metad->btm_level;
2593 md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2594 md.allequalimage = metad->btm_allequalimage;
2595
2596 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2597
2598 /*
2599 * Direct access to page is not good but faster - we should implement
2600 * some new func in page API.
2601 */
2602 XLogRegisterBufData(0,
2603 (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2604 ((PageHeader) rootpage)->pd_special -
2605 ((PageHeader) rootpage)->pd_upper);
2606
2607 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2608
2609 PageSetLSN(lpage, recptr);
2610 PageSetLSN(rootpage, recptr);
2611 PageSetLSN(metapg, recptr);
2612 }
2613
2614 END_CRIT_SECTION();
2615
2616 /* done with metapage */
2617 _bt_relbuf(rel, metabuf);
2618
2619 pfree(left_item);
2620 pfree(right_item);
2621
2622 return rootbuf;
2623}
2624
2625/*
2626 * _bt_pgaddtup() -- add a data item to a particular page during split.
2627 *
2628 * The difference between this routine and a bare PageAddItem call is
2629 * that this code can deal with the first data item on an internal btree
2630 * page in passing. This data item (which is called "firstright" within
2631 * _bt_split()) has a key that must be treated as minus infinity after
2632 * the split. Therefore, we truncate away all attributes when caller
2633 * specifies it's the first data item on page (downlink is not changed,
2634 * though). This extra step is only needed for the right page of an
2635 * internal page split. There is no need to do this for the first data
2636 * item on the existing/left page, since that will already have been
2637 * truncated during an earlier page split.
2638 *
2639 * See _bt_split() for a high level explanation of why we truncate here.
2640 * Note that this routine has nothing to do with suffix truncation,
2641 * despite using some of the same infrastructure.
2642 */
2643static inline bool
2644 _bt_pgaddtup(Page page,
2645 Size itemsize,
2646 const IndexTupleData *itup,
2647 OffsetNumber itup_off,
2648 bool newfirstdataitem)
2649{
2650 IndexTupleData trunctuple;
2651
2652 if (newfirstdataitem)
2653 {
2654 trunctuple = *itup;
2655 trunctuple.t_info = sizeof(IndexTupleData);
2656 BTreeTupleSetNAtts(&trunctuple, 0, false);
2657 itup = &trunctuple;
2658 itemsize = sizeof(IndexTupleData);
2659 }
2660
2661 if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
2662 return false;
2663
2664 return true;
2665}
2666
2667/*
2668 * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2669 *
2670 * There are three operations performed here: simple index deletion, bottom-up
2671 * index deletion, and deduplication. If all three operations fail to free
2672 * enough space for the incoming item then caller will go on to split the
2673 * page. We always consider simple deletion first. If that doesn't work out
2674 * we consider alternatives. Callers that only want us to consider simple
2675 * deletion (without any fallback) ask for that using the 'simpleonly'
2676 * argument.
2677 *
2678 * We usually pick only one alternative "complex" operation when simple
2679 * deletion alone won't prevent a page split. The 'checkingunique',
2680 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2681 *
2682 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2683 * level flag was found set. The flag was useful back when there wasn't
2684 * necessarily one single page for a duplicate tuple to go on (before heap TID
2685 * became a part of the key space in version 4 indexes). But we don't
2686 * actually look at the flag anymore (it's not a gating condition for our
2687 * caller). That would cause us to miss tuples that are safe to delete,
2688 * without getting any benefit in return. We know that the alternative is to
2689 * split the page; scanning the line pointer array in passing won't have
2690 * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2691 * all this because !heapkeyspace indexes must still do a "getting tired"
2692 * linear search, and so are likely to get some benefit from using it as a
2693 * gating condition.)
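 *
 * To summarize the flow below: a simple deletion pass is always
 * attempted first, and we return at once if it frees enough space.
 * Otherwise (and unless 'simpleonly'), a bottom-up deletion pass runs
 * when 'indexUnchanged' or 'uniquedup' suggests version churn, and
 * finally a deduplication pass runs when deduplication is enabled and
 * the index is allequalimage.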
2694 */
2695static void
2696 _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2697 BTInsertState insertstate,
2698 bool simpleonly, bool checkingunique,
2699 bool uniquedup, bool indexUnchanged)
2700{
2701 OffsetNumber deletable[MaxIndexTuplesPerPage];
2702 int ndeletable = 0;
2703 OffsetNumber offnum,
2704 minoff,
2705 maxoff;
2706 Buffer buffer = insertstate->buf;
2707 BTScanInsert itup_key = insertstate->itup_key;
2708 Page page = BufferGetPage(buffer);
2709 BTPageOpaque opaque = BTPageGetOpaque(page);
2710
2711 Assert(P_ISLEAF(opaque));
2712 Assert(simpleonly || itup_key->heapkeyspace);
2713 Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2714
2715 /*
2716 * Scan over all items to see which ones need to be deleted according to
2717 * LP_DEAD flags. We'll usually manage to delete a few extra items that
2718 * are not marked LP_DEAD in passing. Often the extra items that actually
2719 * end up getting deleted are items that would have had their LP_DEAD bit
2720 * set before long anyway (if we opted not to include them as extras).
2721 */
2722 minoff = P_FIRSTDATAKEY(opaque);
2723 maxoff = PageGetMaxOffsetNumber(page);
2724 for (offnum = minoff;
2725 offnum <= maxoff;
2726 offnum = OffsetNumberNext(offnum))
2727 {
2728 ItemId itemId = PageGetItemId(page, offnum);
2729
2730 if (ItemIdIsDead(itemId))
2731 deletable[ndeletable++] = offnum;
2732 }
2733
2734 if (ndeletable > 0)
2735 {
2736 _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2737 insertstate->itup, minoff, maxoff);
2738 insertstate->bounds_valid = false;
2739
2740 /* Return when a page split has already been avoided */
2741 if (PageGetFreeSpace(page) >= insertstate->itemsz)
2742 return;
2743
2744 /* Might as well assume duplicates (if checkingunique) */
2745 uniquedup = true;
2746 }
2747
2748 /*
2749 * We're done with simple deletion. Return early with callers that only
2750 * call here so that simple deletion can be considered. This includes
2751 * callers that explicitly ask for this and checkingunique callers that
2752 * probably don't have any version churn duplicates on the page.
2753 *
2754 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2755 * return at this point (or when we go on to try either or both of our
2756 * other strategies and they also fail). We do not bother expending a
2757 * separate write to clear it, however. Caller will definitely clear it
2758 * when it goes on to split the page (note also that the deduplication
2759 * process will clear the flag in passing, just to keep things tidy).
2760 */
2761 if (simpleonly || (checkingunique && !uniquedup))
2762 {
2763 Assert(!indexUnchanged);
2764 return;
2765 }
2766
2767 /* Assume bounds about to be invalidated (this is almost certain now) */
2768 insertstate->bounds_valid = false;
2769
2770 /*
2771 * Perform bottom-up index deletion pass when executor hint indicated that
2772 * incoming item is logically unchanged, or for a unique index that is
2773 * known to have physical duplicates for some other reason. (There is a
2774 * large overlap between these two cases for a unique index. It's worth
2775 * having both triggering conditions in order to apply the optimization in
2776 * the event of successive related INSERT and DELETE statements.)
2777 *
2778 * We'll go on to do a deduplication pass when a bottom-up pass fails to
2779 * delete an acceptable amount of free space (a significant fraction of
2780 * the page, or space for the new item, whichever is greater).
2781 *
2782 * Note: Bottom-up index deletion uses the same equality/equivalence
2783 * routines as deduplication internally. However, it does not merge
2784 * together index tuples, so the same correctness considerations do not
2785 * apply. We deliberately omit an index-is-allequalimage test here.
2786 */
2787 if ((indexUnchanged || uniquedup) &&
2788 _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2789 return;
2790
2791 /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2792 if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2793 _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2794 (indexUnchanged || uniquedup));
2795}
2796
2797/*
2798 * _bt_simpledel_pass - Simple index tuple deletion pass.
2799 *
2800 * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2801 * of all such tuples are determined by caller (caller passes these to us as
2802 * its 'deletable' argument).
2803 *
2804 * We might also delete extra index tuples that turn out to be safe to delete
2805 * in passing (though they must be cheap to check in passing to begin with).
2806 * There is no certainty that any extra tuples will be deleted, though. The
2807 * high level goal of the approach we take is to get the most out of each call
2808 * here (without noticeably increasing the per-call overhead compared to what
2809 * we need to do just to be able to delete the page's LP_DEAD-marked index
2810 * tuples).
2811 *
2812 * The number of extra index tuples that turn out to be deletable might
2813 * greatly exceed the number of LP_DEAD-marked index tuples due to various
2814 * locality related effects. For example, it's possible that the total number
2815 * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2816 * quite low, in which case we might end up checking if it's possible to
2817 * delete _most_ index tuples on the page (without the tableam needing to
2818 * access additional table blocks). The tableam will sometimes stumble upon
2819 * _many_ extra deletable index tuples in indexes where this pattern is
2820 * common.
2821 *
2822 * See nbtree/README for further details on simple index tuple deletion.
2823 */
2824static void
2825 _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2826 OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2827 OffsetNumber minoff, OffsetNumber maxoff)
2828{
2829 Page page = BufferGetPage(buffer);
2830 BlockNumber *deadblocks;
2831 int ndeadblocks;
2832 TM_IndexDeleteOp delstate;
2833 OffsetNumber offnum;
2834
2835 /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2836 deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2837 &ndeadblocks);
2838
2839 /* Initialize tableam state that describes index deletion operation */
2840 delstate.irel = rel;
2841 delstate.iblknum = BufferGetBlockNumber(buffer);
2842 delstate.bottomup = false;
2843 delstate.bottomupfreespace = 0;
2844 delstate.ndeltids = 0;
2845 delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2846 delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2847
2848 for (offnum = minoff;
2849 offnum <= maxoff;
2850 offnum = OffsetNumberNext(offnum))
2851 {
2852 ItemId itemid = PageGetItemId(page, offnum);
2853 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2854 TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2855 TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2856 BlockNumber tidblock;
2857 void *match;
2858
2859 if (!BTreeTupleIsPosting(itup))
2860 {
2861 tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2862 match = bsearch(&tidblock, deadblocks, ndeadblocks,
2863 sizeof(BlockNumber), _bt_blk_cmp);
2864
2865 if (!match)
2866 {
2867 Assert(!ItemIdIsDead(itemid));
2868 continue;
2869 }
2870
2871 /*
2872 * TID's table block is among those pointed to by the TIDs from
2873 * LP_DEAD-bit set tuples on page -- add TID to deltids
2874 */
2875 odeltid->tid = itup->t_tid;
2876 odeltid->id = delstate.ndeltids;
2877 ostatus->idxoffnum = offnum;
2878 ostatus->knowndeletable = ItemIdIsDead(itemid);
2879 ostatus->promising = false; /* unused */
2880 ostatus->freespace = 0; /* unused */
2881
2882 delstate.ndeltids++;
2883 }
2884 else
2885 {
2886 int nitem = BTreeTupleGetNPosting(itup);
2887
2888 for (int p = 0; p < nitem; p++)
2889 {
2890 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2891
2892 tidblock = ItemPointerGetBlockNumber(tid);
2893 match = bsearch(&tidblock, deadblocks, ndeadblocks,
2894 sizeof(BlockNumber), _bt_blk_cmp);
2895
2896 if (!match)
2897 {
2898 Assert(!ItemIdIsDead(itemid));
2899 continue;
2900 }
2901
2902 /*
2903 * TID's table block is among those pointed to by the TIDs
2904 * from LP_DEAD-bit set tuples on page -- add TID to deltids
2905 */
2906 odeltid->tid = *tid;
2907 odeltid->id = delstate.ndeltids;
2908 ostatus->idxoffnum = offnum;
2909 ostatus->knowndeletable = ItemIdIsDead(itemid);
2910 ostatus->promising = false; /* unused */
2911 ostatus->freespace = 0; /* unused */
2912
2913 odeltid++;
2914 ostatus++;
2915 delstate.ndeltids++;
2916 }
2917 }
2918 }
2919
2920 pfree(deadblocks);
2921
2922 Assert(delstate.ndeltids >= ndeletable);
2923
2924 /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2925 _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2926
2927 pfree(delstate.deltids);
2928 pfree(delstate.status);
2929}
2930
2931/*
2932 * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2933 *
2934 * Builds sorted and unique-ified array of table block numbers from index
2935 * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2936 * block from incoming newitem just in case it isn't among the LP_DEAD-related
2937 * table blocks.
2938 *
2939 * Always counting the newitem's table block as an LP_DEAD related block makes
2940 * sense because the cost is consistently low; it is practically certain that
2941 * the table block will not incur a buffer miss in tableam. On the other hand
2942 * the benefit is often quite high. There is a decent chance that there will
2943 * be some deletable items from this block, since in general most garbage
2944 * tuples became garbage in the recent past (in many cases this won't be the
2945 * first logical row that core code added to/modified in table block
2946 * recently).
2947 *
2948 * Returns final array, and sets *nblocks to its final size for caller.
2949 */
2950static BlockNumber *
2951_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2952 IndexTuple newitem, int *nblocks)
2953{
2954 int spacentids,
2955 ntids;
2956 BlockNumber *tidblocks;
2957
2958 /*
2959 * Accumulate each TID's block in array whose initial size has space for
2960 * one table block per LP_DEAD-set tuple (plus space for the newitem table
2961 * block). Array will only need to grow when there are LP_DEAD-marked
2962 * posting list tuples (which is not that common).
2963 */
2964 spacentids = ndeletable + 1;
2965 ntids = 0;
2966 tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2967
2968 /*
2969 * First add the table block for the incoming newitem. This is the one
2970 * case where simple deletion can visit a table block that doesn't have
2971 * any known deletable items.
2972 */
2973 Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2974 tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2975
2976 for (int i = 0; i < ndeletable; i++)
2977 {
2978 ItemId itemid = PageGetItemId(page, deletable[i]);
2979 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2980
2981 Assert(ItemIdIsDead(itemid));
2982
2983 if (!BTreeTupleIsPosting(itup))
2984 {
2985 if (ntids + 1 > spacentids)
2986 {
2987 spacentids *= 2;
2988 tidblocks = (BlockNumber *)
2989 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2990 }
2991
2992 tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2993 }
2994 else
2995 {
2996 int nposting = BTreeTupleGetNPosting(itup);
2997
2998 if (ntids + nposting > spacentids)
2999 {
3000 spacentids = Max(spacentids * 2, ntids + nposting);
3001 tidblocks = (BlockNumber *)
3002 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3003 }
3004
3005 for (int j = 0; j < nposting; j++)
3006 {
3007 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
3008
3009 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3010 }
3011 }
3012 }
3013
3014 qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3015 *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3016
3017 return tidblocks;
3018}
3019
3020/*
3021 * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3022 */
3023static inline int
3024_bt_blk_cmp(const void *arg1, const void *arg2)
3025{
3026 BlockNumber b1 = *((BlockNumber *) arg1);
3027 BlockNumber b2 = *((BlockNumber *) arg2);
3028
3029 return pg_cmp_u32(b1, b2);
3030}
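/*
 * A minimal standalone sketch (plain C, not PostgreSQL code) of the
 * pattern _bt_deadblocks() and _bt_simpledel_pass() rely on above: sort
 * the candidate block numbers, de-duplicate them in place (the role
 * qunique() plays), then test membership with bsearch() using the same
 * comparator.  All names below are illustrative assumptions.
 */
#include <stdlib.h>
#include <stdint.h>

static int
blk_cmp_sketch(const void *a, const void *b)
{
	uint32_t	b1 = *(const uint32_t *) a;
	uint32_t	b2 = *(const uint32_t *) b;

	return (b1 > b2) - (b1 < b2);
}

/* sort and unique-ify in place; returns the deduplicated length */
static size_t
sort_unique_blocks(uint32_t *blocks, size_t n)
{
	size_t		j = 0;

	if (n == 0)
		return 0;
	qsort(blocks, n, sizeof(uint32_t), blk_cmp_sketch);
	for (size_t i = 1; i < n; i++)
		if (blocks[i] != blocks[j])
			blocks[++j] = blocks[i];
	return j + 1;
}

/* membership test against the sorted, deduplicated array */
static int
block_is_dead(uint32_t block, const uint32_t *deadblocks, size_t ndead)
{
	return bsearch(&block, deadblocks, ndead, sizeof(uint32_t),
				   blk_cmp_sketch) != NULL;
}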
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static Datum values[MAXATTR]
Definition: bootstrap.c:153
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4223
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5366
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2943
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5430
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:436
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:425
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:387
Size PageGetFreeSpace(const PageData *page)
Definition: bufpage.c:906
static void * PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
PageData * Page
Definition: bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
#define MAXALIGN(LEN)
Definition: c.h:813
uint8_t uint8
Definition: c.h:539
#define PG_UINT32_MAX
Definition: c.h:598
#define Max(x, y)
Definition: c.h:1000
uint16_t uint16
Definition: c.h:540
#define unlikely(x)
Definition: c.h:407
uint32_t uint32
Definition: c.h:541
uint32 TransactionId
Definition: c.h:660
size_t Size
Definition: c.h:613
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
char * BuildIndexValueDescription(Relation indexRelation, const Datum *values, const bool *isnull)
Definition: genam.c:178
IndexUniqueCheck
Definition: genam.h:143
@ UNIQUE_CHECK_NO
Definition: genam.h:144
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:147
@ UNIQUE_CHECK_PARTIAL
Definition: genam.h:146
Assert(PointerIsAligned(start, uint64))
return str start
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:456
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
#define INJECTION_POINT(name, arg)
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:719
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition: itemptr.c:51
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
IndexTupleData * IndexTuple
Definition: itup.h:53
struct IndexTupleData IndexTupleData
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:828
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition: lmgr.c:663
@ XLTW_InsertIndex
Definition: lmgr.h:31
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
Definition: nbtdedup.c:1020
void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
Definition: nbtdedup.c:59
bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
Definition: nbtdedup.c:307
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:319
#define BTREE_FASTPATH_MIN_LEVEL
Definition: nbtinsert.c:32
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:817
Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2335
static bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtinsert.c:2644
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff)
Definition: nbtinsert.c:2825
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:104
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
Definition: nbtinsert.c:2460
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2114
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTInsertState insertstate, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged)
Definition: nbtinsert.c:2696
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2256
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
Definition: nbtinsert.c:1475
static int _bt_blk_cmp(const void *arg1, const void *arg2)
Definition: nbtinsert.c:3024
static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
Definition: nbtinsert.c:1029
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1107
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:410
static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks)
Definition: nbtinsert.c:2951
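
The nbtinsert.c entries above are the file-local helpers that _bt_doinsert() composes for a single leaf insertion. The following is a minimal hand-written sketch of that control flow, not the real routine: it assumes only the signatures listed above plus the standard MAXALIGN macro, omits the fastpath, the unique-wait/retry loop, UNIQUE_CHECK_EXISTING handling, and posting-list splits, and since these helpers are static to nbtinsert.c it is meant for reading the flow rather than for separate compilation. The name btinsert_flow_sketch is invented for illustration.

/* Hand-written sketch of the insertion flow; not the code in _bt_doinsert(). */
#include "postgres.h"
#include "access/nbtree.h"

static void
btinsert_flow_sketch(Relation rel, Relation heapRel, IndexTuple itup,
                     IndexUniqueCheck checkUnique, bool indexUnchanged)
{
    BTScanInsert itup_key = _bt_mkscankey(rel, itup);   /* insertion scan key */
    BTInsertStateData insertstate;
    BTStack     stack;
    OffsetNumber newitemoff;

    insertstate.itup = itup;
    insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
    insertstate.itup_key = itup_key;
    insertstate.bounds_valid = false;
    insertstate.postingoff = 0;

    /* Descend to (and write-lock) the leaf page that should hold itup */
    stack = _bt_search_insert(rel, heapRel, &insertstate);

    if (checkUnique != UNIQUE_CHECK_NO)
    {
        bool        is_unique;
        uint32      speculativeToken;
        TransactionId xwait;

        /* May ereport(ERROR) on a conflict, or hand back an xid to wait on */
        xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                 &is_unique, &speculativeToken);
        (void) xwait;           /* the real code waits on xwait and retries */
    }

    /* Settle on the exact page and offset, moving right if necessary */
    newitemoff = _bt_findinsertloc(rel, &insertstate,
                                   checkUnique != UNIQUE_CHECK_NO,
                                   indexUnchanged, stack, heapRel);

    /* Insert; splits the page and writes WAL if needed, releases the buffer */
    _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
                   stack, itup, insertstate.itemsz, newitemoff,
                   insertstate.postingoff, false);

    if (stack)
        _bt_freestack(stack);
    pfree(itup_key);
}
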
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1004
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:108
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:676
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1130
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:870
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:798
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1094
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:846
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1512
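
The nbtpage.c buffer helpers above follow a simple discipline: _bt_getbuf()/_bt_relbuf() pair pin-and-lock with release, and _bt_relandgetbuf() is the combined "release this page, then lock that one" shortcut used when walking right along a level. A small illustrative sketch follows; move_right_once is an invented name, and the standard BufferGetPage call is assumed in addition to the signatures listed above.

/* Illustrative only: release the current leaf and lock its right sibling. */
#include "postgres.h"
#include "access/nbtree.h"

static Buffer
move_right_once(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    BlockNumber rblkno = opaque->btpo_next;

    if (P_RIGHTMOST(opaque))
    {
        _bt_relbuf(rel, buf);   /* no right sibling to move to */
        return InvalidBuffer;
    }

    /* same effect as _bt_relbuf(rel, buf) then _bt_getbuf(rel, rblkno, BT_WRITE) */
    return _bt_relandgetbuf(rel, buf, rblkno, BT_WRITE);
}
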
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1166
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define BTPageGetMeta(p)
Definition: nbtree.h:122
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define P_HIKEY
Definition: nbtree.h:368
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define BTP_HAS_GARBAGE
Definition: nbtree.h:83
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define BTP_ROOT
Definition: nbtree.h:78
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:563
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_ISROOT(opaque)
Definition: nbtree.h:222
#define P_NONE
Definition: nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTREE_METAPAGE
Definition: nbtree.h:149
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:84
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
#define BT_READ
Definition: nbtree.h:730
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:153
#define BT_WRITE
Definition: nbtree.h:731
#define P_FIRSTKEY
Definition: nbtree.h:369
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:596
#define BTMaxItemSize
Definition: nbtree.h:165
#define BTP_SPLIT_END
Definition: nbtree.h:82
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
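
The BTreeTuple* accessors above distinguish pivot tuples, plain leaf tuples, and posting-list tuples (one key value carrying several heap TIDs). As a hedged illustration built only from the inline functions listed here plus the standard Assert macro, code that needs every heap TID represented by a leaf tuple could look like the sketch below; report_heap_tids is an invented name.

/* Illustrative only: report every heap TID carried by a non-pivot leaf tuple. */
#include "postgres.h"
#include "access/nbtree.h"

static void
report_heap_tids(IndexTuple itup)
{
    Assert(!BTreeTupleIsPivot(itup));   /* pivot tuples carry no heap TIDs */

    if (BTreeTupleIsPosting(itup))
    {
        /* posting list tuple: a single key value with many heap TIDs */
        uint16      nposting = BTreeTupleGetNPosting(itup);

        for (int i = 0; i < nposting; i++)
        {
            ItemPointer htid = BTreeTupleGetPostingN(itup, i);

            elog(DEBUG2, "posting TID (%u,%u)",
                 ItemPointerGetBlockNumber(htid),
                 (unsigned) ItemPointerGetOffsetNumber(htid));
        }
    }
    else
    {
        /* plain leaf tuple: the single heap TID lives in t_tid */
        elog(DEBUG2, "TID (%u,%u)",
             ItemPointerGetBlockNumber(&itup->t_tid),
             (unsigned) ItemPointerGetOffsetNumber(&itup->t_tid));
    }
}
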
Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
Definition: nbtsearch.c:2612
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:107
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:479
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:693
OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
Definition: nbtsplitloc.c:130
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:4309
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:189
BTCycleId _bt_vacuum_cycleid(Relation rel)
Definition: nbtutils.c:3620
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:97
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:3883
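
During a page split, _bt_findsplitloc() (nbtsplitloc.c) picks the boundary and _bt_truncate() (nbtutils.c) builds the left page's new high key from the tuples straddling it. The sketch below is only a rough guess at how they compose, under strong simplifying assumptions: the split point does not coincide with the incoming new item, newitemsz already includes line-pointer overhead, and posting-list handling is ignored. choose_split_highkey is an invented helper, and the standard bufpage.h accessors PageGetItemId/PageGetItem are assumed alongside the signatures listed above.

/* Illustrative only: pick a split point and build the left page's high key. */
#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufpage.h"

static IndexTuple
choose_split_highkey(Relation rel, Page origpage,
                     OffsetNumber newitemoff, Size newitemsz,
                     IndexTuple newitem, BTScanInsert itup_key)
{
    bool        newitemonleft;
    OffsetNumber firstrightoff;
    IndexTuple  lastleft;
    IndexTuple  firstright;

    /* offset of the first tuple that will go to the new right page */
    firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
                                     newitem, &newitemonleft);

    /* assume neither boundary tuple is the incoming new item */
    lastleft = (IndexTuple) PageGetItem(origpage,
                                        PageGetItemId(origpage,
                                                      OffsetNumberPrev(firstrightoff)));
    firstright = (IndexTuple) PageGetItem(origpage,
                                          PageGetItemId(origpage, firstrightoff));

    /* high key: firstright with non-distinguishing suffix attributes removed */
    return _bt_truncate(rel, lastleft, firstright, itup_key);
}
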
#define XLOG_BTREE_INSERT_POST
Definition: nbtxlog.h:32
#define SizeOfBtreeInsert
Definition: nbtxlog.h:84
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:31
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:27
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:28
#define SizeOfBtreeSplit
Definition: nbtxlog.h:158
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:29
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:30
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define INDEX_MAX_KEYS
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:227
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
#define qsort(a, b, c, d)
Definition: port.h:500
uint64_t Datum
Definition: postgres.h:70
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3144
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition: predicate.c:4336
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21
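
qsort (the port.h wrapper) and qunique() are used in the simple-deletion path to turn an over-inclusive list of candidate heap block numbers into a sorted, duplicate-free array (see _bt_deadblocks and _bt_blk_cmp above). A minimal standalone sketch of that sort-then-unique idiom follows; blockno_cmp and sort_unique_blocks are invented stand-ins, and only the signatures listed above are assumed.

/* Minimal sketch of the sort-then-dedup idiom used on block number arrays. */
#include "postgres.h"
#include "common/int.h"
#include "lib/qunique.h"
#include "storage/block.h"

/* comparator in the style of _bt_blk_cmp */
static int
blockno_cmp(const void *arg1, const void *arg2)
{
    BlockNumber b1 = *((const BlockNumber *) arg1);
    BlockNumber b2 = *((const BlockNumber *) arg2);

    return pg_cmp_u32(b1, b2);
}

/* returns the number of unique entries left at the front of blocks[] */
static int
sort_unique_blocks(BlockNumber *blocks, int nblocks)
{
    qsort(blocks, nblocks, sizeof(BlockNumber), blockno_cmp);
    return (int) qunique(blocks, nblocks, sizeof(BlockNumber), blockno_cmp);
}
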
#define RelationGetDescr(relation)
Definition: rel.h:541
#define RelationGetRelationName(relation)
Definition: rel.h:549
#define RelationGetTargetBlock(relation)
Definition: rel.h:611
#define RelationNeedsWAL(relation)
Definition: rel.h:638
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:618
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:527
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:534
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6103
#define SnapshotSelf
Definition: snapmgr.h:32
#define InitDirtySnapshot(snapshotdata)
Definition: snapmgr.h:42
OffsetNumber stricthigh
Definition: nbtree.h:836
bool bounds_valid
Definition: nbtree.h:834
OffsetNumber low
Definition: nbtree.h:835
IndexTuple itup
Definition: nbtree.h:822
BTScanInsert itup_key
Definition: nbtree.h:824
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:115
uint32 btm_level
Definition: nbtree.h:109
BlockNumber btm_fastroot
Definition: nbtree.h:110
uint32 btm_version
Definition: nbtree.h:107
BlockNumber btm_root
Definition: nbtree.h:108
bool btm_allequalimage
Definition: nbtree.h:119
uint32 btm_fastlevel
Definition: nbtree.h:111
BlockNumber btpo_next
Definition: nbtree.h:66
BlockNumber btpo_prev
Definition: nbtree.h:65
uint16 btpo_flags
Definition: nbtree.h:68
uint32 btpo_level
Definition: nbtree.h:67
BTCycleId btpo_cycleid
Definition: nbtree.h:69
ItemPointer scantid
Definition: nbtree.h:802
bool allequalimage
Definition: nbtree.h:798
bool heapkeyspace
Definition: nbtree.h:797
bool anynullkeys
Definition: nbtree.h:799
BlockNumber bts_blkno
Definition: nbtree.h:745
struct BTStackData * bts_parent
Definition: nbtree.h:747
OffsetNumber bts_offset
Definition: nbtree.h:746
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
char data[BLCKSZ]
Definition: c.h:1119
TransactionId xmin
Definition: snapshot.h:153
TransactionId xmax
Definition: snapshot.h:154
uint32 speculativeToken
Definition: snapshot.h:189
TM_IndexStatus * status
Definition: tableam.h:254
int bottomupfreespace
Definition: tableam.h:249
Relation irel
Definition: tableam.h:246
TM_IndexDelete * deltids
Definition: tableam.h:253
BlockNumber iblknum
Definition: tableam.h:247
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
bool promising
Definition: tableam.h:222
int16 freespace
Definition: tableam.h:223
OffsetNumber idxoffnum
Definition: tableam.h:218
OffsetNumber offnum
Definition: nbtxlog.h:78
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
uint16 postingoff
Definition: nbtxlog.h:155
OffsetNumber firstrightoff
Definition: nbtxlog.h:153
uint32 level
Definition: nbtxlog.h:152
OffsetNumber newitemoff
Definition: nbtxlog.h:154
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
Definition: tableam.c:240
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41
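
Several of the pieces listed above (InitDirtySnapshot, the xmin/xmax/speculativeToken snapshot fields, table_index_fetch_tuple_check(), and XactLockTableWait()/SpeculativeInsertionWait()) cooperate during the uniqueness check: the heap is probed with a dirty snapshot, and an in-progress writer is waited for before the check is retried. The sketch below is a rough, simplified composite of that logic, not the actual code in _bt_check_unique()/_bt_doinsert(); probe_conflicting_tuple is an invented name.

/* Illustrative only: probe a conflicting heap TID with a dirty snapshot. */
#include "postgres.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "storage/lmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

static bool
probe_conflicting_tuple(Relation heapRel, Relation indexRel, ItemPointer htid)
{
    SnapshotData SnapshotDirty;
    bool        all_dead;

    InitDirtySnapshot(SnapshotDirty);

    if (table_index_fetch_tuple_check(heapRel, htid, &SnapshotDirty, &all_dead))
    {
        /* a version exists; was it written by a still-running transaction? */
        TransactionId xwait =
            TransactionIdIsValid(SnapshotDirty.xmin) ? SnapshotDirty.xmin
                                                     : SnapshotDirty.xmax;

        if (TransactionIdIsValid(xwait))
        {
            if (SnapshotDirty.speculativeToken &&
                TransactionIdIsValid(SnapshotDirty.xmin))
                SpeculativeInsertionWait(SnapshotDirty.xmin,
                                         SnapshotDirty.speculativeToken);
            else
                XactLockTableWait(xwait, indexRel, htid, XLTW_InsertIndex);
            return false;       /* caller must recheck after waiting */
        }
        return true;            /* live conflicting tuple exists */
    }
    return false;               /* nothing visible at htid */
}
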
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:409
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:245
void XLogBeginInsert(void)
Definition: xloginsert.c:152
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34
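
Finally, the xloginsert.c calls and XLOG_BTREE_* record types above follow PostgreSQL's standard WAL pattern: mutate the page inside a critical section, then register the record header and the modified buffer, and stamp the page with the returned LSN. Below is a simplified sketch for the plain leaf-insert record, assuming only the xl_btree_insert layout shown above (offnum) plus the usual buffer-manager calls (MarkBufferDirty, BufferGetPage, PageSetLSN) and RM_BTREE_ID; log_leaf_insert is an invented wrapper and the actual PageAddItem step is elided.

/* Illustrative only: WAL-log a plain leaf insertion of itup at newitemoff. */
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
log_leaf_insert(Relation rel, Buffer buf, OffsetNumber newitemoff,
                IndexTuple itup)
{
    START_CRIT_SECTION();

    /* ... PageAddItem() of itup into BufferGetPage(buf) goes here ... */
    MarkBufferDirty(buf);

    if (RelationNeedsWAL(rel))
    {
        xl_btree_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = newitemoff;

        XLogBeginInsert();
        XLogRegisterData(&xlrec, SizeOfBtreeInsert);
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, itup, IndexTupleSize(itup));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF);
        PageSetLSN(BufferGetPage(buf), recptr);
    }

    END_CRIT_SECTION();
}
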