PostgreSQL Source Code git master
nbtpage.c
1/*-------------------------------------------------------------------------
2 *
3 * nbtpage.c
4 * BTree-specific page management code for the Postgres btree access
5 * method.
6 *
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 *
11 * IDENTIFICATION
12 * src/backend/access/nbtree/nbtpage.c
13 *
14 * NOTES
15 * Postgres btree pages look like ordinary relation pages. The opaque
16 * data at high addresses includes pointers to left and right siblings
17 * and flag data describing page state. The first page in a btree, page
18 * zero, is special -- it stores meta-information describing the tree.
19 * Pages one and higher store the actual tree data.
20 *
21 *-------------------------------------------------------------------------
22 */
23#include "postgres.h"
24
25#include "access/nbtree.h"
26#include "access/nbtxlog.h"
27#include "access/tableam.h"
28#include "access/transam.h"
29#include "access/xlog.h"
30#include "access/xloginsert.h"
31#include "common/int.h"
32#include "miscadmin.h"
33#include "storage/indexfsm.h"
34#include "storage/predicate.h"
35#include "storage/procarray.h"
36#include "utils/memdebug.h"
37#include "utils/memutils.h"
38#include "utils/snapmgr.h"
39
40static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
41static void _bt_delitems_delete(Relation rel, Buffer buf,
42 TransactionId snapshotConflictHorizon,
43 bool isCatalogRel,
44 OffsetNumber *deletable, int ndeletable,
45 BTVacuumPosting *updatable, int nupdatable);
46static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
47 OffsetNumber *updatedoffsets,
48 Size *updatedbuflen, bool needswal);
49static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
50 Buffer leafbuf, BTStack stack);
51static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
52 BlockNumber scanblkno,
53 bool *rightsib_empty,
54 BTVacState *vstate);
55static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
56 BlockNumber child, BTStack stack,
57 Buffer *subtreeparent, OffsetNumber *poffset,
58 BlockNumber *topparent,
59 BlockNumber *topparentrightsib);
60static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
61 FullTransactionId safexid);
62
63/*
64 * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
65 */
66void
67_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
68 bool allequalimage)
69{
70 BTMetaPageData *metad;
71 BTPageOpaque metaopaque;
72
73 _bt_pageinit(page, BLCKSZ);
74
75 metad = BTPageGetMeta(page);
76 metad->btm_magic = BTREE_MAGIC;
77 metad->btm_version = BTREE_VERSION;
78 metad->btm_root = rootbknum;
79 metad->btm_level = level;
80 metad->btm_fastroot = rootbknum;
81 metad->btm_fastlevel = level;
82 metad->btm_last_cleanup_num_delpages = 0;
83 metad->btm_last_cleanup_num_heap_tuples = -1.0;
84 metad->btm_allequalimage = allequalimage;
85
86 metaopaque = BTPageGetOpaque(page);
87 metaopaque->btpo_flags = BTP_META;
88
89 /*
90 * Set pd_lower just past the end of the metadata. This is essential,
91 * because without doing so, metadata will be lost if xlog.c compresses
92 * the page.
93 */
94 ((PageHeader) page)->pd_lower =
95 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
96}
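/*
 * Illustrative sketch (hypothetical helper, not part of nbtpage.c): building
 * an empty index comes down to filling a page image for block 0 with
 * _bt_initmetapage() and writing it out as the metapage.  Whether the
 * allequalimage optimization applies is decided up front by the caller
 * (see _bt_allequalimage() in nbtutils.c).
 */
static void
example_fill_empty_metapage(Relation index, Page metapage)
{
    /* No root page exists yet, so pass P_NONE for the root and level zero */
    _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
}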
97
98/*
99 * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
100 * 3, the last version that can be updated without broadly affecting
101 * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
102 *
103 * This routine does purely in-memory image upgrade. Caller is
104 * responsible for locking, WAL-logging etc.
105 */
106void
107_bt_upgrademetapage(Page page)
108{
109 BTMetaPageData *metad;
110 BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
111
112 metad = BTPageGetMeta(page);
113 metaopaque = BTPageGetOpaque(page);
114
115 /* It really must be a meta page of an upgradable version */
116 Assert(metaopaque->btpo_flags & BTP_META);
117 Assert(metad->btm_magic == BTREE_MAGIC);
118 Assert(metad->btm_version < BTREE_NOVAC_VERSION);
119
120 /* Set version number and fill extra fields added into version 3 */
121 metad->btm_version = BTREE_NOVAC_VERSION;
122 metad->btm_last_cleanup_num_delpages = 0;
123 metad->btm_last_cleanup_num_heap_tuples = -1.0;
124 /* Only a REINDEX can set this field */
125 Assert(!metad->btm_allequalimage);
126 metad->btm_allequalimage = false;
127
128 /* Adjust pd_lower (see _bt_initmetapage() for details) */
129 ((PageHeader) page)->pd_lower =
130 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
131}
132
133/*
134 * Get metadata from share-locked buffer containing metapage, while performing
135 * standard sanity checks.
136 *
137 * Callers that cache data returned here in local cache should note that an
138 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
139 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
140 */
141static BTMetaPageData *
142_bt_getmeta(Relation rel, Buffer metabuf)
143{
144 Page metapg;
145 BTPageOpaque metaopaque;
146 BTMetaPageData *metad;
147
148 metapg = BufferGetPage(metabuf);
149 metaopaque = BTPageGetOpaque(metapg);
150 metad = BTPageGetMeta(metapg);
151
152 /* sanity-check the metapage */
153 if (!P_ISMETA(metaopaque) ||
154 metad->btm_magic != BTREE_MAGIC)
155 ereport(ERROR,
156 (errcode(ERRCODE_INDEX_CORRUPTED),
157 errmsg("index \"%s\" is not a btree",
158 RelationGetRelationName(rel))));
159
160 if (metad->btm_version < BTREE_MIN_VERSION ||
161 metad->btm_version > BTREE_VERSION)
162 ereport(ERROR,
163 (errcode(ERRCODE_INDEX_CORRUPTED),
164 errmsg("version mismatch in index \"%s\": file version %d, "
165 "current version %d, minimal supported version %d",
166 RelationGetRelationName(rel),
167 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
168
169 return metad;
170}
171
172/*
173 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
174 *
175 * Called by btvacuumcleanup when btbulkdelete was never called because no
176 * index tuples needed to be deleted.
177 */
178bool
179_bt_vacuum_needs_cleanup(Relation rel)
180{
181 Buffer metabuf;
182 Page metapg;
183 BTMetaPageData *metad;
184 uint32 btm_version;
185 BlockNumber prev_num_delpages;
186
187 /*
188 * Copy details from metapage to local variables quickly.
189 *
190 * Note that we deliberately avoid using cached version of metapage here.
191 */
192 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
193 metapg = BufferGetPage(metabuf);
194 metad = BTPageGetMeta(metapg);
195 btm_version = metad->btm_version;
196
197 if (btm_version < BTREE_NOVAC_VERSION)
198 {
199 /*
200 * Metapage needs to be dynamically upgraded to store fields that are
201 * only present when btm_version >= BTREE_NOVAC_VERSION
202 */
203 _bt_relbuf(rel, metabuf);
204 return true;
205 }
206
207 prev_num_delpages = metad->btm_last_cleanup_num_delpages;
208 _bt_relbuf(rel, metabuf);
209
210 /*
211 * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
212 * total size of the index. We can reasonably expect (though are not
213 * guaranteed) to be able to recycle this many pages if we decide to do a
214 * btvacuumscan call during the ongoing btvacuumcleanup. For further
215 * details see the nbtree/README section on placing deleted pages in the
216 * FSM.
217 */
218 if (prev_num_delpages > 0 &&
219 prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
220 return true;
221
222 return false;
223}
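/*
 * Worked example of the threshold above (figures purely illustrative): for
 * an index of 10,000 blocks, RelationGetNumberOfBlocks(rel) / 20 is 500, so
 * a saved btm_last_cleanup_num_delpages of 501 or more makes btvacuumcleanup
 * go ahead with a full btvacuumscan call, while 500 or fewer does not.
 */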
224
225/*
226 * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
227 *
228 * Called at the end of btvacuumcleanup, when num_delpages value has been
229 * finalized.
230 */
231void
232_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
233{
234 Buffer metabuf;
235 Page metapg;
236 BTMetaPageData *metad;
237
238 /*
239 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
240 * field started out as a TransactionId field called btm_oldest_btpo_xact.
241 * Both "versions" are just uint32 fields. It was convenient to repurpose
242 * the field when we began to use 64-bit XIDs in deleted pages.
243 *
244 * It's possible that a pg_upgrade'd database will contain an XID value in
245 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
246 * field. _bt_vacuum_needs_cleanup() may even believe that this value
247 * indicates that there are lots of pages that it needs to recycle, when
248 * in reality there are only one or two. The worst that can happen is
249 * that there will be a call to btvacuumscan a little earlier, which will
250 * set btm_last_cleanup_num_delpages to a sane value when we're called.
251 *
252 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
253 * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
254 * to be consistent.
255 */
256 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
257 metapg = BufferGetPage(metabuf);
258 metad = BTPageGetMeta(metapg);
259
260 /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
261 if (metad->btm_version >= BTREE_NOVAC_VERSION &&
262 metad->btm_last_cleanup_num_delpages == num_delpages)
263 {
264 /* Usually means index continues to have num_delpages of 0 */
265 _bt_relbuf(rel, metabuf);
266 return;
267 }
268
269 /* trade in our read lock for a write lock */
270 _bt_unlockbuf(rel, metabuf);
271 _bt_lockbuf(rel, metabuf, BT_WRITE);
272
273 START_CRIT_SECTION();
274
275 /* upgrade meta-page if needed */
276 if (metad->btm_version < BTREE_NOVAC_VERSION)
277 _bt_upgrademetapage(metapg);
278
279 /* update cleanup-related information */
280 metad->btm_last_cleanup_num_delpages = num_delpages;
281 metad->btm_last_cleanup_num_heap_tuples = -1.0;
282 MarkBufferDirty(metabuf);
283
284 /* write wal record if needed */
285 if (RelationNeedsWAL(rel))
286 {
287 xl_btree_metadata md;
288 XLogRecPtr recptr;
289
290 XLogBeginInsert();
291 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
292
293 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
294 md.version = metad->btm_version;
295 md.root = metad->btm_root;
296 md.level = metad->btm_level;
297 md.fastroot = metad->btm_fastroot;
298 md.fastlevel = metad->btm_fastlevel;
299 md.last_cleanup_num_delpages = num_delpages;
300 md.allequalimage = metad->btm_allequalimage;
301
302 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
303
304 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
305
306 PageSetLSN(metapg, recptr);
307 }
308
309 END_CRIT_SECTION();
310
311 _bt_relbuf(rel, metabuf);
312}
313
314/*
315 * _bt_getroot() -- Get the root page of the btree.
316 *
317 * Since the root page can move around the btree file, we have to read
318 * its location from the metadata page, and then read the root page
319 * itself. If no root page exists yet, we have to create one.
320 *
321 * The access type parameter (BT_READ or BT_WRITE) controls whether
322 * a new root page will be created or not. If access = BT_READ,
323 * and no root page exists, we just return InvalidBuffer. For
324 * BT_WRITE, we try to create the root page if it doesn't exist.
325 * NOTE that the returned root page will have only a read lock set
326 * on it even if access = BT_WRITE!
327 *
328 * If access = BT_WRITE, heaprel must be set; otherwise caller can just
329 * pass NULL. See _bt_allocbuf for an explanation.
330 *
331 * The returned page is not necessarily the true root --- it could be
332 * a "fast root" (a page that is alone in its level due to deletions).
333 * Also, if the root page is split while we are "in flight" to it,
334 * what we will return is the old root, which is now just the leftmost
335 * page on a probably-not-very-wide level. For most purposes this is
336 * as good as or better than the true root, so we do not bother to
337 * insist on finding the true root. We do, however, guarantee to
338 * return a live (not deleted or half-dead) page.
339 *
340 * On successful return, the root page is pinned and read-locked.
341 * The metadata page is not locked or pinned on exit.
342 */
343Buffer
344_bt_getroot(Relation rel, Relation heaprel, int access)
345{
346 Buffer metabuf;
347 Buffer rootbuf;
348 Page rootpage;
349 BTPageOpaque rootopaque;
350 BlockNumber rootblkno;
351 uint32 rootlevel;
352 BTMetaPageData *metad;
353
354 Assert(access == BT_READ || heaprel != NULL);
355
356 /*
357 * Try to use previously-cached metapage data to find the root. This
358 * normally saves one buffer access per index search, which is a very
359 * helpful savings in bufmgr traffic and hence contention.
360 */
361 if (rel->rd_amcache != NULL)
362 {
363 metad = (BTMetaPageData *) rel->rd_amcache;
364 /* We shouldn't have cached it if any of these fail */
365 Assert(metad->btm_magic == BTREE_MAGIC);
366 Assert(metad->btm_version >= BTREE_MIN_VERSION);
367 Assert(metad->btm_version <= BTREE_VERSION);
368 Assert(!metad->btm_allequalimage ||
369 metad->btm_version > BTREE_NOVAC_VERSION);
370 Assert(metad->btm_root != P_NONE);
371
372 rootblkno = metad->btm_fastroot;
373 Assert(rootblkno != P_NONE);
374 rootlevel = metad->btm_fastlevel;
375
376 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377 rootpage = BufferGetPage(rootbuf);
378 rootopaque = BTPageGetOpaque(rootpage);
379
380 /*
381 * Since the cache might be stale, we check the page more carefully
382 * here than normal. We *must* check that it's not deleted. If it's
383 * not alone on its level, then we reject too --- this may be overly
384 * paranoid but better safe than sorry. Note we don't check P_ISROOT,
385 * because that's not set in a "fast root".
386 */
387 if (!P_IGNORE(rootopaque) &&
388 rootopaque->btpo_level == rootlevel &&
389 P_LEFTMOST(rootopaque) &&
390 P_RIGHTMOST(rootopaque))
391 {
392 /* OK, accept cached page as the root */
393 return rootbuf;
394 }
395 _bt_relbuf(rel, rootbuf);
396 /* Cache is stale, throw it away */
397 if (rel->rd_amcache)
398 pfree(rel->rd_amcache);
399 rel->rd_amcache = NULL;
400 }
401
402 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403 metad = _bt_getmeta(rel, metabuf);
404
405 /* if no root page initialized yet, do it */
406 if (metad->btm_root == P_NONE)
407 {
408 Page metapg;
409
410 /* If access = BT_READ, caller doesn't want us to create root yet */
411 if (access == BT_READ)
412 {
413 _bt_relbuf(rel, metabuf);
414 return InvalidBuffer;
415 }
416
417 /* trade in our read lock for a write lock */
418 _bt_unlockbuf(rel, metabuf);
419 _bt_lockbuf(rel, metabuf, BT_WRITE);
420
421 /*
422 * Race condition: if someone else initialized the metadata between
423 * the time we released the read lock and acquired the write lock, we
424 * must avoid doing it again.
425 */
426 if (metad->btm_root != P_NONE)
427 {
428 /*
429 * Metadata initialized by someone else. In order to guarantee no
430 * deadlocks, we have to release the metadata page and start all
431 * over again. (Is that really true? But it's hardly worth trying
432 * to optimize this case.)
433 */
434 _bt_relbuf(rel, metabuf);
435 return _bt_getroot(rel, heaprel, access);
436 }
437
438 /*
439 * Get, initialize, write, and leave a lock of the appropriate type on
440 * the new root page. Since this is the first page in the tree, it's
441 * a leaf as well as the root.
442 */
443 rootbuf = _bt_allocbuf(rel, heaprel);
444 rootblkno = BufferGetBlockNumber(rootbuf);
445 rootpage = BufferGetPage(rootbuf);
446 rootopaque = BTPageGetOpaque(rootpage);
447 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449 rootopaque->btpo_level = 0;
450 rootopaque->btpo_cycleid = 0;
451 /* Get raw page pointer for metapage */
452 metapg = BufferGetPage(metabuf);
453
454 /* NO ELOG(ERROR) till meta is updated */
455 START_CRIT_SECTION();
456
457 /* upgrade metapage if needed */
458 if (metad->btm_version < BTREE_NOVAC_VERSION)
459 _bt_upgrademetapage(metapg);
460
461 metad->btm_root = rootblkno;
462 metad->btm_level = 0;
463 metad->btm_fastroot = rootblkno;
464 metad->btm_fastlevel = 0;
465 metad->btm_last_cleanup_num_delpages = 0;
466 metad->btm_last_cleanup_num_heap_tuples = -1.0;
467
468 MarkBufferDirty(rootbuf);
469 MarkBufferDirty(metabuf);
470
471 /* XLOG stuff */
472 if (RelationNeedsWAL(rel))
473 {
474 xl_btree_newroot xlrec;
475 XLogRecPtr recptr;
476 xl_btree_metadata md;
477
478 XLogBeginInsert();
479 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481
482 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483 md.version = metad->btm_version;
484 md.root = rootblkno;
485 md.level = 0;
486 md.fastroot = rootblkno;
487 md.fastlevel = 0;
488 md.last_cleanup_num_delpages = 0;
489 md.allequalimage = metad->btm_allequalimage;
490
491 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
492
493 xlrec.rootblk = rootblkno;
494 xlrec.level = 0;
495
496 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
497
498 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499
500 PageSetLSN(rootpage, recptr);
501 PageSetLSN(metapg, recptr);
502 }
503
504 END_CRIT_SECTION();
505
506 /*
507 * swap root write lock for read lock. There is no danger of anyone
508 * else accessing the new root page while it's unlocked, since no one
509 * else knows where it is yet.
510 */
511 _bt_unlockbuf(rel, rootbuf);
512 _bt_lockbuf(rel, rootbuf, BT_READ);
513
514 /* okay, metadata is correct, release lock on it without caching */
515 _bt_relbuf(rel, metabuf);
516 }
517 else
518 {
519 rootblkno = metad->btm_fastroot;
520 Assert(rootblkno != P_NONE);
521 rootlevel = metad->btm_fastlevel;
522
523 /*
524 * Cache the metapage data for next time
525 */
526 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527 sizeof(BTMetaPageData));
528 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529
530 /*
531 * We are done with the metapage; arrange to release it via first
532 * _bt_relandgetbuf call
533 */
534 rootbuf = metabuf;
535
536 for (;;)
537 {
538 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539 rootpage = BufferGetPage(rootbuf);
540 rootopaque = BTPageGetOpaque(rootpage);
541
542 if (!P_IGNORE(rootopaque))
543 break;
544
545 /* it's dead, Jim. step right one page */
546 if (P_RIGHTMOST(rootopaque))
547 elog(ERROR, "no live root page found in index \"%s\"",
548 RelationGetRelationName(rel));
549 rootblkno = rootopaque->btpo_next;
550 }
551
552 if (rootopaque->btpo_level != rootlevel)
553 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554 rootblkno, RelationGetRelationName(rel),
555 rootopaque->btpo_level, rootlevel);
556 }
557
558 /*
559 * By here, we have a pin and read lock on the root page, and no lock set
560 * on the metadata page. Return the root page's buffer.
561 */
562 return rootbuf;
563}
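/*
 * Illustrative sketch (hypothetical function): the typical read-only caller
 * pattern for _bt_getroot().  A BT_READ caller may pass NULL for heaprel and
 * must be prepared for InvalidBuffer, which means the index has no root page
 * at all yet.
 */
static uint32
example_root_level(Relation rel)
{
    Buffer      rootbuf;
    BTPageOpaque rootopaque;
    uint32      level;

    rootbuf = _bt_getroot(rel, NULL, BT_READ);
    if (!BufferIsValid(rootbuf))
        return 0;               /* completely empty index */

    rootopaque = BTPageGetOpaque(BufferGetPage(rootbuf));
    level = rootopaque->btpo_level;

    _bt_relbuf(rel, rootbuf);
    return level;
}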
564
565/*
566 * _bt_gettrueroot() -- Get the true root page of the btree.
567 *
568 * This is the same as the BT_READ case of _bt_getroot(), except
569 * we follow the true-root link not the fast-root link.
570 *
571 * By the time we acquire lock on the root page, it might have been split and
572 * not be the true root anymore. This is okay for the present uses of this
573 * routine; we only really need to be able to move up at least one tree level
574 * from whatever non-root page we were at. If we ever do need to lock the
575 * one true root page, we could loop here, re-reading the metapage on each
576 * failure. (Note that it wouldn't do to hold the lock on the metapage while
577 * moving to the root --- that'd deadlock against any concurrent root split.)
578 */
579Buffer
580_bt_gettrueroot(Relation rel)
581{
582 Buffer metabuf;
583 Page metapg;
584 BTPageOpaque metaopaque;
585 Buffer rootbuf;
586 Page rootpage;
587 BTPageOpaque rootopaque;
588 BlockNumber rootblkno;
589 uint32 rootlevel;
590 BTMetaPageData *metad;
591
592 /*
593 * We don't try to use cached metapage data here, since (a) this path is
594 * not performance-critical, and (b) if we are here it suggests our cache
595 * is out-of-date anyway. In light of point (b), it's probably safest to
596 * actively flush any cached metapage info.
597 */
598 if (rel->rd_amcache)
599 pfree(rel->rd_amcache);
600 rel->rd_amcache = NULL;
601
602 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603 metapg = BufferGetPage(metabuf);
604 metaopaque = BTPageGetOpaque(metapg);
605 metad = BTPageGetMeta(metapg);
606
607 if (!P_ISMETA(metaopaque) ||
608 metad->btm_magic != BTREE_MAGIC)
609 ereport(ERROR,
610 (errcode(ERRCODE_INDEX_CORRUPTED),
611 errmsg("index \"%s\" is not a btree",
612 RelationGetRelationName(rel))));
613
614 if (metad->btm_version < BTREE_MIN_VERSION ||
615 metad->btm_version > BTREE_VERSION)
616 ereport(ERROR,
617 (errcode(ERRCODE_INDEX_CORRUPTED),
618 errmsg("version mismatch in index \"%s\": file version %d, "
619 "current version %d, minimal supported version %d",
620 RelationGetRelationName(rel),
621 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
622
623 /* if no root page initialized yet, fail */
624 if (metad->btm_root == P_NONE)
625 {
626 _bt_relbuf(rel, metabuf);
627 return InvalidBuffer;
628 }
629
630 rootblkno = metad->btm_root;
631 rootlevel = metad->btm_level;
632
633 /*
634 * We are done with the metapage; arrange to release it via first
635 * _bt_relandgetbuf call
636 */
637 rootbuf = metabuf;
638
639 for (;;)
640 {
641 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642 rootpage = BufferGetPage(rootbuf);
643 rootopaque = BTPageGetOpaque(rootpage);
644
645 if (!P_IGNORE(rootopaque))
646 break;
647
648 /* it's dead, Jim. step right one page */
649 if (P_RIGHTMOST(rootopaque))
650 elog(ERROR, "no live root page found in index \"%s\"",
651 RelationGetRelationName(rel));
652 rootblkno = rootopaque->btpo_next;
653 }
654
655 if (rootopaque->btpo_level != rootlevel)
656 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657 rootblkno, RelationGetRelationName(rel),
658 rootopaque->btpo_level, rootlevel);
659
660 return rootbuf;
661}
662
663/*
664 * _bt_getrootheight() -- Get the height of the btree search tree.
665 *
666 * We return the level (counting from zero) of the current fast root.
667 * This represents the number of tree levels we'd have to descend through
668 * to start any btree index search.
669 *
670 * This is used by the planner for cost-estimation purposes. Since it's
671 * only an estimate, slightly-stale data is fine, hence we don't worry
672 * about updating previously cached data.
673 */
674int
675_bt_getrootheight(Relation rel)
676{
677 BTMetaPageData *metad;
678
679 if (rel->rd_amcache == NULL)
680 {
681 Buffer metabuf;
682
683 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684 metad = _bt_getmeta(rel, metabuf);
685
686 /*
687 * If there's no root page yet, _bt_getroot() doesn't expect a cache
688 * to be made, so just stop here and report the index height is zero.
689 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690 */
691 if (metad->btm_root == P_NONE)
692 {
693 _bt_relbuf(rel, metabuf);
694 return 0;
695 }
696
697 /*
698 * Cache the metapage data for next time
699 */
700 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
701 sizeof(BTMetaPageData));
702 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703 _bt_relbuf(rel, metabuf);
704 }
705
706 /* Get cached page */
707 metad = (BTMetaPageData *) rel->rd_amcache;
708 /* We shouldn't have cached it if any of these fail */
709 Assert(metad->btm_magic == BTREE_MAGIC);
710 Assert(metad->btm_version >= BTREE_MIN_VERSION);
711 Assert(metad->btm_version <= BTREE_VERSION);
712 Assert(!metad->btm_allequalimage ||
713 metad->btm_version > BTREE_NOVAC_VERSION);
714 Assert(metad->btm_fastroot != P_NONE);
715
716 return metad->btm_fastlevel;
717}
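/*
 * Worked example: a fast root whose btpo_level is 2 means a search descends
 * through two levels of internal pages (the level-2 fast root and a level-1
 * internal page) before it reads a level-0 leaf page -- three page accesses
 * per descent, which is the quantity the planner's cost model is after.
 */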
718
719/*
720 * _bt_metaversion() -- Get version/status info from metapage.
721 *
722 * Sets caller's *heapkeyspace and *allequalimage arguments using data
723 * from the B-Tree metapage (could be locally-cached version). This
724 * information needs to be stashed in insertion scankey, so we provide a
725 * single function that fetches both at once.
726 *
727 * This is used to determine the rules that must be used to descend a
728 * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
729 * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
730 * performance when inserting a new BTScanInsert-wise duplicate tuple
731 * among many leaf pages already full of such duplicates.
732 *
733 * Also sets allequalimage field, which indicates whether or not it is
734 * safe to apply deduplication. We rely on the assumption that
735 * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
736 * pg_upgrade'd from Postgres 12.
737 */
738void
739_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
740{
741 BTMetaPageData *metad;
742
743 if (rel->rd_amcache == NULL)
744 {
745 Buffer metabuf;
746
747 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
748 metad = _bt_getmeta(rel, metabuf);
749
750 /*
751 * If there's no root page yet, _bt_getroot() doesn't expect a cache
752 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
753 * be changed to allow this case.)
754 */
755 if (metad->btm_root == P_NONE)
756 {
757 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
758 *allequalimage = metad->btm_allequalimage;
759
760 _bt_relbuf(rel, metabuf);
761 return;
762 }
763
764 /*
765 * Cache the metapage data for next time
766 *
767 * An on-the-fly version upgrade performed by _bt_upgrademetapage()
768 * can change the nbtree version for an index without invalidating any
769 * local cache. This is okay because it can only happen when moving
770 * from version 2 to version 3, both of which are !heapkeyspace
771 * versions.
772 */
773 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
774 sizeof(BTMetaPageData));
775 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
776 _bt_relbuf(rel, metabuf);
777 }
778
779 /* Get cached page */
780 metad = (BTMetaPageData *) rel->rd_amcache;
781 /* We shouldn't have cached it if any of these fail */
782 Assert(metad->btm_magic == BTREE_MAGIC);
783 Assert(metad->btm_version >= BTREE_MIN_VERSION);
784 Assert(metad->btm_version <= BTREE_VERSION);
785 Assert(!metad->btm_allequalimage ||
786 metad->btm_version > BTREE_NOVAC_VERSION);
787 Assert(metad->btm_fastroot != P_NONE);
788
789 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
790 *allequalimage = metad->btm_allequalimage;
791}
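/*
 * Illustrative sketch (hypothetical function): how a caller that builds an
 * insertion scankey consumes this routine; this mirrors the pattern used by
 * _bt_mkscankey() in nbtutils.c.
 */
static void
example_fill_scankey_flags(Relation rel, BTScanInsert key)
{
    /* heapkeyspace governs descent rules; allequalimage gates deduplication */
    _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
}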
792
793/*
794 * _bt_checkpage() -- Verify that a freshly-read page looks sane.
795 */
796void
797_bt_checkpage(Relation rel, Buffer buf)
798{
799 Page page = BufferGetPage(buf);
800
801 /*
802 * ReadBuffer verifies that every newly-read page passes
803 * PageHeaderIsValid, which means it either contains a reasonably sane
804 * page header or is all-zero. We have to defend against the all-zero
805 * case, however.
806 */
807 if (PageIsNew(page))
808 ereport(ERROR,
809 (errcode(ERRCODE_INDEX_CORRUPTED),
810 errmsg("index \"%s\" contains unexpected zero page at block %u",
811 RelationGetRelationName(rel),
812 BufferGetBlockNumber(buf)),
813 errhint("Please REINDEX it.")));
814
815 /*
816 * Additionally check that the special area looks sane.
817 */
818 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
819 ereport(ERROR,
820 (errcode(ERRCODE_INDEX_CORRUPTED),
821 errmsg("index \"%s\" contains corrupted page at block %u",
822 RelationGetRelationName(rel),
823 BufferGetBlockNumber(buf)),
824 errhint("Please REINDEX it.")));
825}
826
827/*
828 * _bt_getbuf() -- Get an existing block in a buffer, for read or write.
829 *
830 * The general rule in nbtree is that it's never okay to access a
831 * page without holding both a buffer pin and a buffer lock on
832 * the page's buffer.
833 *
834 * When this routine returns, the appropriate lock is set on the
835 * requested buffer and its reference count has been incremented
836 * (ie, the buffer is "locked and pinned"). Also, we apply
837 * _bt_checkpage to sanity-check the page, and perform Valgrind
838 * client requests that help Valgrind detect unsafe page accesses.
839 *
840 * Note: raw LockBuffer() calls are disallowed in nbtree; all
841 * buffer lock requests need to go through wrapper functions such
842 * as _bt_lockbuf().
843 */
844Buffer
845_bt_getbuf(Relation rel, BlockNumber blkno, int access)
846{
847 Buffer buf;
848
849 Assert(BlockNumberIsValid(blkno));
850
851 /* Read an existing block of the relation */
852 buf = ReadBuffer(rel, blkno);
853 _bt_lockbuf(rel, buf, access);
854 _bt_checkpage(rel, buf);
855
856 return buf;
857}
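/*
 * Illustrative sketch (hypothetical function): the standard way to consume
 * _bt_getbuf().  Every successful call must eventually be paired with
 * _bt_relbuf(), which drops both the buffer lock and the pin.
 */
static bool
example_page_is_leaf(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    BTPageOpaque opaque;
    bool        isleaf;

    buf = _bt_getbuf(rel, blkno, BT_READ);
    opaque = BTPageGetOpaque(BufferGetPage(buf));
    isleaf = P_ISLEAF(opaque);
    _bt_relbuf(rel, buf);

    return isleaf;
}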
858
859/*
860 * _bt_allocbuf() -- Allocate a new block/page.
861 *
862 * Returns a write-locked buffer containing an unallocated nbtree page.
863 *
864 * Callers are required to pass a valid heaprel. We need heaprel so that we
865 * can handle generating a snapshotConflictHorizon that makes reusing a page
866 * from the FSM safe for queries that may be running on standbys.
867 */
868Buffer
869_bt_allocbuf(Relation rel, Relation heaprel)
870{
871 Buffer buf;
872 BlockNumber blkno;
873 Page page;
874
875 Assert(heaprel != NULL);
876
877 /*
878 * First see if the FSM knows of any free pages.
879 *
880 * We can't trust the FSM's report unreservedly; we have to check that the
881 * page is still free. (For example, an already-free page could have been
882 * re-used between the time the last VACUUM scanned it and the time the
883 * VACUUM made its FSM updates.)
884 *
885 * In fact, it's worse than that: we can't even assume that it's safe to
886 * take a lock on the reported page. If somebody else has a lock on it,
887 * or even worse our own caller does, we could deadlock. (The own-caller
888 * scenario is actually not improbable. Consider an index on a serial or
889 * timestamp column. Nearly all splits will be at the rightmost page, so
890 * it's entirely likely that _bt_split will call us while holding a lock
891 * on the page most recently acquired from FSM. A VACUUM running
892 * concurrently with the previous split could well have placed that page
893 * back in FSM.)
894 *
895 * To get around that, we ask for only a conditional lock on the reported
896 * page. If we fail, then someone else is using the page, and we may
897 * reasonably assume it's not free. (If we happen to be wrong, the worst
898 * consequence is the page will be lost to use till the next VACUUM, which
899 * is no big problem.)
900 */
901 for (;;)
902 {
903 blkno = GetFreeIndexPage(rel);
904 if (blkno == InvalidBlockNumber)
905 break;
906 buf = ReadBuffer(rel, blkno);
907 if (_bt_conditionallockbuf(rel, buf))
908 {
909 page = BufferGetPage(buf);
910
911 /*
912 * It's possible to find an all-zeroes page in an index. For
913 * example, a backend might successfully extend the relation one
914 * page and then crash before it is able to make a WAL entry for
915 * adding the page. If we find a zeroed page then reclaim it
916 * immediately.
917 */
918 if (PageIsNew(page))
919 {
920 /* Okay to use page. Initialize and return it. */
921 _bt_pageinit(page, BufferGetPageSize(buf));
922 return buf;
923 }
924
925 if (BTPageIsRecyclable(page, heaprel))
926 {
927 /*
928 * If we are generating WAL for Hot Standby then create a WAL
929 * record that will allow us to conflict with queries running
930 * on standby, in case they have snapshots older than safexid
931 * value
932 */
933 if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
934 {
935 xl_btree_reuse_page xlrec_reuse;
936
937 /*
938 * Note that we don't register the buffer with the record,
939 * because this operation doesn't modify the page (that
940 * already happened, back when VACUUM deleted the page).
941 * This record only exists to provide a conflict point for
942 * Hot Standby. See record REDO routine comments.
943 */
944 xlrec_reuse.locator = rel->rd_locator;
945 xlrec_reuse.block = blkno;
946 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
947 xlrec_reuse.isCatalogRel =
948 RelationIsAccessibleInLogicalDecoding(heaprel);
949
950 XLogBeginInsert();
951 XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
952
953 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954 }
955
956 /* Okay to use page. Re-initialize and return it. */
957 _bt_pageinit(page, BufferGetPageSize(buf));
958 return buf;
959 }
960 elog(DEBUG2, "FSM returned nonrecyclable page");
961 _bt_relbuf(rel, buf);
962 }
963 else
964 {
965 elog(DEBUG2, "FSM returned nonlockable page");
966 /* couldn't get lock, so just drop pin */
967 ReleaseBuffer(buf);
968 }
969 }
970
971 /*
972 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973 * risk a race condition against btvacuumscan --- see comments therein.
974 * This forces us to repeat the valgrind request that _bt_lockbuf()
975 * otherwise would make, as we can't use _bt_lockbuf() without introducing
976 * a race.
977 */
978 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
979 if (!RelationUsesLocalBuffers(rel))
980 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
981
982 /* Initialize the new page before returning it */
983 page = BufferGetPage(buf);
984 Assert(PageIsNew(page));
985 _bt_pageinit(page, BufferGetPageSize(buf));
986
987 return buf;
988}
989
990/*
991 * _bt_relandgetbuf() -- release a locked buffer and get another one.
992 *
993 * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is
994 * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
995 * simplifies some callers.
996 *
997 * The original motivation for using this was to avoid two entries to the
998 * bufmgr when one would do. However, now it's mainly just a notational
999 * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
1000 * is when the target page is the same one already in the buffer.
1001 */
1002Buffer
1003_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
1004{
1005 Buffer buf;
1006
1007 Assert(BlockNumberIsValid(blkno));
1008 if (BufferIsValid(obuf))
1009 _bt_unlockbuf(rel, obuf);
1010 buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1011 _bt_lockbuf(rel, buf, access);
1012
1013 _bt_checkpage(rel, buf);
1014 return buf;
1015}
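/*
 * Illustrative sketch (hypothetical function): _bt_relandgetbuf() is the
 * natural primitive for the classic nbtree "move right" loop, which hops
 * across sibling links while never holding more than one page lock at a
 * time.
 */
static Buffer
example_step_right_over_ignored(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);

    while (P_IGNORE(opaque) && !P_RIGHTMOST(opaque))
    {
        /* release the current page, then pin and lock its right sibling */
        buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);
        page = BufferGetPage(buf);
        opaque = BTPageGetOpaque(page);
    }

    return buf;
}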
1016
1017/*
1018 * _bt_relbuf() -- release a locked buffer.
1019 *
1020 * Lock and pin (refcount) are both dropped.
1021 */
1022void
1023_bt_relbuf(Relation rel, Buffer buf)
1024{
1025 _bt_unlockbuf(rel, buf);
1026 ReleaseBuffer(buf);
1027}
1028
1029/*
1030 * _bt_lockbuf() -- lock a pinned buffer.
1031 *
1032 * Lock is acquired without acquiring another pin. This is like a raw
1033 * LockBuffer() call, but performs extra steps needed by Valgrind.
1034 *
1035 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1036 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1037 */
1038void
1039_bt_lockbuf(Relation rel, Buffer buf, int access)
1040{
1041 /* LockBuffer() asserts that pin is held by this backend */
1042 LockBuffer(buf, access);
1043
1044 /*
1045 * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1046 * an nbtree error (e.g. a unique violation error). That won't cause
1047 * Valgrind false positives.
1048 *
1049 * The nbtree client requests are superimposed on top of the bufmgr.c
1050 * buffer pin client requests. In the event of an nbtree error the buffer
1051 * will certainly get marked as defined when the backend once again
1052 * acquires its first pin on the buffer. (Of course, if the backend never
1053 * touches the buffer again then it doesn't matter that it remains
1054 * non-accessible to Valgrind.)
1055 *
1056 * Note: When an IndexTuple C pointer gets computed using an ItemId read
1057 * from a page while a lock was held, the C pointer becomes unsafe to
1058 * dereference forever as soon as the lock is released. Valgrind can only
1059 * detect cases where the pointer gets dereferenced with no _current_
1060 * lock/pin held, though.
1061 */
1062 if (!RelationUsesLocalBuffers(rel))
1063 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1064}
1065
1066/*
1067 * _bt_unlockbuf() -- unlock a pinned buffer.
1068 */
1069void
1070_bt_unlockbuf(Relation rel, Buffer buf)
1071{
1072 /*
1073 * Buffer is pinned and locked, which means that it is expected to be
1074 * defined and addressable. Check that proactively.
1075 */
1076 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1077
1078 /* LockBuffer() asserts that pin is held by this backend */
1079 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1080
1081 if (!RelationUsesLocalBuffers(rel))
1082 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1083}
1084
1085/*
1086 * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1087 * buffer.
1088 *
1089 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1090 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1091 */
1092bool
1093_bt_conditionallockbuf(Relation rel, Buffer buf)
1094{
1095 /* ConditionalLockBuffer() asserts that pin is held by this backend */
1096 if (!ConditionalLockBuffer(buf))
1097 return false;
1098
1099 if (!RelationUsesLocalBuffers(rel))
1100 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1101
1102 return true;
1103}
1104
1105/*
1106 * _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
1107 */
1108void
1109_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
1110{
1111 /*
1112 * Buffer is pinned and locked, which means that it is expected to be
1113 * defined and addressable. Check that proactively.
1114 */
1115 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1116
1117 /* LockBuffer() asserts that pin is held by this backend */
1118 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1119 LockBufferForCleanup(buf);
1120}
1121
1122/*
1123 * _bt_pageinit() -- Initialize a new page.
1124 *
1125 * On return, the page header is initialized; data space is empty;
1126 * special space is zeroed out.
1127 */
1128void
1129_bt_pageinit(Page page, Size size)
1130{
1131 PageInit(page, size, sizeof(BTPageOpaqueData));
1132}
1133
1134/*
1135 * Delete item(s) from a btree leaf page during VACUUM.
1136 *
1137 * This routine assumes that the caller already has a full cleanup lock on
1138 * the buffer. Also, the given deletable and updatable arrays *must* be
1139 * sorted in ascending order.
1140 *
1141 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1142 * in an existing posting list item are to be removed. This works by
1143 * updating/overwriting an existing item with caller's new version of the item
1144 * (a version that lacks the TIDs that are to be deleted).
1145 *
1146 * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
1147 * generate their own snapshotConflictHorizon directly from the tableam,
1148 * whereas VACUUMs rely on the initial VACUUM table scan performing
1149 * WAL-logging that takes care of the issue for the table's indexes
1150 * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree
1151 * deletes don't do.
1152 */
1153void
1154_bt_delitems_vacuum(Relation rel, Buffer buf,
1155 OffsetNumber *deletable, int ndeletable,
1156 BTVacuumPosting *updatable, int nupdatable)
1157{
1158 Page page = BufferGetPage(buf);
1159 BTPageOpaque opaque;
1160 bool needswal = RelationNeedsWAL(rel);
1161 char *updatedbuf = NULL;
1162 Size updatedbuflen = 0;
1163 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164
1165 /* Shouldn't be called unless there's something to do */
1166 Assert(ndeletable > 0 || nupdatable > 0);
1167
1168 /* Generate new version of posting lists without deleted TIDs */
1169 if (nupdatable > 0)
1170 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171 updatedoffsets, &updatedbuflen,
1172 needswal);
1173
1174 /* No ereport(ERROR) until changes are logged */
1175 START_CRIT_SECTION();
1176
1177 /*
1178 * Handle posting tuple updates.
1179 *
1180 * Deliberately do this before handling simple deletes. If we did it the
1181 * other way around (i.e. WAL record order -- simple deletes before
1182 * updates) then we'd have to make compensating changes to the 'updatable'
1183 * array of offset numbers.
1184 *
1185 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186 * happens to already be set. It's important that we not interfere with
1187 * any future simple index tuple deletion operations.
1188 */
1189 for (int i = 0; i < nupdatable; i++)
1190 {
1191 OffsetNumber updatedoffset = updatedoffsets[i];
1192 IndexTuple itup;
1193 Size itemsz;
1194
1195 itup = updatable[i]->itup;
1196 itemsz = MAXALIGN(IndexTupleSize(itup));
1197 if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1198 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1199 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1200 }
1201
1202 /* Now handle simple deletes of entire tuples */
1203 if (ndeletable > 0)
1204 PageIndexMultiDelete(page, deletable, ndeletable);
1205
1206 /*
1207 * We can clear the vacuum cycle ID since this page has certainly been
1208 * processed by the current vacuum scan.
1209 */
1210 opaque = BTPageGetOpaque(page);
1211 opaque->btpo_cycleid = 0;
1212
1213 /*
1214 * Clear the BTP_HAS_GARBAGE page flag.
1215 *
1216 * This flag indicates the presence of LP_DEAD items on the page (though
1217 * not reliably). Note that we only rely on it with pg_upgrade'd
1218 * !heapkeyspace indexes. That's why clearing it here won't usually
1219 * interfere with simple index tuple deletion.
1220 */
1221 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1222
1223 MarkBufferDirty(buf);
1224
1225 /* XLOG stuff */
1226 if (needswal)
1227 {
1228 XLogRecPtr recptr;
1229 xl_btree_vacuum xlrec_vacuum;
1230
1231 xlrec_vacuum.ndeleted = ndeletable;
1232 xlrec_vacuum.nupdated = nupdatable;
1233
1234 XLogBeginInsert();
1235 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1236 XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1237
1238 if (ndeletable > 0)
1239 XLogRegisterBufData(0, deletable,
1240 ndeletable * sizeof(OffsetNumber));
1241
1242 if (nupdatable > 0)
1243 {
1244 XLogRegisterBufData(0, updatedoffsets,
1245 nupdatable * sizeof(OffsetNumber));
1246 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1247 }
1248
1249 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1250
1251 PageSetLSN(page, recptr);
1252 }
1253
1254 END_CRIT_SECTION();
1255
1256 /* can't leak memory here */
1257 if (updatedbuf != NULL)
1258 pfree(updatedbuf);
1259 /* free tuples allocated within _bt_delitems_update() */
1260 for (int i = 0; i < nupdatable; i++)
1261 pfree(updatable[i]->itup);
1262}
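/*
 * Illustrative sketch (hypothetical wrapper): how a VACUUM-side caller such
 * as btvacuumpage() in nbtree.c might invoke the routine above once it has
 * gathered the deletable offsets and rewritten posting lists for one leaf
 * page.  The caller must already hold a full cleanup lock on buf, and both
 * arrays must be sorted in ascending offset order.
 */
static void
example_vacuum_leaf_page(Relation rel, Buffer buf,
                         OffsetNumber *deletable, int ndeletable,
                         BTVacuumPosting *updatable, int nupdatable)
{
    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);
}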
1263
1264/*
1265 * Delete item(s) from a btree leaf page during single-page cleanup.
1266 *
1267 * This routine assumes that the caller has pinned and write locked the
1268 * buffer. Also, the given deletable and updatable arrays *must* be sorted in
1269 * ascending order.
1270 *
1271 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1272 * in an existing posting list item are to be removed. This works by
1273 * updating/overwriting an existing item with caller's new version of the item
1274 * (a version that lacks the TIDs that are to be deleted).
1275 *
1276 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1277 * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
1278 * (from the tableam). This is used by the REDO routine to generate recovery
1279 * conflicts. The other difference is that only _bt_delitems_vacuum will
1280 * clear page's VACUUM cycle ID.
1281 */
1282static void
1283_bt_delitems_delete(Relation rel, Buffer buf,
1284 TransactionId snapshotConflictHorizon, bool isCatalogRel,
1285 OffsetNumber *deletable, int ndeletable,
1286 BTVacuumPosting *updatable, int nupdatable)
1287{
1288 Page page = BufferGetPage(buf);
1289 BTPageOpaque opaque;
1290 bool needswal = RelationNeedsWAL(rel);
1291 char *updatedbuf = NULL;
1292 Size updatedbuflen = 0;
1293 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1294
1295 /* Shouldn't be called unless there's something to do */
1296 Assert(ndeletable > 0 || nupdatable > 0);
1297
1298 /* Generate new versions of posting lists without deleted TIDs */
1299 if (nupdatable > 0)
1300 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1301 updatedoffsets, &updatedbuflen,
1302 needswal);
1303
1304 /* No ereport(ERROR) until changes are logged */
1305 START_CRIT_SECTION();
1306
1307 /* Handle updates and deletes just like _bt_delitems_vacuum */
1308 for (int i = 0; i < nupdatable; i++)
1309 {
1310 OffsetNumber updatedoffset = updatedoffsets[i];
1311 IndexTuple itup;
1312 Size itemsz;
1313
1314 itup = updatable[i]->itup;
1315 itemsz = MAXALIGN(IndexTupleSize(itup));
1316 if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1317 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1318 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1319 }
1320
1321 if (ndeletable > 0)
1322 PageIndexMultiDelete(page, deletable, ndeletable);
1323
1324 /*
1325 * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1326 * this point. The VACUUM command alone controls vacuum cycle IDs.
1327 */
1328 opaque = BTPageGetOpaque(page);
1329
1330 /*
1331 * Clear the BTP_HAS_GARBAGE page flag.
1332 *
1333 * This flag indicates the presence of LP_DEAD items on the page (though
1334 * not reliably). Note that we only rely on it with pg_upgrade'd
1335 * !heapkeyspace indexes.
1336 */
1337 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1338
1338
1339 MarkBufferDirty(buf);
1340
1341 /* XLOG stuff */
1342 if (needswal)
1343 {
1344 XLogRecPtr recptr;
1345 xl_btree_delete xlrec_delete;
1346
1347 xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
1348 xlrec_delete.ndeleted = ndeletable;
1349 xlrec_delete.nupdated = nupdatable;
1350 xlrec_delete.isCatalogRel = isCatalogRel;
1351
1352 XLogBeginInsert();
1353 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1354 XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
1355
1356 if (ndeletable > 0)
1357 XLogRegisterBufData(0, deletable,
1358 ndeletable * sizeof(OffsetNumber));
1359
1360 if (nupdatable > 0)
1361 {
1362 XLogRegisterBufData(0, updatedoffsets,
1363 nupdatable * sizeof(OffsetNumber));
1364 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1365 }
1366
1367 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1368
1369 PageSetLSN(page, recptr);
1370 }
1371
1372 END_CRIT_SECTION();
1373
1374 /* can't leak memory here */
1375 if (updatedbuf != NULL)
1376 pfree(updatedbuf);
1377 /* free tuples allocated within _bt_delitems_update() */
1378 for (int i = 0; i < nupdatable; i++)
1379 pfree(updatable[i]->itup);
1380}
1381
1382/*
1383 * Set up state needed to delete TIDs from posting list tuples via "updating"
1384 * the tuple. Performs steps common to both _bt_delitems_vacuum and
1385 * _bt_delitems_delete. These steps must take place before each function's
1386 * critical section begins.
1387 *
1388 * updatable and nupdatable are inputs, though note that we will use
1389 * _bt_update_posting() to replace the original itup with a pointer to a final
1390 * version in palloc()'d memory. Caller should free the tuples when its done.
1391 *
1392 * The first nupdatable entries from updatedoffsets are set to the page offset
1393 * number for posting list tuples that caller updates. This is mostly useful
1394 * because caller may need to WAL-log the page offsets (though we always do
1395 * this for caller out of convenience).
1396 *
1397 * Returns buffer consisting of an array of xl_btree_update structs that
1398 * describe the steps we perform here for caller (though only when needswal is
1399 * true). Also sets *updatedbuflen to the final size of the buffer. This
1400 * buffer is used by caller when WAL logging is required.
1401 */
1402static char *
1403_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
1404 OffsetNumber *updatedoffsets, Size *updatedbuflen,
1405 bool needswal)
1406{
1407 char *updatedbuf = NULL;
1408 Size buflen = 0;
1409
1410 /* Shouldn't be called unless there's something to do */
1411 Assert(nupdatable > 0);
1412
1413 for (int i = 0; i < nupdatable; i++)
1414 {
1415 BTVacuumPosting vacposting = updatable[i];
1416 Size itemsz;
1417
1418 /* Replace work area IndexTuple with updated version */
1419 _bt_update_posting(vacposting);
1420
1421 /* Keep track of size of xl_btree_update for updatedbuf in passing */
1422 itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1423 buflen += itemsz;
1424
1425 /* Build updatedoffsets buffer in passing */
1426 updatedoffsets[i] = vacposting->updatedoffset;
1427 }
1428
1429 /* XLOG stuff */
1430 if (needswal)
1431 {
1432 Size offset = 0;
1433
1434 /* Allocate, set final size for caller */
1435 updatedbuf = palloc(buflen);
1436 *updatedbuflen = buflen;
1437 for (int i = 0; i < nupdatable; i++)
1438 {
1439 BTVacuumPosting vacposting = updatable[i];
1440 Size itemsz;
1441 xl_btree_update update;
1442
1443 update.ndeletedtids = vacposting->ndeletedtids;
1444 memcpy(updatedbuf + offset, &update.ndeletedtids,
1445 sizeof(update.ndeletedtids));
1446 offset += SizeOfBtreeUpdate;
1447
1448 itemsz = update.ndeletedtids * sizeof(uint16);
1449 memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1450 offset += itemsz;
1451 }
1452 }
1453
1454 return updatedbuf;
1455}
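/*
 * Layout sketch of the buffer returned above when needswal is true: for each
 * updated posting list tuple the buffer holds one xl_btree_update header
 * (just the ndeletedtids count) followed immediately by that many uint16
 * indexes into the original tuple's posting list.  For two tuples with 2 and
 * 1 deleted TIDs respectively the buffer looks like:
 *
 *      [ndeletedtids=2][idx][idx] [ndeletedtids=1][idx]
 *
 * The REDO routines walk this data together with the updatedoffsets array to
 * rebuild each rewritten tuple.
 */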
1456
1457/*
1458 * Comparator used by _bt_delitems_delete_check() to restore deltids array
1459 * back to its original leaf-page-wise sort order
1460 */
1461static int
1462_bt_delitems_cmp(const void *a, const void *b)
1463{
1464 TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1465 TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1466
1467 Assert(indexdelete1->id != indexdelete2->id);
1468
1469 return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
1470}
1471
1472/*
1473 * Try to delete item(s) from a btree leaf page during single-page cleanup.
1474 *
1475 * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
1476 * tuples from caller's deltids array: those whose TIDs are found safe to
1477 * delete by the tableam (or already marked LP_DEAD in index, and so already
1478 * known to be deletable by our simple index deletion caller). We physically
1479 * delete index tuples from buf leaf page last of all (for index tuples where
1480 * that is known to be safe following our table_index_delete_tuples() call).
1481 *
1482 * Simple index deletion caller only includes TIDs from index tuples marked
1483 * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1484 * included without increasing the total number of distinct table blocks for
1485 * the deletion operation as a whole. This approach often allows us to delete
1486 * some extra index tuples that were practically free for tableam to check in
1487 * passing (when they actually turn out to be safe to delete). It probably
1488 * only makes sense for the tableam to go ahead with these extra checks when
1489 * it is block-oriented (otherwise the checks probably won't be practically
1490 * free, which we rely on). The tableam interface requires the tableam side
1491 * to handle the problem, though, so this is okay (we as an index AM are free
1492 * to make the simplifying assumption that all tableams must be block-based).
1493 *
1494 * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1495 * without expecting that tableam will check most of them. The tableam has
1496 * considerable discretion around which entries/blocks it checks. Our role in
1497 * costing the bottom-up deletion operation is strictly advisory.
1498 *
1499 * Note: Caller must have added deltids entries (i.e. entries that go in
1500 * delstate's main array) in leaf-page-wise order: page offset number order,
1501 * TID order among entries taken from the same posting list tuple (tiebreak on
1502 * TID). This order is convenient to work with here.
1503 *
1504 * Note: We also rely on the id field of each deltids element "capturing" this
1505 * original leaf-page-wise order. That is, we expect to be able to get back
1506 * to the original leaf-page-wise order just by sorting deltids on the id
1507 * field (tableam will sort deltids for its own reasons, so we'll need to put
1508 * it back in leaf-page-wise order afterwards).
1509 */
1510void
1511_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
1512 TM_IndexDeleteOp *delstate)
1513{
1514 Page page = BufferGetPage(buf);
1515 TransactionId snapshotConflictHorizon;
1516 bool isCatalogRel;
1517 OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1518 int ndeletable = 0,
1519 nupdatable = 0;
1520 OffsetNumber deletable[MaxIndexTuplesPerPage];
1521 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1522
1523 /* Use tableam interface to determine which tuples to delete first */
1524 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1525 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1526
1527 /* Should not WAL-log snapshotConflictHorizon unless it's required */
1528 if (!XLogStandbyInfoActive())
1529 snapshotConflictHorizon = InvalidTransactionId;
1530
1531 /*
1532 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1533 * needs to do to physically delete index tuples from the page.
1534 *
1535 * Must sort deltids array to restore leaf-page-wise order (original order
1536 * before call to tableam). This is the order that the loop expects.
1537 *
1538 * Note that deltids array might be a lot smaller now. It might even have
1539 * no entries at all (with bottom-up deletion caller), in which case there
1540 * is nothing left to do.
1541 */
1542 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1543 _bt_delitems_cmp);
1544 if (delstate->ndeltids == 0)
1545 {
1546 Assert(delstate->bottomup);
1547 return;
1548 }
1549
1550 /* We definitely have to delete at least one index tuple (or one TID) */
1551 for (int i = 0; i < delstate->ndeltids; i++)
1552 {
1553 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1554 OffsetNumber idxoffnum = dstatus->idxoffnum;
1555 ItemId itemid = PageGetItemId(page, idxoffnum);
1556 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1557 int nestedi,
1558 nitem;
1559 BTVacuumPosting vacposting;
1560
1561 Assert(OffsetNumberIsValid(idxoffnum));
1562
1563 if (idxoffnum == postingidxoffnum)
1564 {
1565 /*
1566 * This deltid entry is a TID from a posting list tuple that has
1567 * already been completely processed
1568 */
1569 Assert(BTreeTupleIsPosting(itup));
1570 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1571 &delstate->deltids[i].tid) < 0);
1572 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1573 &delstate->deltids[i].tid) >= 0);
1574 continue;
1575 }
1576
1577 if (!BTreeTupleIsPosting(itup))
1578 {
1579 /* Plain non-pivot tuple */
1580 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1581 if (dstatus->knowndeletable)
1582 deletable[ndeletable++] = idxoffnum;
1583 continue;
1584 }
1585
1586 /*
1587 * itup is a posting list tuple whose lowest deltids entry (which may
1588 * or may not be for the first TID from itup) is considered here now.
1589 * We should process all of the deltids entries for the posting list
1590 * together now, though (not just the lowest). Remember to skip over
1591 * later itup-related entries during later iterations of outermost
1592 * loop.
1593 */
1594 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1595 nestedi = i; /* Initialize for first itup deltids entry */
1596 vacposting = NULL; /* Describes final action for itup */
1597 nitem = BTreeTupleGetNPosting(itup);
1598 for (int p = 0; p < nitem; p++)
1599 {
1600 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1601 int ptidcmp = -1;
1602
1603 /*
1604 * This nested loop reuses work across ptid TIDs taken from itup.
1605 * We take advantage of the fact that both itup's TIDs and deltids
1606 * entries (within a single itup/posting list grouping) must both
1607 * be in ascending TID order.
1608 */
1609 for (; nestedi < delstate->ndeltids; nestedi++)
1610 {
1611 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1612 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1613
1614 /* Stop once we get past all itup related deltids entries */
1615 Assert(tdstatus->idxoffnum >= idxoffnum);
1616 if (tdstatus->idxoffnum != idxoffnum)
1617 break;
1618
1619 /* Skip past non-deletable itup related entries up front */
1620 if (!tdstatus->knowndeletable)
1621 continue;
1622
1623 /* Entry is first partial ptid match (or an exact match)? */
1624 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1625 if (ptidcmp >= 0)
1626 {
1627 /* Greater than or equal (partial or exact) match... */
1628 break;
1629 }
1630 }
1631
1632 /* ...exact ptid match to a deletable deltids entry? */
1633 if (ptidcmp != 0)
1634 continue;
1635
1636 /* Exact match for deletable deltids entry -- ptid gets deleted */
1637 if (vacposting == NULL)
1638 {
1639 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1640 nitem * sizeof(uint16));
1641 vacposting->itup = itup;
1642 vacposting->updatedoffset = idxoffnum;
1643 vacposting->ndeletedtids = 0;
1644 }
1645 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1646 }
1647
1648 /* Final decision on itup, a posting list tuple */
1649
1650 if (vacposting == NULL)
1651 {
1652 /* No TIDs to delete from itup -- do nothing */
1653 }
1654 else if (vacposting->ndeletedtids == nitem)
1655 {
1656 /* Straight delete of itup (to delete all TIDs) */
1657 deletable[ndeletable++] = idxoffnum;
1658 /* Turns out we won't need granular information */
1659 pfree(vacposting);
1660 }
1661 else
1662 {
1663 /* Delete some (but not all) TIDs from itup */
1664 Assert(vacposting->ndeletedtids > 0 &&
1665 vacposting->ndeletedtids < nitem);
1666 updatable[nupdatable++] = vacposting;
1667 }
1668 }
1669
1670 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1671 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1672 deletable, ndeletable, updatable, nupdatable);
1673
1674 /* be tidy */
1675 for (int i = 0; i < nupdatable; i++)
1676 pfree(updatable[i]);
1677}
1678
1679/*
1680 * Check that leftsib page (the btpo_prev of target page) is not marked with
1681 * INCOMPLETE_SPLIT flag. Used during page deletion.
1682 *
1683 * Returning true indicates that page flag is set in leftsib (which is
1684 * definitely still the left sibling of target). When that happens, the
1685 * target doesn't have a downlink in parent, and the page deletion algorithm
1686 * isn't prepared to handle that. Deletion of the target page (or the whole
1687 * subtree that contains the target page) cannot take place.
1688 *
1689 * Caller should not have a lock on the target page itself, since pages on the
1690 * same level must always be locked left to right to avoid deadlocks.
1691 */
1692static bool
1693_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1694{
1695 Buffer buf;
1696 Page page;
1697 BTPageOpaque opaque;
1698 bool result;
1699
1700 /* Easy case: No left sibling */
1701 if (leftsib == P_NONE)
1702 return false;
1703
1704 buf = _bt_getbuf(rel, leftsib, BT_READ);
1705 page = BufferGetPage(buf);
1706 opaque = BTPageGetOpaque(page);
1707
1708 /*
1709 * If the left sibling was concurrently split, so that its next-pointer
1710 * doesn't point to the current page anymore, the split that created
1711 * target must be completed. Caller can reasonably expect that there will
1712 * be a downlink to the target page that it can relocate using its stack.
1713 * (We don't allow splitting an incompletely split page again until the
1714 * previous split has been completed.)
1715 */
1716 result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1717 _bt_relbuf(rel, buf);
1718
1719 return result;
1720}
1721
1722/*
1723 * Check that leafrightsib page (the btpo_next of target leaf page) is not
1724 * marked with ISHALFDEAD flag. Used during page deletion.
1725 *
1726 * Returning true indicates that page flag is set in leafrightsib, so page
1727 * deletion cannot go ahead. Our caller is not prepared to deal with the case
1728 * where the parent page does not have a pivot tuple whose downlink points to
1729 * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1730 * seem worth going to the trouble of teaching our caller to deal with it.
1731 * The situation will be resolved after VACUUM finishes the deletion of the
1732 * half-dead page (when a future VACUUM operation reaches the target page
1733 * again).
1734 *
1735 * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1736 * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1737 * okay because of the restriction on deleting pages that are the rightmost
1738 * page of their parent (i.e. that such deletions can only take place when the
1739 * entire subtree must be deleted). The leaf level check made here will apply
1740 * to a right "cousin" leaf page rather than a simple right sibling leaf page
1741 * in cases where caller actually goes on to attempt deleting pages that are
1742 * above the leaf page. The right cousin leaf page is representative of the
1743 * left edge of the subtree to the right of the to-be-deleted subtree as a
1744 * whole, which is exactly the condition that our caller cares about.
1745 * (Besides, internal pages are never marked half-dead, so it isn't even
1746 * possible to _directly_ assess if an internal page is part of some other
1747 * to-be-deleted subtree.)
1748 */
1749static bool
1750_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1751{
1752 Buffer buf;
1753 Page page;
1754 BTPageOpaque opaque;
1755 bool result;
1756
1757 Assert(leafrightsib != P_NONE);
1758
1759 buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1760 page = BufferGetPage(buf);
1761 opaque = BTPageGetOpaque(page);
1762
1763 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1764 result = P_ISHALFDEAD(opaque);
1765 _bt_relbuf(rel, buf);
1766
1767 return result;
1768}
1769
1770/*
1771 * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1772 *
1773 * This action unlinks the leaf page from the b-tree structure, removing all
1774 * pointers leading to it --- but not touching its own left and right links.
1775 * The page cannot be physically reclaimed right away, since other processes
1776 * may currently be trying to follow links leading to the page; they have to
1777 * be allowed to use its right-link to recover. See nbtree/README.
1778 *
1779 * On entry, the target buffer must be pinned and locked (either read or write
1780 * lock is OK). The page must be an empty leaf page, which may be half-dead
1781 * already (a half-dead page should only be passed to us when an earlier
1782 * VACUUM operation was interrupted, though). Note in particular that caller
1783 * should never pass a buffer containing an existing deleted page here. The
1784 * lock and pin on caller's buffer will be dropped before we return.
1785 *
1786 * Maintains bulk delete stats for caller, which are taken from vstate. We
1787 * need to cooperate closely with caller here so that whole VACUUM operation
1788 * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1789 * delete in passing. If such pages happen to be from a block number that is
1790 * ahead of the current scanblkno position, then caller is expected to count
1791 * them directly later on. It's simpler for us to understand caller's
1792 * requirements than it would be for caller to understand when or how a
1793 * deleted page became deleted after the fact.
1794 *
1795 * NOTE: this leaks memory. Rather than trying to clean up everything
1796 * carefully, it's better to run it in a temp context that can be reset
1797 * frequently.
1798 */
1799void
1800_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
1801{
1802 BlockNumber rightsib;
1803 bool rightsib_empty;
1804 Page page;
1805 BTPageOpaque opaque;
1806
1807 /*
1808 * Save original leafbuf block number from caller. Only deleted blocks
1809 * that are <= scanblkno are added to bulk delete stat's pages_deleted
1810 * count.
1811 */
1812 BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1813
1814 /*
1815 * "stack" is a search stack leading (approximately) to the target page.
1816 * It is initially NULL, but when iterating, we keep it to avoid
1817 * duplicated search effort.
1818 *
1819 * Also, when "stack" is not NULL, we have already checked that the
1820 * current page is not the right half of an incomplete split, i.e. the
1821 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1822 * when the current target page is to the right of caller's initial page
1823 * (the scanblkno page).
1824 */
1825 BTStack stack = NULL;
1826
1827 for (;;)
1828 {
1829 page = BufferGetPage(leafbuf);
1830 opaque = BTPageGetOpaque(page);
1831
1832 /*
1833 * Internal pages are never deleted directly, only as part of deleting
1834 * the whole subtree all the way down to leaf level.
1835 *
1836 * Also check for deleted pages here. Caller never passes us a fully
1837 * deleted page. Only VACUUM can delete pages, so there can't have
1838 * been a concurrent deletion. Assume that we reached any deleted
1839 * page encountered here by following a sibling link, and that the
1840 * index is corrupt.
1841 */
1842 Assert(!P_ISDELETED(opaque));
1843 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1844 {
1845 /*
1846 * Pre-9.4 page deletion only marked internal pages as half-dead,
1847 * but now we only use that flag on leaf pages. The old algorithm
1848 * was never supposed to leave half-dead pages in the tree, it was
1849 * just a transient state, but it was nevertheless possible in
1850 * error scenarios. We don't know how to deal with them here. They
1851 * are harmless as far as searches are concerned, but inserts
1852 * into the deleted keyspace could add out-of-order downlinks in
1853 * the upper levels. Log a notice, hopefully the admin will notice
1854 * and reindex.
1855 */
1856 if (P_ISHALFDEAD(opaque))
1857 ereport(LOG,
1858 (errcode(ERRCODE_INDEX_CORRUPTED),
1859 errmsg("index \"%s\" contains a half-dead internal page",
1860 RelationGetRelationName(rel)),
1861 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1862
1863 if (P_ISDELETED(opaque))
1864 ereport(LOG,
1865 (errcode(ERRCODE_INDEX_CORRUPTED),
1866 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1867 BufferGetBlockNumber(leafbuf),
1868 scanblkno,
1869 RelationGetRelationName(rel))));
1870
1871 _bt_relbuf(rel, leafbuf);
1872 return;
1873 }
1874
1875 /*
1876 * We can never delete rightmost pages nor root pages. While at it,
1877 * check that page is empty, since it's possible that the leafbuf page
1878 * was empty a moment ago, but has since had some inserts.
1879 *
1880 * To keep the algorithm simple, we also never delete an incompletely
1881 * split page (they should be rare enough that this doesn't make any
1882 * meaningful difference to disk usage):
1883 *
1884 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1885 * left half of an incomplete split, but ensuring that it's not the
1886 * right half is more complicated. For that, we have to check that
1887 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1888 * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1889 * release the lock on scanblkno/leafbuf, check the left sibling, and
1890 * construct a search stack to scanblkno. On subsequent iterations,
1891 * we know we stepped right from a page that passed these tests, so
1892 * it's OK.
1893 */
1894 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1895 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1896 P_INCOMPLETE_SPLIT(opaque))
1897 {
1898 /* Should never fail to delete a half-dead page */
1899 Assert(!P_ISHALFDEAD(opaque));
1900
1901 _bt_relbuf(rel, leafbuf);
1902 return;
1903 }
1904
1905 /*
1906 * First, remove downlink pointing to the page (or a parent of the
1907 * page, if we are going to delete a taller subtree), and mark the
1908 * leafbuf page half-dead
1909 */
1910 if (!P_ISHALFDEAD(opaque))
1911 {
1912 /*
1913 * We need an approximate pointer to the page's parent page. We
1914 * use a variant of the standard search mechanism to search for
1915 * the page's high key; this will give us a link to either the
1916 * current parent or someplace to its left (if there are multiple
1917 * equal high keys, which is possible with !heapkeyspace indexes).
1918 *
1919 * Also check if this is the right-half of an incomplete split
1920 * (see comment above).
1921 */
1922 if (!stack)
1923 {
1924 BTScanInsert itup_key;
1925 ItemId itemid;
1926 IndexTuple targetkey;
1927 BlockNumber leftsib,
1928 leafblkno;
1929 Buffer sleafbuf;
1930
1931 itemid = PageGetItemId(page, P_HIKEY);
1932 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1933
1934 leftsib = opaque->btpo_prev;
1935 leafblkno = BufferGetBlockNumber(leafbuf);
1936
1937 /*
1938 * To avoid deadlocks, we'd better drop the leaf page lock
1939 * before going further.
1940 */
1941 _bt_unlockbuf(rel, leafbuf);
1942
1943 /*
1944 * Check that the left sibling of leafbuf (if any) is not
1945 * marked with INCOMPLETE_SPLIT flag before proceeding
1946 */
1947 Assert(leafblkno == scanblkno);
1948 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1949 {
1950 ReleaseBuffer(leafbuf);
1951 return;
1952 }
1953
1954 /*
1955 * We need an insertion scan key, so build one.
1956 *
1957 * _bt_search searches for the leaf page that contains any
1958 * matching non-pivot tuples, but we need it to "search" for
1959 * the high key pivot from the page that we're set to delete.
1960 * Compensate for the mismatch by having _bt_search locate the
1961 * last position < equal-to-untruncated-prefix non-pivots.
1962 */
1963 itup_key = _bt_mkscankey(rel, targetkey);
1964
1965 /* Set up a BTLessStrategyNumber-like insertion scan key */
1966 itup_key->nextkey = false;
1967 itup_key->backward = true;
1968 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
1969 /* won't need a second lock or pin on leafbuf */
1970 _bt_relbuf(rel, sleafbuf);
1971
1972 /*
1973 * Re-lock the leaf page, and start over to use our stack
1974 * within _bt_mark_page_halfdead. We must do it that way
1975 * because it's possible that leafbuf can no longer be
1976 * deleted. We need to recheck.
1977 *
1978 * Note: We can't simply hold on to the sleafbuf lock instead,
1979 * because it's barely possible that sleafbuf is not the same
1980 * page as leafbuf. This happens when leafbuf split after our
1981 * original lock was dropped, but before _bt_search finished
1982 * its descent. We rely on the assumption that we'll find
1983 * leafbuf isn't safe to delete anymore in this scenario.
1984 * (Page deletion can cope with the stack being to the left of
1985 * leafbuf, but not to the right of leafbuf.)
1986 */
1987 _bt_lockbuf(rel, leafbuf, BT_WRITE);
1988 continue;
1989 }
1990
1991 /*
1992 * See if it's safe to delete the leaf page, and determine how
1993 * many parent/internal pages above the leaf level will be
1994 * deleted. If it's safe then _bt_mark_page_halfdead will also
1995 * perform the first phase of deletion, which includes marking the
1996 * leafbuf page half-dead.
1997 */
1998 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
1999 if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2000 stack))
2001 {
2002 _bt_relbuf(rel, leafbuf);
2003 return;
2004 }
2005 }
2006
2007 /*
2008 * Then unlink it from its siblings. Each call to
2009 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2010 * making it shallower. Iterate until the leafbuf page is deleted.
2011 */
2012 rightsib_empty = false;
2013 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2014 while (P_ISHALFDEAD(opaque))
2015 {
2016 /* Check for interrupts in _bt_unlink_halfdead_page */
2017 if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2018 &rightsib_empty, vstate))
2019 {
2020 /*
2021 * _bt_unlink_halfdead_page should never fail, since we
2022 * established that deletion is generally safe in
2023 * _bt_mark_page_halfdead -- index must be corrupt.
2024 *
2025 * Note that _bt_unlink_halfdead_page already released the
2026 * lock and pin on leafbuf for us.
2027 */
2028 Assert(false);
2029 return;
2030 }
2031 }
2032
2033 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2034
2035 rightsib = opaque->btpo_next;
2036
2037 _bt_relbuf(rel, leafbuf);
2038
2039 /*
2040 * Check here, as calling loops will have locks held, preventing
2041 * interrupts from being processed.
2042 */
2043 CHECK_FOR_INTERRUPTS();
2044
2045 /*
2046 * The page has now been deleted. If its right sibling is completely
2047 * empty, it's possible that the reason we haven't deleted it earlier
2048 * is that it was the rightmost child of the parent. Now that we
2049 * removed the downlink for this page, the right sibling might now be
2050 * the only child of the parent, and could be removed. It would be
2051 * picked up by the next vacuum anyway, but might as well try to
2052 * remove it now, so loop back to process the right sibling.
2053 *
2054 * Note: This relies on the assumption that _bt_getstackbuf() will be
2055 * able to reuse our original descent stack with a different child
2056 * block (provided that the child block is to the right of the
2057 * original leaf page reached by _bt_search()). It will even update
2058 * the descent stack each time we loop around, avoiding repeated work.
2059 */
2060 if (!rightsib_empty)
2061 break;
2062
2063 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2064 }
2065}
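/*
 * Illustrative caller sketch (editor's addition, not part of nbtpage.c): as
 * the header comment above says, _bt_pagedel() leaks memory and drops the
 * caller's lock and pin on leafbuf, so a VACUUM caller would typically run it
 * in a short-lived memory context and not touch the buffer afterwards.  The
 * names used below (pagedelcontext, oldcontext) are assumptions of this
 * sketch rather than verbatim caller code:
 *
 *     oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
 *     _bt_pagedel(rel, buf, vstate);      // releases lock and pin on buf
 *     MemoryContextSwitchTo(oldcontext);
 *     MemoryContextReset(vstate->pagedelcontext);
 */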
2066
2067/*
2068 * First stage of page deletion.
2069 *
2070 * Establish the height of the to-be-deleted subtree with leafbuf at its
2071 * lowest level, remove the downlink to the subtree, and mark leafbuf
2072 * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
2073 * but may include additional internal pages (at most one per level of the
2074 * tree below the root).
2075 *
2076 * Caller must pass a valid heaprel, since it's just about possible that our
2077 * call to _bt_lock_subtree_parent will need to allocate a new index page to
2078 * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel.
2079 *
2080 * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2081 * the rightmost child of its parent (and parent has more than one downlink).
2082 * Returns 'true' when the first stage of page deletion completed
2083 * successfully.
2084 */
2085static bool
2086_bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
2087 BTStack stack)
2088{
2089 BlockNumber leafblkno;
2090 BlockNumber leafrightsib;
2091 BlockNumber topparent;
2092 BlockNumber topparentrightsib;
2093 ItemId itemid;
2094 Page page;
2095 BTPageOpaque opaque;
2096 Buffer subtreeparent;
2097 OffsetNumber poffset;
2098 OffsetNumber nextoffset;
2099 IndexTuple itup;
2100 IndexTupleData trunctuple;
2101
2102 page = BufferGetPage(leafbuf);
2103 opaque = BTPageGetOpaque(page);
2104
2105 Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2106 P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2107 P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2108 Assert(heaprel != NULL);
2109
2110 /*
2111 * Save info about the leaf page.
2112 */
2113 leafblkno = BufferGetBlockNumber(leafbuf);
2114 leafrightsib = opaque->btpo_next;
2115
2116 /*
2117 * Before attempting to lock the parent page, check that the right sibling
2118 * is not in half-dead state. A half-dead right sibling would have no
2119 * downlink in the parent, which would be highly confusing later when we
2120 * delete the downlink. It would fail the "right sibling of target page
2121 * is also the next child in parent page" cross-check below.
2122 */
2123 if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2124 {
2125 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2126 leafblkno, leafrightsib);
2127 return false;
2128 }
2129
2130 /*
2131 * We cannot delete a page that is the rightmost child of its immediate
2132 * parent, unless it is the only child --- in which case the parent has to
2133 * be deleted too, and the same condition applies recursively to it. We
2134 * have to check this condition all the way up before trying to delete,
2135 * and lock the parent of the root of the to-be-deleted subtree (the
2136 * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2137 * for us. We remove the downlink to the "top parent" page (subtree root
2138 * page) from the subtree parent page below.
2139 *
2140 * Initialize topparent to be leafbuf page now. The final to-be-deleted
2141 * subtree is often a degenerate one page subtree consisting only of the
2142 * leafbuf page. When that happens, the leafbuf page is the final subtree
2143 * root page/top parent page.
2144 */
2145 topparent = leafblkno;
2146 topparentrightsib = leafrightsib;
2147 if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
2148 &subtreeparent, &poffset,
2149 &topparent, &topparentrightsib))
2150 return false;
2151
2152 page = BufferGetPage(subtreeparent);
2153 opaque = BTPageGetOpaque(page);
2154
2155#ifdef USE_ASSERT_CHECKING
2156
2157 /*
2158 * This is just an assertion because _bt_lock_subtree_parent should have
2159 * guaranteed tuple has the expected contents
2160 */
2161 itemid = PageGetItemId(page, poffset);
2162 itup = (IndexTuple) PageGetItem(page, itemid);
2163 Assert(BTreeTupleGetDownLink(itup) == topparent);
2164#endif
2165
2166 nextoffset = OffsetNumberNext(poffset);
2167 itemid = PageGetItemId(page, nextoffset);
2168 itup = (IndexTuple) PageGetItem(page, itemid);
2169
2170 /*
2171 * Check that the parent-page index items we're about to delete/overwrite
2172 * in subtree parent page contain what we expect. This can fail if the
2173 * index has become corrupt for some reason. When that happens we back
2174 * out of deletion of the leafbuf subtree. (This is just like the case
2175 * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
2176 */
2177 if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2178 {
2179 ereport(LOG,
2180 (errcode(ERRCODE_INDEX_CORRUPTED),
2181 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2182 topparentrightsib, topparent,
2183 BTreeTupleGetDownLink(itup),
2184 BufferGetBlockNumber(subtreeparent),
2185 RelationGetRelationName(rel))));
2186
2187 _bt_relbuf(rel, subtreeparent);
2188 Assert(false);
2189 return false;
2190 }
2191
2192 /*
2193 * Any insert which would have gone on the leaf block will now go to its
2194 * right sibling. In other words, the key space moves right.
2195 */
2196 PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2197
2198 /* No ereport(ERROR) until changes are logged */
2199 START_CRIT_SECTION();
2200
2201 /*
2202 * Update parent of subtree. We want to delete the downlink to the top
2203 * parent page/root of the subtree, and the *following* key. Easiest way
2204 * is to copy the right sibling's downlink over the downlink that points
2205 * to top parent page, and then delete the right sibling's original pivot
2206 * tuple.
2207 *
2208 * Lanin and Shasha make the key space move left when deleting a page,
2209 * whereas the key space moves right here. That's why we cannot simply
2210 * delete the pivot tuple with the downlink to the top parent page. See
2211 * nbtree/README.
2212 */
2213 page = BufferGetPage(subtreeparent);
2214 opaque = BTPageGetOpaque(page);
2215
2216 itemid = PageGetItemId(page, poffset);
2217 itup = (IndexTuple) PageGetItem(page, itemid);
2218 BTreeTupleSetDownLink(itup, topparentrightsib);
2219
2220 nextoffset = OffsetNumberNext(poffset);
2221 PageIndexTupleDelete(page, nextoffset);
2222
2223 /*
2224 * Mark the leaf page as half-dead, and stamp it with a link to the top
2225 * parent page. When the leaf page is also the top parent page, the link
2226 * is set to InvalidBlockNumber.
2227 */
2228 page = BufferGetPage(leafbuf);
2229 opaque = BTPageGetOpaque(page);
2230 opaque->btpo_flags |= BTP_HALF_DEAD;
2231
2232 Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
2233 MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2234 trunctuple.t_info = sizeof(IndexTupleData);
2235 if (topparent != leafblkno)
2236 BTreeTupleSetTopParent(&trunctuple, topparent);
2237 else
2238 BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2239
2240 if (!PageIndexTupleOverwrite(page, P_HIKEY, &trunctuple, IndexTupleSize(&trunctuple)))
2241 elog(ERROR, "could not overwrite high key in half-dead page");
2242
2243 /* Must mark buffers dirty before XLogInsert */
2244 MarkBufferDirty(subtreeparent);
2245 MarkBufferDirty(leafbuf);
2246
2247 /* XLOG stuff */
2248 if (RelationNeedsWAL(rel))
2249 {
2250 xl_btree_mark_page_halfdead xlrec;
2251 XLogRecPtr recptr;
2252
2253 xlrec.poffset = poffset;
2254 xlrec.leafblk = leafblkno;
2255 if (topparent != leafblkno)
2256 xlrec.topparent = topparent;
2257 else
2258 xlrec.topparent = InvalidBlockNumber;
2259
2260 XLogBeginInsert();
2261 XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2262 XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2263
2264 page = BufferGetPage(leafbuf);
2265 opaque = BTPageGetOpaque(page);
2266 xlrec.leftblk = opaque->btpo_prev;
2267 xlrec.rightblk = opaque->btpo_next;
2268
2269 XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);
2270
2271 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2272
2273 page = BufferGetPage(subtreeparent);
2274 PageSetLSN(page, recptr);
2275 page = BufferGetPage(leafbuf);
2276 PageSetLSN(page, recptr);
2277 }
2278
2279 END_CRIT_SECTION();
2280
2281 _bt_relbuf(rel, subtreeparent);
2282 return true;
2283}
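/*
 * Illustrative note (editor's addition, not part of nbtpage.c): after a
 * successful call here, the half-dead leaf's high key has been overwritten
 * with the minimal "trunctuple" built above, whose only payload is the top
 * parent link.  _bt_unlink_halfdead_page() later reads that link back to find
 * the current root of the to-be-deleted subtree, roughly:
 *
 *     itemid = PageGetItemId(page, P_HIKEY);
 *     leafhikey = (IndexTuple) PageGetItem(page, itemid);
 *     target = BTreeTupleGetTopParent(leafhikey);
 *     // InvalidBlockNumber here means leafbuf is the whole subtree
 */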
2284
2285/*
2286 * Second stage of page deletion.
2287 *
2288 * Unlinks a single page (in the subtree undergoing deletion) from its
2289 * siblings. Also marks the page deleted.
2290 *
2291 * To get rid of the whole subtree, including the leaf page itself, call here
2292 * until the leaf page is deleted. The original "top parent" established in
2293 * the first stage of deletion is deleted in the first call here, while the
2294 * leaf page is deleted in the last call here. Note that the leaf page itself
2295 * is often the initial top parent page.
2296 *
2297 * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2298 * the right sibling of the current target page is empty, *rightsib_empty is
2299 * set to true, allowing caller to delete the target's right sibling page in
2300 * passing. Note that *rightsib_empty is only actually used by caller when
2301 * target page is leafbuf, following last call here for leafbuf/the subtree
2302 * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2303 * consistent.)
2304 *
2305 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2306 * On success exit, we'll be holding pin and write lock. On failure exit,
2307 * we'll release both pin and lock before returning (we define it that way
2308 * to avoid having to reacquire a lock we already released).
2309 */
2310static bool
2311_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2312 bool *rightsib_empty, BTVacState *vstate)
2313{
2314 BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2315 IndexBulkDeleteResult *stats = vstate->stats;
2316 BlockNumber leafleftsib;
2317 BlockNumber leafrightsib;
2318 BlockNumber target;
2319 BlockNumber leftsib;
2320 BlockNumber rightsib;
2321 Buffer lbuf = InvalidBuffer;
2322 Buffer buf;
2323 Buffer rbuf;
2324 Buffer metabuf = InvalidBuffer;
2325 Page metapg = NULL;
2326 BTMetaPageData *metad = NULL;
2327 ItemId itemid;
2328 Page page;
2329 BTPageOpaque opaque;
2330 FullTransactionId safexid;
2331 bool rightsib_is_rightmost;
2332 uint32 targetlevel;
2333 IndexTuple leafhikey;
2334 BlockNumber leaftopparent;
2335
2336 page = BufferGetPage(leafbuf);
2337 opaque = BTPageGetOpaque(page);
2338
2339 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2340
2341 /*
2342 * Remember some information about the leaf page.
2343 */
2344 itemid = PageGetItemId(page, P_HIKEY);
2345 leafhikey = (IndexTuple) PageGetItem(page, itemid);
2346 target = BTreeTupleGetTopParent(leafhikey);
2347 leafleftsib = opaque->btpo_prev;
2348 leafrightsib = opaque->btpo_next;
2349
2350 _bt_unlockbuf(rel, leafbuf);
2351
2352 /*
2353 * Check here, as calling loops will have locks held, preventing
2354 * interrupts from being processed.
2355 */
2356 CHECK_FOR_INTERRUPTS();
2357
2358 /* Unlink the current top parent of the subtree */
2359 if (!BlockNumberIsValid(target))
2360 {
2361 /* Target is leaf page (or leaf page is top parent, if you prefer) */
2362 target = leafblkno;
2363
2364 buf = leafbuf;
2365 leftsib = leafleftsib;
2366 targetlevel = 0;
2367 }
2368 else
2369 {
2370 /* Target is the internal page taken from leaf's top parent link */
2371 Assert(target != leafblkno);
2372
2373 /* Fetch the block number of the target's left sibling */
2374 buf = _bt_getbuf(rel, target, BT_READ);
2375 page = BufferGetPage(buf);
2376 opaque = BTPageGetOpaque(page);
2377 leftsib = opaque->btpo_prev;
2378 targetlevel = opaque->btpo_level;
2379 Assert(targetlevel > 0);
2380
2381 /*
2382 * To avoid deadlocks, we'd better drop the target page lock before
2383 * going further.
2384 */
2385 _bt_unlockbuf(rel, buf);
2386 }
2387
2388 /*
2389 * We have to lock the pages we need to modify in the standard order:
2390 * moving right, then up. Else we will deadlock against other writers.
2391 *
2392 * So, first lock the leaf page, if it's not the target. Then find and
2393 * write-lock the current left sibling of the target page. The sibling
2394 * that was current a moment ago could have split, so we may have to move
2395 * right.
2396 */
2397 if (target != leafblkno)
2398 _bt_lockbuf(rel, leafbuf, BT_WRITE);
2399 if (leftsib != P_NONE)
2400 {
2401 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2402 page = BufferGetPage(lbuf);
2403 opaque = BTPageGetOpaque(page);
2404 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2405 {
2406 bool leftsibvalid = true;
2407
2408 /*
2409 * Before we follow the link from the page that was the left
2410 * sibling mere moments ago, validate its right link. This
2411 * reduces the opportunities for the loop to fail to ever make any
2412 * progress in the presence of index corruption.
2413 *
2414 * Note: we rely on the assumption that there can only be one
2415 * vacuum process running at a time (against the same index).
2416 */
2417 if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2418 leftsib == opaque->btpo_next)
2419 leftsibvalid = false;
2420
2421 leftsib = opaque->btpo_next;
2422 _bt_relbuf(rel, lbuf);
2423
2424 if (!leftsibvalid)
2425 {
2426 /*
2427 * This is known to fail in the field; sibling link corruption
2428 * is relatively common. Press on with vacuuming rather than
2429 * just throwing an ERROR.
2430 */
2431 ereport(LOG,
2432 (errcode(ERRCODE_INDEX_CORRUPTED),
2433 errmsg_internal("valid left sibling for deletion target could not be located: "
2434 "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
2435 leftsib, target, leafblkno, scanblkno,
2436 targetlevel, RelationGetRelationName(rel))));
2437
2438 /* Must release all pins and locks on failure exit */
2439 ReleaseBuffer(buf);
2440 if (target != leafblkno)
2441 _bt_relbuf(rel, leafbuf);
2442
2443 return false;
2444 }
2445
2446 CHECK_FOR_INTERRUPTS();
2447
2448 /* step right one page */
2449 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2450 page = BufferGetPage(lbuf);
2451 opaque = BTPageGetOpaque(page);
2452 }
2453 }
2454 else
2455 lbuf = InvalidBuffer;
2456
2457 /* Next write-lock the target page itself */
2458 _bt_lockbuf(rel, buf, BT_WRITE);
2459 page = BufferGetPage(buf);
2460 opaque = BTPageGetOpaque(page);
2461
2462 /*
2463 * Check page is still empty etc, else abandon deletion. This is just for
2464 * paranoia's sake; a half-dead page cannot resurrect because there can be
2465 * only one vacuum process running at a time.
2466 */
2467 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2468 elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2469 target, RelationGetRelationName(rel));
2470
2471 if (opaque->btpo_prev != leftsib)
2472 ereport(ERROR,
2473 (errcode(ERRCODE_INDEX_CORRUPTED),
2474 errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2475 leftsib, opaque->btpo_prev, target,
2476 RelationGetRelationName(rel))));
2477
2478 if (target == leafblkno)
2479 {
2480 if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2481 !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2482 elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2483 target, RelationGetRelationName(rel));
2484
2485 /* Leaf page is also target page: don't set leaftopparent */
2486 leaftopparent = InvalidBlockNumber;
2487 }
2488 else
2489 {
2490 IndexTuple finaldataitem;
2491
2492 if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2493 P_ISLEAF(opaque))
2494 elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2495 targetlevel, target, RelationGetRelationName(rel));
2496
2497 /* Target is internal: set leaftopparent for next call here... */
2498 itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2499 finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2500 leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2501 /* ...except when it would be a redundant pointer-to-self */
2502 if (leaftopparent == leafblkno)
2503 leaftopparent = InvalidBlockNumber;
2504 }
2505
2506 /* No leaftopparent for level 0 (leaf page) or level 1 target */
2507 Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2508
2509 /*
2510 * And next write-lock the (current) right sibling.
2511 */
2512 rightsib = opaque->btpo_next;
2513 rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2514 page = BufferGetPage(rbuf);
2515 opaque = BTPageGetOpaque(page);
2516
2517 /*
2518 * Validate target's right sibling page. Its left link must point back to
2519 * the target page.
2520 */
2521 if (opaque->btpo_prev != target)
2522 {
2523 /*
2524 * This is known to fail in the field; sibling link corruption is
2525 * relatively common. Press on with vacuuming rather than just
2526 * throwing an ERROR (same approach used for left-sibling's-right-link
2527 * validation check a moment ago).
2528 */
2529 ereport(LOG,
2530 (errcode(ERRCODE_INDEX_CORRUPTED),
2531 errmsg_internal("right sibling's left-link doesn't match: "
2532 "right sibling %u of target %u with leafblkno %u "
2533 "and scanblkno %u spuriously links to non-target %u "
2534 "on level %u of index \"%s\"",
2535 rightsib, target, leafblkno,
2536 scanblkno, opaque->btpo_prev,
2537 targetlevel, RelationGetRelationName(rel))));
2538
2539 /* Must release all pins and locks on failure exit */
2540 if (BufferIsValid(lbuf))
2541 _bt_relbuf(rel, lbuf);
2542 _bt_relbuf(rel, rbuf);
2543 _bt_relbuf(rel, buf);
2544 if (target != leafblkno)
2545 _bt_relbuf(rel, leafbuf);
2546
2547 return false;
2548 }
2549
2550 rightsib_is_rightmost = P_RIGHTMOST(opaque);
2551 *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2552
2553 /*
2554 * If we are deleting the next-to-last page on the target's level, then
2555 * the rightsib is a candidate to become the new fast root. (In theory, it
2556 * might be possible to push the fast root even further down, but the odds
2557 * of doing so are slim, and the locking considerations daunting.)
2558 *
2559 * We can safely acquire a lock on the metapage here --- see comments for
2560 * _bt_newlevel().
2561 */
2562 if (leftsib == P_NONE && rightsib_is_rightmost)
2563 {
2564 page = BufferGetPage(rbuf);
2565 opaque = BTPageGetOpaque(page);
2566 if (P_RIGHTMOST(opaque))
2567 {
2568 /* rightsib will be the only one left on the level */
2569 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2570 metapg = BufferGetPage(metabuf);
2571 metad = BTPageGetMeta(metapg);
2572
2573 /*
2574 * The expected case here is btm_fastlevel == targetlevel+1; if
2575 * the fastlevel is <= targetlevel, something is wrong, and we
2576 * choose to overwrite it to fix it.
2577 */
2578 if (metad->btm_fastlevel > targetlevel + 1)
2579 {
2580 /* no update wanted */
2581 _bt_relbuf(rel, metabuf);
2582 metabuf = InvalidBuffer;
2583 }
2584 }
2585 }
2586
2587 /*
2588 * Here we begin doing the deletion.
2589 */
2590
2591 /* No ereport(ERROR) until changes are logged */
2592 START_CRIT_SECTION();
2593
2594 /*
2595 * Update siblings' side-links. Note the target page's side-links will
2596 * continue to point to the siblings. Asserts here are just rechecking
2597 * things we already verified above.
2598 */
2599 if (BufferIsValid(lbuf))
2600 {
2601 page = BufferGetPage(lbuf);
2602 opaque = BTPageGetOpaque(page);
2603 Assert(opaque->btpo_next == target);
2604 opaque->btpo_next = rightsib;
2605 }
2606 page = BufferGetPage(rbuf);
2607 opaque = BTPageGetOpaque(page);
2608 Assert(opaque->btpo_prev == target);
2609 opaque->btpo_prev = leftsib;
2610
2611 /*
2612 * If we deleted a parent of the targeted leaf page, instead of the leaf
2613 * itself, update the leaf to point to the next remaining child in the
2614 * subtree.
2615 *
2616 * Note: We rely on the fact that a buffer pin on the leaf page has been
2617 * held since leafhikey was initialized. This is safe, though only
2618 * because the page was already half-dead at that point. The leaf page
2619 * cannot have been modified by any other backend during the period when
2620 * no lock was held.
2621 */
2622 if (target != leafblkno)
2623 BTreeTupleSetTopParent(leafhikey, leaftopparent);
2624
2625 /*
2626 * Mark the page itself deleted. It can be recycled when all current
2627 * transactions are gone. Storing GetTopTransactionId() would work, but
2628 * we're in VACUUM and would not otherwise have an XID. Having already
2629 * updated links to the target, ReadNextFullTransactionId() suffices as an
2630 * upper bound. Any scan having retained a now-stale link is advertising
2631 * in its PGPROC an xmin less than or equal to the value we read here. It
2632 * will continue to do so, holding back the xmin horizon, for the duration
2633 * of that scan.
2634 */
2635 page = BufferGetPage(buf);
2636 opaque = BTPageGetOpaque(page);
2637 Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2638
2639 /*
2640 * Store upper bound XID that's used to determine when deleted page is no
2641 * longer needed as a tombstone
2642 */
2643 safexid = ReadNextFullTransactionId();
2644 BTPageSetDeleted(page, safexid);
2645 opaque->btpo_cycleid = 0;
2646
2647 /* And update the metapage, if needed */
2648 if (BufferIsValid(metabuf))
2649 {
2650 /* upgrade metapage if needed */
2651 if (metad->btm_version < BTREE_NOVAC_VERSION)
2652 _bt_upgrademetapage(metapg);
2653 metad->btm_fastroot = rightsib;
2654 metad->btm_fastlevel = targetlevel;
2655 MarkBufferDirty(metabuf);
2656 }
2657
2658 /* Must mark buffers dirty before XLogInsert */
2659 MarkBufferDirty(rbuf);
2660 MarkBufferDirty(buf);
2661 if (BufferIsValid(lbuf))
2662 MarkBufferDirty(lbuf);
2663 if (target != leafblkno)
2664 MarkBufferDirty(leafbuf);
2665
2666 /* XLOG stuff */
2667 if (RelationNeedsWAL(rel))
2668 {
2669 xl_btree_unlink_page xlrec;
2670 xl_btree_metadata xlmeta;
2671 uint8 xlinfo;
2672 XLogRecPtr recptr;
2673
2674 XLogBeginInsert();
2675
2676 XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2677 if (BufferIsValid(lbuf))
2678 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2679 XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2680 if (target != leafblkno)
2681 XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2682
2683 /* information stored on the target/to-be-unlinked block */
2684 xlrec.leftsib = leftsib;
2685 xlrec.rightsib = rightsib;
2686 xlrec.level = targetlevel;
2687 xlrec.safexid = safexid;
2688
2689 /* information needed to recreate the leaf block (if not the target) */
2690 xlrec.leafleftsib = leafleftsib;
2691 xlrec.leafrightsib = leafrightsib;
2692 xlrec.leaftopparent = leaftopparent;
2693
2694 XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
2695
2696 if (BufferIsValid(metabuf))
2697 {
2698 XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2699
2700 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2701 xlmeta.version = metad->btm_version;
2702 xlmeta.root = metad->btm_root;
2703 xlmeta.level = metad->btm_level;
2704 xlmeta.fastroot = metad->btm_fastroot;
2705 xlmeta.fastlevel = metad->btm_fastlevel;
2706 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2707 xlmeta.allequalimage = metad->btm_allequalimage;
2708
2709 XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
2710 xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2711 }
2712 else
2713 xlinfo = XLOG_BTREE_UNLINK_PAGE;
2714
2715 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2716
2717 if (BufferIsValid(metabuf))
2718 {
2719 PageSetLSN(metapg, recptr);
2720 }
2721 page = BufferGetPage(rbuf);
2722 PageSetLSN(page, recptr);
2723 page = BufferGetPage(buf);
2724 PageSetLSN(page, recptr);
2725 if (BufferIsValid(lbuf))
2726 {
2727 page = BufferGetPage(lbuf);
2728 PageSetLSN(page, recptr);
2729 }
2730 if (target != leafblkno)
2731 {
2732 page = BufferGetPage(leafbuf);
2733 PageSetLSN(page, recptr);
2734 }
2735 }
2736
2737 END_CRIT_SECTION();
2738
2739 /* release metapage */
2740 if (BufferIsValid(metabuf))
2741 _bt_relbuf(rel, metabuf);
2742
2743 /* release siblings */
2744 if (BufferIsValid(lbuf))
2745 _bt_relbuf(rel, lbuf);
2746 _bt_relbuf(rel, rbuf);
2747
2748 /* If the target is not leafbuf, we're done with it now -- release it */
2749 if (target != leafblkno)
2750 _bt_relbuf(rel, buf);
2751
2752 /*
2753 * Maintain pages_newly_deleted, which is simply the number of pages
2754 * deleted by the ongoing VACUUM operation.
2755 *
2756 * Maintain pages_deleted in a way that takes into account how
2757 * btvacuumpage() will count deleted pages that have yet to become
2758 * scanblkno -- only count page when it's not going to get that treatment
2759 * later on.
2760 */
2761 stats->pages_newly_deleted++;
2762 if (target <= scanblkno)
2763 stats->pages_deleted++;
2764
2765 /*
2766 * Remember information about the target page (now a newly deleted page)
2767 * in dedicated vstate space for later. The page will be considered as a
2768 * candidate to place in the FSM at the end of the current btvacuumscan()
2769 * call.
2770 */
2771 _bt_pendingfsm_add(vstate, target, safexid);
2772
2773 /* Success - hold on to lock on leafbuf (might also have been target) */
2774 return true;
2775}
2776
2777/*
2778 * Establish how tall the to-be-deleted subtree will be during the first stage
2779 * of page deletion.
2780 *
2781 * Caller's child argument is the block number of the page caller wants to
2782 * delete (this is leafbuf's block number, except when we're called
2783 * recursively). stack is a search stack leading to it. Note that we will
2784 * update the stack entry(s) to reflect current downlink positions --- this is
2785 * similar to the corresponding point in page split handling.
2786 *
2787 * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2788 * false. Returns true on success, in which case caller can use certain
2789 * details established here to perform the first stage of deletion. This
2790 * function is the last point at which page deletion may be deemed unsafe
2791 * (barring index corruption, or unexpected concurrent page deletions).
2792 *
2793 * We write lock the parent of the root of the to-be-deleted subtree for
2794 * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2795 * caller). Caller will have to remove a downlink from *subtreeparent. We
2796 * also set a *subtreeparent offset number in *poffset, to indicate the
2797 * location of the pivot tuple that contains the relevant downlink.
2798 *
2799 * The root of the to-be-deleted subtree is called the "top parent". Note
2800 * that the leafbuf page is often the final "top parent" page (you can think
2801 * of the leafbuf page as a degenerate single page subtree when that happens).
2802 * Caller should initialize *topparent to the target leafbuf page block number
2803 * (while *topparentrightsib should be set to leafbuf's right sibling block
2804 * number). We will update *topparent (and *topparentrightsib) for caller
2805 * here, though only when it turns out that caller will delete at least one
2806 * internal page (i.e. only when caller needs to store a valid link to the top
2807 * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2808 */
2809static bool
2810_bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
2811 BTStack stack, Buffer *subtreeparent,
2812 OffsetNumber *poffset, BlockNumber *topparent,
2813 BlockNumber *topparentrightsib)
2814{
2815 BlockNumber parent,
2816 leftsibparent;
2817 OffsetNumber parentoffset,
2818 maxoff;
2819 Buffer pbuf;
2820 Page page;
2821 BTPageOpaque opaque;
2822
2823 /*
2824 * Locate the pivot tuple whose downlink points to "child". Write lock
2825 * the parent page itself.
2826 */
2827 pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
2828 if (pbuf == InvalidBuffer)
2829 {
2830 /*
2831 * Failed to "re-find" a pivot tuple whose downlink matched our child
2832 * block number on the parent level -- the index must be corrupt.
2833 * Don't even try to delete the leafbuf subtree. Just report the
2834 * issue and press on with vacuuming the index.
2835 *
2836 * Note: _bt_getstackbuf() recovers from concurrent page splits that
2837 * take place on the parent level. Its approach is a near-exhaustive
2838 * linear search. This also gives it a surprisingly good chance of
2839 * recovering in the event of a buggy or inconsistent opclass. But we
2840 * don't rely on that here.
2841 */
2842 ereport(LOG,
2843 (errcode(ERRCODE_INDEX_CORRUPTED),
2844 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2845 RelationGetRelationName(rel), child)));
2846 Assert(false);
2847 return false;
2848 }
2849
2850 parent = stack->bts_blkno;
2851 parentoffset = stack->bts_offset;
2852
2853 page = BufferGetPage(pbuf);
2854 opaque = BTPageGetOpaque(page);
2855 maxoff = PageGetMaxOffsetNumber(page);
2856 leftsibparent = opaque->btpo_prev;
2857
2858 /*
2859 * _bt_getstackbuf() completes page splits on returned parent buffer when
2860 * required.
2861 *
2862 * In general it's a bad idea for VACUUM to use up more disk space, which
2863 * is why page deletion does not finish incomplete page splits most of the
2864 * time. We allow this limited exception because the risk is much lower,
2865 * and the potential downside of not proceeding is much higher: A single
2866 * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2867 * prevent us from deleting hundreds of empty leaf pages from one level
2868 * down.
2869 */
2870 Assert(!P_INCOMPLETE_SPLIT(opaque));
2871
2872 if (parentoffset < maxoff)
2873 {
2874 /*
2875 * Child is not the rightmost child in parent, so it's safe to delete
2876 * the subtree whose root/topparent is child page
2877 */
2878 *subtreeparent = pbuf;
2879 *poffset = parentoffset;
2880 return true;
2881 }
2882
2883 /*
2884 * Child is the rightmost child of parent.
2885 *
2886 * Since it's the rightmost child of parent, deleting the child (or
2887 * deleting the subtree whose root/topparent is the child page) is only
2888 * safe when it's also possible to delete the parent.
2889 */
2890 Assert(parentoffset == maxoff);
2891 if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2892 {
2893 /*
2894 * Child isn't parent's only child, or parent is rightmost on its
2895 * entire level. Definitely cannot delete any pages.
2896 */
2897 _bt_relbuf(rel, pbuf);
2898 return false;
2899 }
2900
2901 /*
2902 * Now make sure that the parent deletion is itself safe by examining the
2903 * child's grandparent page. Recurse, passing the parent page as the
2904 * child page (child's grandparent is the parent on the next level up). If
2905 * parent deletion is unsafe, then child deletion must also be unsafe (in
2906 * which case caller cannot delete any pages at all).
2907 */
2908 *topparent = parent;
2909 *topparentrightsib = opaque->btpo_next;
2910
2911 /*
2912 * Release lock on parent before recursing.
2913 *
2914 * It's OK to release page locks on parent before recursive call locks
2915 * grandparent. An internal page can only acquire an entry if the child
2916 * is split, but that cannot happen as long as we still hold a lock on the
2917 * leafbuf page.
2918 */
2919 _bt_relbuf(rel, pbuf);
2920
2921 /*
2922 * Before recursing, check that the left sibling of parent (if any) is not
2923 * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2924 * parent lock).
2925 *
2926 * Note: We deliberately avoid completing incomplete splits here.
2927 */
2928 if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2929 return false;
2930
2931 /* Recurse to examine child page's grandparent page */
2932 return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
2933 subtreeparent, poffset,
2934 topparent, topparentrightsib);
2935}
2936
2937/*
2938 * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
2939 * optimization.
2940 *
2941 * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
2942 * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
2943 *
2944 * We expect to allocate memory inside VACUUM's top-level memory context here.
2945 * The working buffer is subject to a limit based on work_mem. Our strategy
2946 * when the array can no longer grow within the bounds of that limit is to
2947 * stop saving additional newly deleted pages, while proceeding as usual with
2948 * the pages that we can fit.
2949 */
2950void
2951_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
2952{
2953 Size maxbufsize;
2954
2955 /*
2956 * Don't bother with optimization in cleanup-only case -- we don't expect
2957 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2958 * can only take place because this optimization didn't work out during
2959 * the last VACUUM.
2960 */
2961 if (cleanuponly)
2962 return;
2963
2964 /*
2965 * Cap maximum size of array so that we always respect work_mem. Avoid
2966 * int overflow here.
2967 */
2968 vstate->bufsize = 256;
2969 maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
2970 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2971 /* BTVacState.maxbufsize has type int */
2972 maxbufsize = Min(maxbufsize, INT_MAX);
2973 /* Stay sane with small work_mem */
2974 maxbufsize = Max(maxbufsize, vstate->bufsize);
2975 vstate->maxbufsize = (int) maxbufsize;
2976
2977 /* Allocate buffer, indicate that there are currently 0 pending pages */
2978 vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2979 vstate->npendingpages = 0;
2980}
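/*
 * Worked example (editor's addition, not part of nbtpage.c): assuming the
 * default work_mem of 4096 kB and sizeof(BTPendingFSM) == 16 bytes -- both
 * assumptions of this example, not values taken from the code above -- the
 * cap computed here works out to
 *
 *     maxbufsize = (4096 * (Size) 1024) / 16 = 262144 entries
 *
 * which is then clamped to MaxAllocSize / 16 and INT_MAX, and never allowed
 * to fall below the initial bufsize of 256.
 */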
2981
2982/*
2983 * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
2984 * the ongoing VACUUM operation) into the free space map -- though only when
2985 * it is actually safe to do so by now.
2986 *
2987 * Called at the end of a btvacuumscan(), just before free space map vacuuming
2988 * takes place.
2989 *
2990 * Frees memory allocated by _bt_pendingfsm_init(), if any.
2991 */
2992void
2993_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
2994{
2995 IndexBulkDeleteResult *stats = vstate->stats;
2996 Relation heaprel = vstate->info->heaprel;
2997
2998 Assert(stats->pages_newly_deleted >= vstate->npendingpages);
2999 Assert(heaprel != NULL);
3000
3001 if (vstate->npendingpages == 0)
3002 {
3003 /* Just free memory when nothing to do */
3004 if (vstate->pendingpages)
3005 pfree(vstate->pendingpages);
3006
3007 return;
3008 }
3009
3010#ifdef DEBUG_BTREE_PENDING_FSM
3011
3012 /*
3013 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3014 * placing pending pages in the FSM. Note that the optimization will
3015 * never be effective without some other backend concurrently consuming an
3016 * XID.
3017 */
3018 pg_usleep(5000000L);
3019#endif
3020
3021 /*
3022 * Recompute VACUUM XID boundaries.
3023 *
3024 * We don't actually care about the oldest non-removable XID. Computing
3025 * the oldest such XID has a useful side-effect that we rely on: it
3026 * forcibly updates the XID horizon state for this backend. This step is
3027 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3028 * that it is now safe to recycle newly deleted pages without this step.
3029 */
3030 GetOldestNonRemovableTransactionId(heaprel);
3031
3032 for (int i = 0; i < vstate->npendingpages; i++)
3033 {
3034 BlockNumber target = vstate->pendingpages[i].target;
3035 FullTransactionId safexid = vstate->pendingpages[i].safexid;
3036
3037 /*
3038 * Do the equivalent of checking BTPageIsRecyclable(), but without
3039 * accessing the page again a second time.
3040 *
3041 * Give up on finding the first non-recyclable page -- all later pages
3042 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3043 * to the array in safexid order.
3044 */
3045 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3046 break;
3047
3048 RecordFreeIndexPage(rel, target);
3049 stats->pages_free++;
3050 }
3051
3052 pfree(vstate->pendingpages);
3053}
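/*
 * Illustrative note (editor's addition, not part of nbtpage.c): the recycle
 * check in the loop above is the equivalent of calling BTPageIsRecyclable()
 * on each deleted page, except that the safexid saved by _bt_pendingfsm_add()
 * lets us avoid reading the page again just to fetch the same XID.  A sketch
 * of the page-reading variant (assumes the page is still marked deleted,
 * which holds because only one VACUUM runs against an index at a time):
 *
 *     if (GlobalVisCheckRemovableFullXid(heaprel, BTPageGetDeleteXid(page)))
 *         RecordFreeIndexPage(rel, target);
 */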
3054
3055/*
3056 * Maintain array of pages that were deleted during current btvacuumscan()
3057 * call, for use in _bt_pendingfsm_finalize()
3058 */
3059static void
3060_bt_pendingfsm_add(BTVacState *vstate,
3061 BlockNumber target,
3062 FullTransactionId safexid)
3063{
3064 Assert(vstate->npendingpages <= vstate->bufsize);
3065 Assert(vstate->bufsize <= vstate->maxbufsize);
3066
3067#ifdef USE_ASSERT_CHECKING
3068
3069 /*
3070 * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3071 * array will always be in safexid order (since that is the order that we
3072 * save them in here)
3073 */
3074 if (vstate->npendingpages > 0)
3075 {
3076 FullTransactionId lastsafexid =
3077 vstate->pendingpages[vstate->npendingpages - 1].safexid;
3078
3079 Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3080 }
3081#endif
3082
3083 /*
3084 * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3085 * information about this page.
3086 *
3087 * Note that this also covers the case where we opted to not use the
3088 * optimization in _bt_pendingfsm_init().
3089 */
3090 if (vstate->npendingpages == vstate->maxbufsize)
3091 return;
3092
3093 /* Consider enlarging buffer */
3094 if (vstate->npendingpages == vstate->bufsize)
3095 {
3096 int newbufsize = vstate->bufsize * 2;
3097
3098 /* Respect work_mem */
3099 if (newbufsize > vstate->maxbufsize)
3100 newbufsize = vstate->maxbufsize;
3101
3102 vstate->bufsize = newbufsize;
3103 vstate->pendingpages =
3104 repalloc(vstate->pendingpages,
3105 sizeof(BTPendingFSM) * vstate->bufsize);
3106 }
3107
3108 /* Save metadata for newly deleted page */
3109 vstate->pendingpages[vstate->npendingpages].target = target;
3110 vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3111 vstate->npendingpages++;
3112}
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4224
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:2998
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:845
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5631
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5371
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2937
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5685
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5605
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:203
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:291
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:425
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:414
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define BMR_REL(p_rel)
Definition: bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:376
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1160
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, const void *newtup, Size newsize)
Definition: bufpage.c:1404
void PageIndexTupleDelete(Page page, OffsetNumber offnum)
Definition: bufpage.c:1051
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42
PageHeaderData * PageHeader
Definition: bufpage.h:173
static uint16 PageGetSpecialSize(const PageData *page)
Definition: bufpage.h:316
static void * PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:233
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
PageData * Page
Definition: bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
#define Min(x, y)
Definition: c.h:1007
#define MAXALIGN(LEN)
Definition: c.h:814
uint8_t uint8
Definition: c.h:540
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:227
#define Max(x, y)
Definition: c.h:1001
uint16_t uint16
Definition: c.h:541
uint32_t uint32
Definition: c.h:542
#define MemSet(start, val, len)
Definition: c.h:1023
uint32 TransactionId
Definition: c.h:661
size_t Size
Definition: c.h:614
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
#define MaxAllocSize
Definition: fe_memutils.h:22
int work_mem
Definition: globals.c:131
Assert(PointerIsAligned(start, uint64))
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
static int pg_cmp_s16(int16 a, int16 b)
Definition: int.h:634
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int i
Definition: isn.c:77
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition: itemptr.c:51
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition: itemptr.c:35
IndexTupleData * IndexTuple
Definition: itup.h:53
struct IndexTupleData IndexTupleData
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)
Definition: memdebug.h:23
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void _bt_update_posting(BTVacuumPosting vacposting)
Definition: nbtdedup.c:922
Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
Definition: nbtinsert.c:2326
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)
Definition: nbtpage.c:2810
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
static bool _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
Definition: nbtpage.c:1750
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition: nbtpage.c:1800
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1154
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1693
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:142
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1283
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:232
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2086
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1093
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1109
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1511
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:179
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1403
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1462
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition: nbtpage.c:2993
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition: nbtpage.c:2951
static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid)
Definition: nbtpage.c:3060
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2311
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
#define BTPageGetMeta(p)
Definition: nbtree.h:122
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:261
#define BTREE_MIN_VERSION
Definition: nbtree.h:152
#define BTP_LEAF
Definition: nbtree.h:77
#define BTP_HALF_DEAD
Definition: nbtree.h:81
#define P_HIKEY
Definition: nbtree.h:368
static void BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
Definition: nbtree.h:627
#define P_ISMETA(opaque)
Definition: nbtree.h:224
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define BTREE_MAGIC
Definition: nbtree.h:150
#define BTP_META
Definition: nbtree.h:80
#define BTREE_VERSION
Definition: nbtree.h:151
static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)
Definition: nbtree.h:621
struct BTPendingFSM BTPendingFSM
#define BTP_ROOT
Definition: nbtree.h:78
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:563
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_ISROOT(opaque)
Definition: nbtree.h:222
#define P_NONE
Definition: nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTREE_METAPAGE
Definition: nbtree.h:149
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
#define BT_READ
Definition: nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
static void BTPageSetDeleted(Page page, FullTransactionId safexid)
Definition: nbtree.h:240
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:153
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define BT_WRITE
Definition: nbtree.h:731
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:107
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:97
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:268
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
#define SizeOfBtreeDelete
Definition: nbtxlog.h:253
#define SizeOfBtreeMarkPageHalfDead
Definition: nbtxlog.h:291
#define XLOG_BTREE_UNLINK_PAGE
Definition: nbtxlog.h:35
#define XLOG_BTREE_UNLINK_PAGE_META
Definition: nbtxlog.h:36
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_MARK_PAGE_HALFDEAD
Definition: nbtxlog.h:38
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeUnlinkPage
Definition: nbtxlog.h:328
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:192
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:34
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define qsort(a, b, c, d)
Definition: port.h:479
void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3229
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:1953
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4248
#define RelationGetRelationName(relation)
Definition: rel.h:549
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:694
#define RelationNeedsWAL(relation)
Definition: rel.h:638
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:647
@ MAIN_FORKNUM
Definition: relpath.h:58
void pg_usleep(long microsec)
Definition: signal.c:53
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:115
uint32 btm_level
Definition: nbtree.h:109
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:117
BlockNumber btm_fastroot
Definition: nbtree.h:110
uint32 btm_version
Definition: nbtree.h:107
uint32 btm_magic
Definition: nbtree.h:106
BlockNumber btm_root
Definition: nbtree.h:108
bool btm_allequalimage
Definition: nbtree.h:119
uint32 btm_fastlevel
Definition: nbtree.h:111
BlockNumber btpo_next
Definition: nbtree.h:66
BlockNumber btpo_prev
Definition: nbtree.h:65
uint16 btpo_flags
Definition: nbtree.h:68
uint32 btpo_level
Definition: nbtree.h:67
BTCycleId btpo_cycleid
Definition: nbtree.h:69
FullTransactionId safexid
Definition: nbtree.h:328
BlockNumber target
Definition: nbtree.h:327
BlockNumber bts_blkno
Definition: nbtree.h:745
struct BTStackData * bts_parent
Definition: nbtree.h:747
OffsetNumber bts_offset
Definition: nbtree.h:746
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
IndexVacuumInfo * info
Definition: nbtree.h:333
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917
OffsetNumber updatedoffset
Definition: nbtree.h:918
BlockNumber pages_deleted
Definition: genam.h:109
BlockNumber pages_newly_deleted
Definition: genam.h:108
BlockNumber pages_free
Definition: genam.h:110
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
Relation heaprel
Definition: genam.h:74
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
RelFileLocator rd_locator
Definition: rel.h:57
TM_IndexStatus * status
Definition: tableam.h:254
TM_IndexDelete * deltids
Definition: tableam.h:253
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
OffsetNumber idxoffnum
Definition: tableam.h:218
TransactionId snapshotConflictHorizon
Definition: nbtxlog.h:238
bool isCatalogRel
Definition: nbtxlog.h:241
uint16 ndeleted
Definition: nbtxlog.h:239
uint16 nupdated
Definition: nbtxlog.h:240
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
uint16 ndeletedtids
Definition: nbtxlog.h:263
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1321
#define InvalidTransactionId
Definition: transam.h:31
#define FullTransactionIdFollowsOrEquals(a, b)
Definition: transam.h:54
FullTransactionId ReadNextFullTransactionId(void)
Definition: varsup.c:288
#define XLogStandbyInfoActive()
Definition: xlog.h:123
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:409
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:245
void XLogBeginInsert(void)
Definition: xloginsert.c:152
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34
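The entries above index, among other things, the nbtree buffer helpers (_bt_getbuf, _bt_unlockbuf, BTPageGetOpaque) and the page-state macros (P_IGNORE, P_RIGHTMOST, P_NONE) that nbtpage.c relies on throughout. As a minimal sketch of how those pieces typically fit together, the hypothetical function below (count_live_pages_on_level is not part of PostgreSQL) walks right along one btree level and counts pages that are neither deleted nor half-dead. It assumes the standard nbtree convention of holding at most one page lock at a time and deliberately omits the extra care real callers take around concurrent splits and page deletion.

#include "postgres.h"

#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"

/* Hypothetical illustration only; not code from nbtpage.c */
static uint32
count_live_pages_on_level(Relation rel, BlockNumber start)
{
	BlockNumber blkno = start;
	uint32		nlive = 0;

	while (blkno != P_NONE)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;

		CHECK_FOR_INTERRUPTS();

		/* Pin and lock the page in BT_READ mode */
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		/* Count the page unless it is deleted or half-dead */
		if (!P_IGNORE(opaque))
			nlive++;

		/* Remember the right sibling before giving up the lock */
		blkno = P_RIGHTMOST(opaque) ? P_NONE : opaque->btpo_next;

		/* Drop lock and pin (_bt_relbuf is the release helper defined earlier in this file) */
		_bt_relbuf(rel, buf);
	}

	return nlive;
}

Moving strictly left to right while never holding more than one page lock mirrors the convention the real routines in this file follow; the sketch simply leaves out the re-checks for concurrent splits and deletions that make that convention safe in production code.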