1 /*-------------------------------------------------------------------------
2  *
3  * nbtpage.c
4  * BTree-specific page management code for the Postgres btree access
5  * method.
6  *
7  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  *
11  * IDENTIFICATION
12  * src/backend/access/nbtree/nbtpage.c
13  *
14  * NOTES
15  * Postgres btree pages look like ordinary relation pages. The opaque
16  * data at high addresses includes pointers to left and right siblings
17  * and flag data describing page state. The first page in a btree, page
18  * zero, is special -- it stores meta-information describing the tree.
19  * Pages one and higher store the actual tree data.
20  *
21  *-------------------------------------------------------------------------
22  */
23 #include "postgres.h"
24 
25 #include "access/nbtree.h"
26 #include "access/nbtxlog.h"
27 #include "access/tableam.h"
28 #include "access/transam.h"
29 #include "access/xlog.h"
30 #include "access/xloginsert.h"
31 #include "miscadmin.h"
32 #include "storage/indexfsm.h"
33 #include "storage/lmgr.h"
34 #include "storage/predicate.h"
35 #include "storage/procarray.h"
36 #include "utils/memdebug.h"
37 #include "utils/memutils.h"
38 #include "utils/snapmgr.h"
39 
40 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
41 static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
42  FullTransactionId safexid);
43 static void _bt_delitems_delete(Relation rel, Buffer buf,
44  TransactionId snapshotConflictHorizon,
45  OffsetNumber *deletable, int ndeletable,
46  BTVacuumPosting *updatable, int nupdatable);
47 static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
48  OffsetNumber *updatedoffsets,
49  Size *updatedbuflen, bool needswal);
50 static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
51  BTStack stack);
52 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
53  BlockNumber scanblkno,
54  bool *rightsib_empty,
55  BTVacState *vstate);
56 static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
57  BTStack stack,
58  Buffer *subtreeparent,
59  OffsetNumber *poffset,
60  BlockNumber *topparent,
61  BlockNumber *topparentrightsib);
62 static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
63  FullTransactionId safexid);
64 
65 /*
66  * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
67  */
68 void
69 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
70  bool allequalimage)
71 {
72  BTMetaPageData *metad;
73  BTPageOpaque metaopaque;
74 
75  _bt_pageinit(page, BLCKSZ);
76 
77  metad = BTPageGetMeta(page);
78  metad->btm_magic = BTREE_MAGIC;
79  metad->btm_version = BTREE_VERSION;
80  metad->btm_root = rootbknum;
81  metad->btm_level = level;
82  metad->btm_fastroot = rootbknum;
83  metad->btm_fastlevel = level;
 84  metad->btm_last_cleanup_num_delpages = 0;
 85  metad->btm_last_cleanup_num_heap_tuples = -1.0;
 86  metad->btm_allequalimage = allequalimage;
87 
88  metaopaque = BTPageGetOpaque(page);
89  metaopaque->btpo_flags = BTP_META;
90 
91  /*
92  * Set pd_lower just past the end of the metadata. This is essential,
93  * because without doing so, metadata will be lost if xlog.c compresses
94  * the page.
95  */
96  ((PageHeader) page)->pd_lower =
97  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
98 }
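/*
 * Illustrative sketch (assumed caller, not code from this file): a caller
 * building an empty index might set up its metapage roughly like this,
 * passing P_NONE to record that no root page exists yet.
 *
 *     Page        metapg = (Page) palloc(BLCKSZ);
 *
 *     _bt_initmetapage(metapg, P_NONE, 0, allequalimage);
 *     ... write out or WAL-log the page as appropriate ...
 */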
99 
100 /*
101  * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
102  * 3, the last version that can be updated without broadly affecting
103  * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
104  *
105  * This routine does purely in-memory image upgrade. Caller is
106  * responsible for locking, WAL-logging etc.
107  */
108 void
 109 _bt_upgrademetapage(Page page)
 110 {
 111  BTMetaPageData *metad;
 112  BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
 113 
114  metad = BTPageGetMeta(page);
115  metaopaque = BTPageGetOpaque(page);
116 
117  /* It must be really a meta page of upgradable version */
118  Assert(metaopaque->btpo_flags & BTP_META);
 119  Assert(metad->btm_magic == BTREE_MAGIC);
 120  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
 121 
 122  /* Set version number and fill extra fields added into version 3 */
 123  metad->btm_version = BTREE_NOVAC_VERSION;
 124  metad->btm_last_cleanup_num_delpages = 0;
 125  metad->btm_last_cleanup_num_heap_tuples = -1.0;
126  /* Only a REINDEX can set this field */
127  Assert(!metad->btm_allequalimage);
128  metad->btm_allequalimage = false;
129 
130  /* Adjust pd_lower (see _bt_initmetapage() for details) */
131  ((PageHeader) page)->pd_lower =
132  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
133 }
134 
135 /*
136  * Get metadata from share-locked buffer containing metapage, while performing
137  * standard sanity checks.
138  *
139  * Callers that cache data returned here in local cache should note that an
140  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
141  * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
142  */
143 static BTMetaPageData *
 144 _bt_getmeta(Relation rel, Buffer metabuf)
 145 {
146  Page metapg;
147  BTPageOpaque metaopaque;
148  BTMetaPageData *metad;
149 
150  metapg = BufferGetPage(metabuf);
151  metaopaque = BTPageGetOpaque(metapg);
152  metad = BTPageGetMeta(metapg);
153 
154  /* sanity-check the metapage */
155  if (!P_ISMETA(metaopaque) ||
156  metad->btm_magic != BTREE_MAGIC)
157  ereport(ERROR,
158  (errcode(ERRCODE_INDEX_CORRUPTED),
159  errmsg("index \"%s\" is not a btree",
160  RelationGetRelationName(rel))));
161 
162  if (metad->btm_version < BTREE_MIN_VERSION ||
163  metad->btm_version > BTREE_VERSION)
164  ereport(ERROR,
165  (errcode(ERRCODE_INDEX_CORRUPTED),
166  errmsg("version mismatch in index \"%s\": file version %d, "
167  "current version %d, minimal supported version %d",
 168  RelationGetRelationName(rel),
 169  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 170 
171  return metad;
172 }
173 
174 /*
175  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
176  *
177  * Called by btvacuumcleanup when btbulkdelete was never called because no
178  * index tuples needed to be deleted.
179  */
180 bool
 181 _bt_vacuum_needs_cleanup(Relation rel)
 182 {
183  Buffer metabuf;
184  Page metapg;
185  BTMetaPageData *metad;
186  uint32 btm_version;
187  BlockNumber prev_num_delpages;
188 
189  /*
190  * Copy details from metapage to local variables quickly.
191  *
192  * Note that we deliberately avoid using cached version of metapage here.
193  */
194  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
195  metapg = BufferGetPage(metabuf);
196  metad = BTPageGetMeta(metapg);
197  btm_version = metad->btm_version;
198 
199  if (btm_version < BTREE_NOVAC_VERSION)
200  {
201  /*
202  * Metapage needs to be dynamically upgraded to store fields that are
203  * only present when btm_version >= BTREE_NOVAC_VERSION
204  */
205  _bt_relbuf(rel, metabuf);
206  return true;
207  }
208 
209  prev_num_delpages = metad->btm_last_cleanup_num_delpages;
210  _bt_relbuf(rel, metabuf);
211 
212  /*
213  * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
214  * total size of the index. We can reasonably expect (though are not
215  * guaranteed) to be able to recycle this many pages if we decide to do a
216  * btvacuumscan call during the ongoing btvacuumcleanup. For further
217  * details see the nbtree/README section on placing deleted pages in the
218  * FSM.
219  */
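 /*
  * Worked example (illustrative numbers): for a 10,000-block index the
  * test below fires once prev_num_delpages exceeds 10,000 / 20 = 500,
  * i.e. once more than 5% of the index consists of deleted pages that
  * have not yet been placed in the FSM.
  */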
220  if (prev_num_delpages > 0 &&
221  prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
222  return true;
223 
224  return false;
225 }
226 
227 /*
228  * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
229  *
230  * Called at the end of btvacuumcleanup, when num_delpages value has been
231  * finalized.
232  */
233 void
 234 _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
 235 {
236  Buffer metabuf;
237  Page metapg;
238  BTMetaPageData *metad;
239 
240  /*
241  * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
242  * field started out as a TransactionId field called btm_oldest_btpo_xact.
243  * Both "versions" are just uint32 fields. It was convenient to repurpose
244  * the field when we began to use 64-bit XIDs in deleted pages.
245  *
246  * It's possible that a pg_upgrade'd database will contain an XID value in
247  * what is now recognized as the metapage's btm_last_cleanup_num_delpages
248  * field. _bt_vacuum_needs_cleanup() may even believe that this value
249  * indicates that there are lots of pages that it needs to recycle, when
250  * in reality there are only one or two. The worst that can happen is
251  * that there will be a call to btvacuumscan a little earlier, which will
252  * set btm_last_cleanup_num_delpages to a sane value when we're called.
253  *
254  * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
255  * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
256  * to be consistent.
257  */
258  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
259  metapg = BufferGetPage(metabuf);
260  metad = BTPageGetMeta(metapg);
261 
262  /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
263  if (metad->btm_version >= BTREE_NOVAC_VERSION &&
264  metad->btm_last_cleanup_num_delpages == num_delpages)
265  {
266  /* Usually means index continues to have num_delpages of 0 */
267  _bt_relbuf(rel, metabuf);
268  return;
269  }
270 
271  /* trade in our read lock for a write lock */
272  _bt_unlockbuf(rel, metabuf);
273  _bt_lockbuf(rel, metabuf, BT_WRITE);
274 
 275  START_CRIT_SECTION();
 276 
277  /* upgrade meta-page if needed */
278  if (metad->btm_version < BTREE_NOVAC_VERSION)
279  _bt_upgrademetapage(metapg);
280 
281  /* update cleanup-related information */
282  metad->btm_last_cleanup_num_delpages = num_delpages;
283  metad->btm_last_cleanup_num_heap_tuples = -1.0;
284  MarkBufferDirty(metabuf);
285 
286  /* write wal record if needed */
287  if (RelationNeedsWAL(rel))
288  {
 289  xl_btree_metadata md;
 290  XLogRecPtr recptr;
291 
292  XLogBeginInsert();
 293  XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 294 
 295  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
296  md.version = metad->btm_version;
297  md.root = metad->btm_root;
298  md.level = metad->btm_level;
299  md.fastroot = metad->btm_fastroot;
300  md.fastlevel = metad->btm_fastlevel;
301  md.last_cleanup_num_delpages = num_delpages;
302  md.allequalimage = metad->btm_allequalimage;
303 
304  XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
305 
306  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
307 
308  PageSetLSN(metapg, recptr);
309  }
310 
 311  END_CRIT_SECTION();
 312 
313  _bt_relbuf(rel, metabuf);
314 }
315 
316 /*
317  * _bt_getroot() -- Get the root page of the btree.
318  *
319  * Since the root page can move around the btree file, we have to read
320  * its location from the metadata page, and then read the root page
321  * itself. If no root page exists yet, we have to create one.
322  *
323  * The access type parameter (BT_READ or BT_WRITE) controls whether
324  * a new root page will be created or not. If access = BT_READ,
325  * and no root page exists, we just return InvalidBuffer. For
326  * BT_WRITE, we try to create the root page if it doesn't exist.
327  * NOTE that the returned root page will have only a read lock set
328  * on it even if access = BT_WRITE!
329  *
330  * The returned page is not necessarily the true root --- it could be
331  * a "fast root" (a page that is alone in its level due to deletions).
332  * Also, if the root page is split while we are "in flight" to it,
333  * what we will return is the old root, which is now just the leftmost
334  * page on a probably-not-very-wide level. For most purposes this is
335  * as good as or better than the true root, so we do not bother to
336  * insist on finding the true root. We do, however, guarantee to
337  * return a live (not deleted or half-dead) page.
338  *
339  * On successful return, the root page is pinned and read-locked.
340  * The metadata page is not locked or pinned on exit.
341  */
342 Buffer
 343 _bt_getroot(Relation rel, int access)
 344 {
345  Buffer metabuf;
346  Buffer rootbuf;
347  Page rootpage;
348  BTPageOpaque rootopaque;
349  BlockNumber rootblkno;
350  uint32 rootlevel;
351  BTMetaPageData *metad;
352 
353  /*
354  * Try to use previously-cached metapage data to find the root. This
355  * normally saves one buffer access per index search, which is a very
356  * helpful savings in bufmgr traffic and hence contention.
357  */
358  if (rel->rd_amcache != NULL)
359  {
360  metad = (BTMetaPageData *) rel->rd_amcache;
361  /* We shouldn't have cached it if any of these fail */
 362  Assert(metad->btm_magic == BTREE_MAGIC);
 363  Assert(metad->btm_version >= BTREE_MIN_VERSION);
 364  Assert(metad->btm_version <= BTREE_VERSION);
 365  Assert(!metad->btm_allequalimage ||
 366  metad->btm_version > BTREE_NOVAC_VERSION);
 367  Assert(metad->btm_root != P_NONE);
368 
369  rootblkno = metad->btm_fastroot;
370  Assert(rootblkno != P_NONE);
371  rootlevel = metad->btm_fastlevel;
372 
373  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
374  rootpage = BufferGetPage(rootbuf);
375  rootopaque = BTPageGetOpaque(rootpage);
376 
377  /*
378  * Since the cache might be stale, we check the page more carefully
379  * here than normal. We *must* check that it's not deleted. If it's
380  * not alone on its level, then we reject too --- this may be overly
381  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
382  * because that's not set in a "fast root".
383  */
384  if (!P_IGNORE(rootopaque) &&
385  rootopaque->btpo_level == rootlevel &&
386  P_LEFTMOST(rootopaque) &&
387  P_RIGHTMOST(rootopaque))
388  {
389  /* OK, accept cached page as the root */
390  return rootbuf;
391  }
392  _bt_relbuf(rel, rootbuf);
393  /* Cache is stale, throw it away */
394  if (rel->rd_amcache)
395  pfree(rel->rd_amcache);
396  rel->rd_amcache = NULL;
397  }
398 
399  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
400  metad = _bt_getmeta(rel, metabuf);
401 
402  /* if no root page initialized yet, do it */
403  if (metad->btm_root == P_NONE)
404  {
405  Page metapg;
406 
407  /* If access = BT_READ, caller doesn't want us to create root yet */
408  if (access == BT_READ)
409  {
410  _bt_relbuf(rel, metabuf);
411  return InvalidBuffer;
412  }
413 
414  /* trade in our read lock for a write lock */
415  _bt_unlockbuf(rel, metabuf);
416  _bt_lockbuf(rel, metabuf, BT_WRITE);
417 
418  /*
419  * Race condition: if someone else initialized the metadata between
420  * the time we released the read lock and acquired the write lock, we
421  * must avoid doing it again.
422  */
423  if (metad->btm_root != P_NONE)
424  {
425  /*
426  * Metadata initialized by someone else. In order to guarantee no
427  * deadlocks, we have to release the metadata page and start all
428  * over again. (Is that really true? But it's hardly worth trying
429  * to optimize this case.)
430  */
431  _bt_relbuf(rel, metabuf);
432  return _bt_getroot(rel, access);
433  }
434 
435  /*
436  * Get, initialize, write, and leave a lock of the appropriate type on
437  * the new root page. Since this is the first page in the tree, it's
438  * a leaf as well as the root.
439  */
440  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
441  rootblkno = BufferGetBlockNumber(rootbuf);
442  rootpage = BufferGetPage(rootbuf);
443  rootopaque = BTPageGetOpaque(rootpage);
444  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
445  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
446  rootopaque->btpo_level = 0;
447  rootopaque->btpo_cycleid = 0;
448  /* Get raw page pointer for metapage */
449  metapg = BufferGetPage(metabuf);
450 
451  /* NO ELOG(ERROR) till meta is updated */
 452  START_CRIT_SECTION();
 453 
454  /* upgrade metapage if needed */
455  if (metad->btm_version < BTREE_NOVAC_VERSION)
456  _bt_upgrademetapage(metapg);
457 
458  metad->btm_root = rootblkno;
459  metad->btm_level = 0;
460  metad->btm_fastroot = rootblkno;
461  metad->btm_fastlevel = 0;
463  metad->btm_last_cleanup_num_heap_tuples = -1.0;
464 
465  MarkBufferDirty(rootbuf);
466  MarkBufferDirty(metabuf);
467 
468  /* XLOG stuff */
469  if (RelationNeedsWAL(rel))
470  {
471  xl_btree_newroot xlrec;
472  XLogRecPtr recptr;
 473  xl_btree_metadata md;
 474 
475  XLogBeginInsert();
 476  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
 477  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 478 
 479  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
480  md.version = metad->btm_version;
481  md.root = rootblkno;
482  md.level = 0;
483  md.fastroot = rootblkno;
484  md.fastlevel = 0;
 485  md.last_cleanup_num_delpages = 0;
 486  md.allequalimage = metad->btm_allequalimage;
487 
488  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
489 
490  xlrec.rootblk = rootblkno;
491  xlrec.level = 0;
492 
493  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
494 
495  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
496 
497  PageSetLSN(rootpage, recptr);
498  PageSetLSN(metapg, recptr);
499  }
500 
 501  END_CRIT_SECTION();
 502 
503  /*
504  * swap root write lock for read lock. There is no danger of anyone
505  * else accessing the new root page while it's unlocked, since no one
506  * else knows where it is yet.
507  */
508  _bt_unlockbuf(rel, rootbuf);
509  _bt_lockbuf(rel, rootbuf, BT_READ);
510 
511  /* okay, metadata is correct, release lock on it without caching */
512  _bt_relbuf(rel, metabuf);
513  }
514  else
515  {
516  rootblkno = metad->btm_fastroot;
517  Assert(rootblkno != P_NONE);
518  rootlevel = metad->btm_fastlevel;
519 
520  /*
521  * Cache the metapage data for next time
522  */
 523  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 524  sizeof(BTMetaPageData));
525  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
526 
527  /*
528  * We are done with the metapage; arrange to release it via first
529  * _bt_relandgetbuf call
530  */
531  rootbuf = metabuf;
532 
533  for (;;)
534  {
535  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
536  rootpage = BufferGetPage(rootbuf);
537  rootopaque = BTPageGetOpaque(rootpage);
538 
539  if (!P_IGNORE(rootopaque))
540  break;
541 
542  /* it's dead, Jim. step right one page */
543  if (P_RIGHTMOST(rootopaque))
544  elog(ERROR, "no live root page found in index \"%s\"",
 545  RelationGetRelationName(rel));
 546  rootblkno = rootopaque->btpo_next;
547  }
548 
549  if (rootopaque->btpo_level != rootlevel)
550  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
551  rootblkno, RelationGetRelationName(rel),
552  rootopaque->btpo_level, rootlevel);
553  }
554 
555  /*
556  * By here, we have a pin and read lock on the root page, and no lock set
557  * on the metadata page. Return the root page's buffer.
558  */
559  return rootbuf;
560 }
561 
562 /*
563  * _bt_gettrueroot() -- Get the true root page of the btree.
564  *
565  * This is the same as the BT_READ case of _bt_getroot(), except
566  * we follow the true-root link not the fast-root link.
567  *
568  * By the time we acquire lock on the root page, it might have been split and
569  * not be the true root anymore. This is okay for the present uses of this
570  * routine; we only really need to be able to move up at least one tree level
571  * from whatever non-root page we were at. If we ever do need to lock the
572  * one true root page, we could loop here, re-reading the metapage on each
573  * failure. (Note that it wouldn't do to hold the lock on the metapage while
574  * moving to the root --- that'd deadlock against any concurrent root split.)
575  */
576 Buffer
 577 _bt_gettrueroot(Relation rel)
 578 {
579  Buffer metabuf;
580  Page metapg;
581  BTPageOpaque metaopaque;
582  Buffer rootbuf;
583  Page rootpage;
584  BTPageOpaque rootopaque;
585  BlockNumber rootblkno;
586  uint32 rootlevel;
587  BTMetaPageData *metad;
588 
589  /*
590  * We don't try to use cached metapage data here, since (a) this path is
591  * not performance-critical, and (b) if we are here it suggests our cache
592  * is out-of-date anyway. In light of point (b), it's probably safest to
593  * actively flush any cached metapage info.
594  */
595  if (rel->rd_amcache)
596  pfree(rel->rd_amcache);
597  rel->rd_amcache = NULL;
598 
599  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
600  metapg = BufferGetPage(metabuf);
601  metaopaque = BTPageGetOpaque(metapg);
602  metad = BTPageGetMeta(metapg);
603 
604  if (!P_ISMETA(metaopaque) ||
605  metad->btm_magic != BTREE_MAGIC)
606  ereport(ERROR,
607  (errcode(ERRCODE_INDEX_CORRUPTED),
608  errmsg("index \"%s\" is not a btree",
609  RelationGetRelationName(rel))));
610 
611  if (metad->btm_version < BTREE_MIN_VERSION ||
612  metad->btm_version > BTREE_VERSION)
613  ereport(ERROR,
614  (errcode(ERRCODE_INDEX_CORRUPTED),
615  errmsg("version mismatch in index \"%s\": file version %d, "
616  "current version %d, minimal supported version %d",
 617  RelationGetRelationName(rel),
 618  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 619 
620  /* if no root page initialized yet, fail */
621  if (metad->btm_root == P_NONE)
622  {
623  _bt_relbuf(rel, metabuf);
624  return InvalidBuffer;
625  }
626 
627  rootblkno = metad->btm_root;
628  rootlevel = metad->btm_level;
629 
630  /*
631  * We are done with the metapage; arrange to release it via first
632  * _bt_relandgetbuf call
633  */
634  rootbuf = metabuf;
635 
636  for (;;)
637  {
638  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
639  rootpage = BufferGetPage(rootbuf);
640  rootopaque = BTPageGetOpaque(rootpage);
641 
642  if (!P_IGNORE(rootopaque))
643  break;
644 
645  /* it's dead, Jim. step right one page */
646  if (P_RIGHTMOST(rootopaque))
647  elog(ERROR, "no live root page found in index \"%s\"",
 648  RelationGetRelationName(rel));
 649  rootblkno = rootopaque->btpo_next;
650  }
651 
652  if (rootopaque->btpo_level != rootlevel)
653  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
654  rootblkno, RelationGetRelationName(rel),
655  rootopaque->btpo_level, rootlevel);
656 
657  return rootbuf;
658 }
659 
660 /*
661  * _bt_getrootheight() -- Get the height of the btree search tree.
662  *
663  * We return the level (counting from zero) of the current fast root.
664  * This represents the number of tree levels we'd have to descend through
665  * to start any btree index search.
666  *
667  * This is used by the planner for cost-estimation purposes. Since it's
668  * only an estimate, slightly-stale data is fine, hence we don't worry
669  * about updating previously cached data.
670  */
671 int
 672 _bt_getrootheight(Relation rel)
 673 {
674  BTMetaPageData *metad;
675 
676  if (rel->rd_amcache == NULL)
677  {
678  Buffer metabuf;
679 
680  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
681  metad = _bt_getmeta(rel, metabuf);
682 
683  /*
684  * If there's no root page yet, _bt_getroot() doesn't expect a cache
685  * to be made, so just stop here and report the index height is zero.
686  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
687  */
688  if (metad->btm_root == P_NONE)
689  {
690  _bt_relbuf(rel, metabuf);
691  return 0;
692  }
693 
694  /*
695  * Cache the metapage data for next time
696  */
 697  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 698  sizeof(BTMetaPageData));
699  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
700  _bt_relbuf(rel, metabuf);
701  }
702 
703  /* Get cached page */
704  metad = (BTMetaPageData *) rel->rd_amcache;
705  /* We shouldn't have cached it if any of these fail */
 706  Assert(metad->btm_magic == BTREE_MAGIC);
 707  Assert(metad->btm_version >= BTREE_MIN_VERSION);
 708  Assert(metad->btm_version <= BTREE_VERSION);
 709  Assert(!metad->btm_allequalimage ||
 710  metad->btm_version > BTREE_NOVAC_VERSION);
 711  Assert(metad->btm_fastroot != P_NONE);
712 
713  return metad->btm_fastlevel;
714 }
715 
716 /*
717  * _bt_metaversion() -- Get version/status info from metapage.
718  *
719  * Sets caller's *heapkeyspace and *allequalimage arguments using data
720  * from the B-Tree metapage (could be locally-cached version). This
721  * information needs to be stashed in insertion scankey, so we provide a
722  * single function that fetches both at once.
723  *
724  * This is used to determine the rules that must be used to descend a
725  * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
726  * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
727  * performance when inserting a new BTScanInsert-wise duplicate tuple
728  * among many leaf pages already full of such duplicates.
729  *
730  * Also sets allequalimage field, which indicates whether or not it is
731  * safe to apply deduplication. We rely on the assumption that
732  * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
733  * pg_upgrade'd from Postgres 12.
734  */
735 void
736 _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
737 {
738  BTMetaPageData *metad;
739 
740  if (rel->rd_amcache == NULL)
741  {
742  Buffer metabuf;
743 
744  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
745  metad = _bt_getmeta(rel, metabuf);
746 
747  /*
748  * If there's no root page yet, _bt_getroot() doesn't expect a cache
749  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
750  * be changed to allow this case.)
751  */
752  if (metad->btm_root == P_NONE)
753  {
754  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
755  *allequalimage = metad->btm_allequalimage;
756 
757  _bt_relbuf(rel, metabuf);
758  return;
759  }
760 
761  /*
762  * Cache the metapage data for next time
763  *
764  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
765  * can change the nbtree version for an index without invalidating any
766  * local cache. This is okay because it can only happen when moving
767  * from version 2 to version 3, both of which are !heapkeyspace
768  * versions.
769  */
 770  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 771  sizeof(BTMetaPageData));
772  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
773  _bt_relbuf(rel, metabuf);
774  }
775 
776  /* Get cached page */
777  metad = (BTMetaPageData *) rel->rd_amcache;
778  /* We shouldn't have cached it if any of these fail */
 779  Assert(metad->btm_magic == BTREE_MAGIC);
 780  Assert(metad->btm_version >= BTREE_MIN_VERSION);
 781  Assert(metad->btm_version <= BTREE_VERSION);
 782  Assert(!metad->btm_allequalimage ||
 783  metad->btm_version > BTREE_NOVAC_VERSION);
 784  Assert(metad->btm_fastroot != P_NONE);
785 
786  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
787  *allequalimage = metad->btm_allequalimage;
788 }
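/*
 * Illustrative sketch (assumed caller, not code from this file): the two
 * flags are typically fetched together while an insertion scankey is being
 * built, roughly as follows (local variables assumed).
 *
 *     bool        heapkeyspace;
 *     bool        allequalimage;
 *
 *     _bt_metaversion(rel, &heapkeyspace, &allequalimage);
 *     // heapkeyspace selects the version >= 4 descent rules;
 *     // allequalimage says whether deduplication is safe to apply
 */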
789 
790 /*
791  * _bt_checkpage() -- Verify that a freshly-read page looks sane.
792  */
793 void
 794 _bt_checkpage(Relation rel, Buffer buf)
 795 {
796  Page page = BufferGetPage(buf);
797 
798  /*
799  * ReadBuffer verifies that every newly-read page passes
800  * PageHeaderIsValid, which means it either contains a reasonably sane
801  * page header or is all-zero. We have to defend against the all-zero
802  * case, however.
803  */
804  if (PageIsNew(page))
805  ereport(ERROR,
806  (errcode(ERRCODE_INDEX_CORRUPTED),
807  errmsg("index \"%s\" contains unexpected zero page at block %u",
 808  RelationGetRelationName(rel),
 809  BufferGetBlockNumber(buf)),
 810  errhint("Please REINDEX it.")));
811 
812  /*
813  * Additionally check that the special area looks sane.
814  */
815  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
816  ereport(ERROR,
817  (errcode(ERRCODE_INDEX_CORRUPTED),
818  errmsg("index \"%s\" contains corrupted page at block %u",
 819  RelationGetRelationName(rel),
 820  BufferGetBlockNumber(buf)),
 821  errhint("Please REINDEX it.")));
822 }
823 
824 /*
825  * Log the reuse of a page from the FSM.
826  */
827 static void
 828 _bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
 829 {
830  xl_btree_reuse_page xlrec_reuse;
831 
832  /*
833  * Note that we don't register the buffer with the record, because this
834  * operation doesn't modify the page. This record only exists to provide a
835  * conflict point for Hot Standby.
836  */
837 
838  /* XLOG stuff */
839  xlrec_reuse.locator = rel->rd_locator;
840  xlrec_reuse.block = blkno;
841  xlrec_reuse.snapshotConflictHorizon = safexid;
842 
843  XLogBeginInsert();
844  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
845 
846  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
847 }
848 
849 /*
850  * _bt_getbuf() -- Get a buffer by block number for read or write.
851  *
852  * blkno == P_NEW means to get an unallocated index page. The page
853  * will be initialized before returning it.
854  *
855  * The general rule in nbtree is that it's never okay to access a
856  * page without holding both a buffer pin and a buffer lock on
857  * the page's buffer.
858  *
859  * When this routine returns, the appropriate lock is set on the
860  * requested buffer and its reference count has been incremented
861  * (ie, the buffer is "locked and pinned"). Also, we apply
862  * _bt_checkpage to sanity-check the page (except in P_NEW case),
863  * and perform Valgrind client requests that help Valgrind detect
864  * unsafe page accesses.
865  *
866  * Note: raw LockBuffer() calls are disallowed in nbtree; all
867  * buffer lock requests need to go through wrapper functions such
868  * as _bt_lockbuf().
869  */
870 Buffer
 871 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 872 {
873  Buffer buf;
874 
875  if (blkno != P_NEW)
876  {
877  /* Read an existing block of the relation */
878  buf = ReadBuffer(rel, blkno);
879  _bt_lockbuf(rel, buf, access);
880  _bt_checkpage(rel, buf);
881  }
882  else
883  {
884  bool needLock;
885  Page page;
886 
887  Assert(access == BT_WRITE);
888 
889  /*
890  * First see if the FSM knows of any free pages.
891  *
892  * We can't trust the FSM's report unreservedly; we have to check that
893  * the page is still free. (For example, an already-free page could
894  * have been re-used between the time the last VACUUM scanned it and
895  * the time the VACUUM made its FSM updates.)
896  *
897  * In fact, it's worse than that: we can't even assume that it's safe
898  * to take a lock on the reported page. If somebody else has a lock
899  * on it, or even worse our own caller does, we could deadlock. (The
900  * own-caller scenario is actually not improbable. Consider an index
901  * on a serial or timestamp column. Nearly all splits will be at the
902  * rightmost page, so it's entirely likely that _bt_split will call us
903  * while holding a lock on the page most recently acquired from FSM. A
904  * VACUUM running concurrently with the previous split could well have
905  * placed that page back in FSM.)
906  *
907  * To get around that, we ask for only a conditional lock on the
908  * reported page. If we fail, then someone else is using the page,
909  * and we may reasonably assume it's not free. (If we happen to be
910  * wrong, the worst consequence is the page will be lost to use till
911  * the next VACUUM, which is no big problem.)
912  */
913  for (;;)
914  {
915  blkno = GetFreeIndexPage(rel);
916  if (blkno == InvalidBlockNumber)
917  break;
918  buf = ReadBuffer(rel, blkno);
919  if (_bt_conditionallockbuf(rel, buf))
920  {
921  page = BufferGetPage(buf);
922 
923  /*
924  * It's possible to find an all-zeroes page in an index. For
925  * example, a backend might successfully extend the relation
926  * one page and then crash before it is able to make a WAL
927  * entry for adding the page. If we find a zeroed page then
928  * reclaim it immediately.
929  */
930  if (PageIsNew(page))
931  {
932  /* Okay to use page. Initialize and return it. */
 933  _bt_pageinit(page, BufferGetPageSize(buf));
 934  return buf;
935  }
936 
937  if (BTPageIsRecyclable(page))
938  {
939  /*
940  * If we are generating WAL for Hot Standby then create a
941  * WAL record that will allow us to conflict with queries
942  * running on standby, in case they have snapshots older
943  * than safexid value
944  */
 945  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
 946  _bt_log_reuse_page(rel, blkno,
947  BTPageGetDeleteXid(page));
948 
949  /* Okay to use page. Re-initialize and return it. */
 950  _bt_pageinit(page, BufferGetPageSize(buf));
 951  return buf;
952  }
953  elog(DEBUG2, "FSM returned nonrecyclable page");
954  _bt_relbuf(rel, buf);
955  }
956  else
957  {
958  elog(DEBUG2, "FSM returned nonlockable page");
959  /* couldn't get lock, so just drop pin */
 960  ReleaseBuffer(buf);
 961  }
962  }
963 
964  /*
965  * Extend the relation by one page.
966  *
967  * We have to use a lock to ensure no one else is extending the rel at
968  * the same time, else we will both try to initialize the same new
969  * page. We can skip locking for new or temp relations, however,
970  * since no one else could be accessing them.
971  */
972  needLock = !RELATION_IS_LOCAL(rel);
973 
974  if (needLock)
 975  LockRelationForExtension(rel, ExclusiveLock);
 976 
977  buf = ReadBuffer(rel, P_NEW);
978 
979  /* Acquire buffer lock on new page */
980  _bt_lockbuf(rel, buf, BT_WRITE);
981 
982  /*
983  * Release the file-extension lock; it's now OK for someone else to
984  * extend the relation some more. Note that we cannot release this
985  * lock before we have buffer lock on the new page, or we risk a race
986  * condition against btvacuumscan --- see comments therein.
987  */
988  if (needLock)
 989  UnlockRelationForExtension(rel, ExclusiveLock);
 990 
991  /* Initialize the new page before returning it */
992  page = BufferGetPage(buf);
993  Assert(PageIsNew(page));
 994  _bt_pageinit(page, BufferGetPageSize(buf));
 995  }
996 
997  /* ref count and lock type are correct */
998  return buf;
999 }
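/*
 * Illustrative sketch (assumed caller, not code from this file): the usual
 * access pattern pairs _bt_getbuf with _bt_relbuf, holding the pin and lock
 * for as long as the page contents are examined ("blkno" assumed valid).
 *
 *     Buffer      buf = _bt_getbuf(rel, blkno, BT_READ);
 *     Page        page = BufferGetPage(buf);
 *     BTPageOpaque opaque = BTPageGetOpaque(page);
 *
 *     ... inspect page while pin and read lock are held ...
 *
 *     _bt_relbuf(rel, buf);
 */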
1000 
1001 /*
1002  * _bt_relandgetbuf() -- release a locked buffer and get another one.
1003  *
1004  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
1005  * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
1006  * then it reduces to just _bt_getbuf; allowing this case simplifies some
1007  * callers.
1008  *
1009  * The original motivation for using this was to avoid two entries to the
1010  * bufmgr when one would do. However, now it's mainly just a notational
1011  * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
1012  * is when the target page is the same one already in the buffer.
1013  */
1014 Buffer
 1015 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 1016 {
1017  Buffer buf;
1018 
1019  Assert(blkno != P_NEW);
1020  if (BufferIsValid(obuf))
1021  _bt_unlockbuf(rel, obuf);
1022  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1023  _bt_lockbuf(rel, buf, access);
1024 
1025  _bt_checkpage(rel, buf);
1026  return buf;
1027 }
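/*
 * Illustrative sketch (assumed caller, not code from this file): stepping
 * right along a level, as in the fast-root loop in _bt_getroot() above,
 * typically uses _bt_relandgetbuf so the previous buffer is released as the
 * next one is acquired.
 *
 *     while (P_IGNORE(opaque) && !P_RIGHTMOST(opaque))
 *     {
 *         blkno = opaque->btpo_next;
 *         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
 *         opaque = BTPageGetOpaque(BufferGetPage(buf));
 *     }
 */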
1028 
1029 /*
1030  * _bt_relbuf() -- release a locked buffer.
1031  *
1032  * Lock and pin (refcount) are both dropped.
1033  */
1034 void
 1035 _bt_relbuf(Relation rel, Buffer buf)
 1036 {
1037  _bt_unlockbuf(rel, buf);
1038  ReleaseBuffer(buf);
1039 }
1040 
1041 /*
1042  * _bt_lockbuf() -- lock a pinned buffer.
1043  *
1044  * Lock is acquired without acquiring another pin. This is like a raw
1045  * LockBuffer() call, but performs extra steps needed by Valgrind.
1046  *
1047  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1048  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1049  */
1050 void
 1051 _bt_lockbuf(Relation rel, Buffer buf, int access)
 1052 {
1053  /* LockBuffer() asserts that pin is held by this backend */
1054  LockBuffer(buf, access);
1055 
1056  /*
1057  * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1058  * an nbtree error (e.g. a unique violation error). That won't cause
1059  * Valgrind false positives.
1060  *
1061  * The nbtree client requests are superimposed on top of the bufmgr.c
1062  * buffer pin client requests. In the event of an nbtree error the buffer
1063  * will certainly get marked as defined when the backend once again
1064  * acquires its first pin on the buffer. (Of course, if the backend never
1065  * touches the buffer again then it doesn't matter that it remains
1066  * non-accessible to Valgrind.)
1067  *
1068  * Note: When an IndexTuple C pointer gets computed using an ItemId read
1069  * from a page while a lock was held, the C pointer becomes unsafe to
1070  * dereference forever as soon as the lock is released. Valgrind can only
1071  * detect cases where the pointer gets dereferenced with no _current_
1072  * lock/pin held, though.
1073  */
1074  if (!RelationUsesLocalBuffers(rel))
 1075  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 1076 }
1077 
1078 /*
1079  * _bt_unlockbuf() -- unlock a pinned buffer.
1080  */
1081 void
 1082 _bt_unlockbuf(Relation rel, Buffer buf)
 1083 {
1084  /*
1085  * Buffer is pinned and locked, which means that it is expected to be
1086  * defined and addressable. Check that proactively.
1087  */
 1088  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 1089 
 1090  /* LockBuffer() asserts that pin is held by this backend */
 1091  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 1092 
 1093  if (!RelationUsesLocalBuffers(rel))
 1094  VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
 1095 }
1096 
1097 /*
1098  * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1099  * buffer.
1100  *
1101  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1102  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1103  */
1104 bool
 1105 _bt_conditionallockbuf(Relation rel, Buffer buf)
 1106 {
1107  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1108  if (!ConditionalLockBuffer(buf))
1109  return false;
1110 
1111  if (!RelationUsesLocalBuffers(rel))
 1112  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 1113 
1114  return true;
1115 }
1116 
1117 /*
1118  * _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
1119  */
1120 void
 1121 _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
 1122 {
 1123  /*
 1124  * Buffer is pinned and locked, which means that it is expected to be
 1125  * defined and addressable. Check that proactively.
 1126  */
 1127  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 1128 
 1129  /* LockBuffer() asserts that pin is held by this backend */
 1130  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 1131  LockBufferForCleanup(buf);
 1132 }
1133 
1134 /*
1135  * _bt_pageinit() -- Initialize a new page.
1136  *
1137  * On return, the page header is initialized; data space is empty;
1138  * special space is zeroed out.
1139  */
1140 void
 1141 _bt_pageinit(Page page, Size size)
 1142 {
1143  PageInit(page, size, sizeof(BTPageOpaqueData));
1144 }
1145 
1146 /*
1147  * Delete item(s) from a btree leaf page during VACUUM.
1148  *
1149  * This routine assumes that the caller already has a full cleanup lock on
1150  * the buffer. Also, the given deletable and updatable arrays *must* be
1151  * sorted in ascending order.
1152  *
1153  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1154  * in an existing posting list item are to be removed. This works by
1155  * updating/overwriting an existing item with caller's new version of the item
1156  * (a version that lacks the TIDs that are to be deleted).
1157  *
1158  * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
1159  * generate their own snapshotConflictHorizon directly from the tableam,
1160  * whereas VACUUMs rely on the initial VACUUM table scan performing
1161  * WAL-logging that takes care of the issue for the table's indexes
1162  * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree
1163  * deletes don't do.
1164  */
1165 void
 1166 _bt_delitems_vacuum(Relation rel, Buffer buf,
 1167  OffsetNumber *deletable, int ndeletable,
1168  BTVacuumPosting *updatable, int nupdatable)
1169 {
1170  Page page = BufferGetPage(buf);
1171  BTPageOpaque opaque;
1172  bool needswal = RelationNeedsWAL(rel);
1173  char *updatedbuf = NULL;
1174  Size updatedbuflen = 0;
1175  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1176 
1177  /* Shouldn't be called unless there's something to do */
1178  Assert(ndeletable > 0 || nupdatable > 0);
1179 
1180  /* Generate new version of posting lists without deleted TIDs */
1181  if (nupdatable > 0)
1182  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1183  updatedoffsets, &updatedbuflen,
1184  needswal);
1185 
1186  /* No ereport(ERROR) until changes are logged */
 1187  START_CRIT_SECTION();
 1188 
1189  /*
1190  * Handle posting tuple updates.
1191  *
1192  * Deliberately do this before handling simple deletes. If we did it the
1193  * other way around (i.e. WAL record order -- simple deletes before
1194  * updates) then we'd have to make compensating changes to the 'updatable'
1195  * array of offset numbers.
1196  *
1197  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1198  * happens to already be set. It's important that we not interfere with
1199  * any future simple index tuple deletion operations.
1200  */
1201  for (int i = 0; i < nupdatable; i++)
1202  {
1203  OffsetNumber updatedoffset = updatedoffsets[i];
1204  IndexTuple itup;
1205  Size itemsz;
1206 
1207  itup = updatable[i]->itup;
1208  itemsz = MAXALIGN(IndexTupleSize(itup));
1209  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1210  itemsz))
1211  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
 1212  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
 1213  }
1214 
1215  /* Now handle simple deletes of entire tuples */
1216  if (ndeletable > 0)
1217  PageIndexMultiDelete(page, deletable, ndeletable);
1218 
1219  /*
1220  * We can clear the vacuum cycle ID since this page has certainly been
1221  * processed by the current vacuum scan.
1222  */
1223  opaque = BTPageGetOpaque(page);
1224  opaque->btpo_cycleid = 0;
1225 
1226  /*
1227  * Clear the BTP_HAS_GARBAGE page flag.
1228  *
1229  * This flag indicates the presence of LP_DEAD items on the page (though
1230  * not reliably). Note that we only rely on it with pg_upgrade'd
1231  * !heapkeyspace indexes. That's why clearing it here won't usually
1232  * interfere with simple index tuple deletion.
1233  */
1234  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1235 
 1236  MarkBufferDirty(buf);
 1237 
1238  /* XLOG stuff */
1239  if (needswal)
1240  {
1241  XLogRecPtr recptr;
1242  xl_btree_vacuum xlrec_vacuum;
1243 
1244  xlrec_vacuum.ndeleted = ndeletable;
1245  xlrec_vacuum.nupdated = nupdatable;
1246 
1247  XLogBeginInsert();
 1248  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1249  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1250 
1251  if (ndeletable > 0)
1252  XLogRegisterBufData(0, (char *) deletable,
1253  ndeletable * sizeof(OffsetNumber));
1254 
1255  if (nupdatable > 0)
1256  {
1257  XLogRegisterBufData(0, (char *) updatedoffsets,
1258  nupdatable * sizeof(OffsetNumber));
1259  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1260  }
1261 
1262  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1263 
1264  PageSetLSN(page, recptr);
1265  }
1266 
1267  END_CRIT_SECTION();
1268 
1269  /* can't leak memory here */
1270  if (updatedbuf != NULL)
1271  pfree(updatedbuf);
1272  /* free tuples allocated within _bt_delitems_update() */
1273  for (int i = 0; i < nupdatable; i++)
1274  pfree(updatable[i]->itup);
1275 }
1276 
1277 /*
1278  * Delete item(s) from a btree leaf page during single-page cleanup.
1279  *
1280  * This routine assumes that the caller has pinned and write locked the
1281  * buffer. Also, the given deletable and updatable arrays *must* be sorted in
1282  * ascending order.
1283  *
1284  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1285  * in an existing posting list item are to be removed. This works by
1286  * updating/overwriting an existing item with caller's new version of the item
1287  * (a version that lacks the TIDs that are to be deleted).
1288  *
1289  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1290  * the page, but it needs its own snapshotConflictHorizon (caller gets this
1291  * from tableam). This is used by the REDO routine to generate recovery
1292  * conflicts. The other difference is that only _bt_delitems_vacuum will
1293  * clear page's VACUUM cycle ID.
1294  */
1295 static void
 1296 _bt_delitems_delete(Relation rel, Buffer buf,
 1297  TransactionId snapshotConflictHorizon,
1298  OffsetNumber *deletable, int ndeletable,
1299  BTVacuumPosting *updatable, int nupdatable)
1300 {
1301  Page page = BufferGetPage(buf);
1302  BTPageOpaque opaque;
1303  bool needswal = RelationNeedsWAL(rel);
1304  char *updatedbuf = NULL;
1305  Size updatedbuflen = 0;
1306  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1307 
1308  /* Shouldn't be called unless there's something to do */
1309  Assert(ndeletable > 0 || nupdatable > 0);
1310 
1311  /* Generate new versions of posting lists without deleted TIDs */
1312  if (nupdatable > 0)
1313  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1314  updatedoffsets, &updatedbuflen,
1315  needswal);
1316 
1317  /* No ereport(ERROR) until changes are logged */
 1318  START_CRIT_SECTION();
 1319 
1320  /* Handle updates and deletes just like _bt_delitems_vacuum */
1321  for (int i = 0; i < nupdatable; i++)
1322  {
1323  OffsetNumber updatedoffset = updatedoffsets[i];
1324  IndexTuple itup;
1325  Size itemsz;
1326 
1327  itup = updatable[i]->itup;
1328  itemsz = MAXALIGN(IndexTupleSize(itup));
1329  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1330  itemsz))
1331  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
 1332  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
 1333  }
1334 
1335  if (ndeletable > 0)
1336  PageIndexMultiDelete(page, deletable, ndeletable);
1337 
1338  /*
1339  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1340  * this point. The VACUUM command alone controls vacuum cycle IDs.
1341  */
1342  opaque = BTPageGetOpaque(page);
1343 
1344  /*
1345  * Clear the BTP_HAS_GARBAGE page flag.
1346  *
1347  * This flag indicates the presence of LP_DEAD items on the page (though
1348  * not reliably). Note that we only rely on it with pg_upgrade'd
1349  * !heapkeyspace indexes.
1350  */
1351  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1352 
 1353  MarkBufferDirty(buf);
 1354 
1355  /* XLOG stuff */
1356  if (needswal)
1357  {
1358  XLogRecPtr recptr;
1359  xl_btree_delete xlrec_delete;
1360 
1361  xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
1362  xlrec_delete.ndeleted = ndeletable;
1363  xlrec_delete.nupdated = nupdatable;
1364 
1365  XLogBeginInsert();
 1366  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1367  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1368 
1369  if (ndeletable > 0)
1370  XLogRegisterBufData(0, (char *) deletable,
1371  ndeletable * sizeof(OffsetNumber));
1372 
1373  if (nupdatable > 0)
1374  {
1375  XLogRegisterBufData(0, (char *) updatedoffsets,
1376  nupdatable * sizeof(OffsetNumber));
1377  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1378  }
1379 
1380  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1381 
1382  PageSetLSN(page, recptr);
1383  }
1384 
1385  END_CRIT_SECTION();
1386 
1387  /* can't leak memory here */
1388  if (updatedbuf != NULL)
1389  pfree(updatedbuf);
1390  /* free tuples allocated within _bt_delitems_update() */
1391  for (int i = 0; i < nupdatable; i++)
1392  pfree(updatable[i]->itup);
1393 }
1394 
1395 /*
1396  * Set up state needed to delete TIDs from posting list tuples via "updating"
1397  * the tuple. Performs steps common to both _bt_delitems_vacuum and
1398  * _bt_delitems_delete. These steps must take place before each function's
1399  * critical section begins.
1400  *
1401  * updatable and nupdatable are inputs, though note that we will use
1402  * _bt_update_posting() to replace the original itup with a pointer to a final
1403  * version in palloc()'d memory. Caller should free the tuples when its done.
1404  *
1405  * The first nupdatable entries from updatedoffsets are set to the page offset
1406  * number for posting list tuples that caller updates. This is mostly useful
1407  * because caller may need to WAL-log the page offsets (though we always do
1408  * this for caller out of convenience).
1409  *
1410  * Returns buffer consisting of an array of xl_btree_update structs that
1411  * describe the steps we perform here for caller (though only when needswal is
1412  * true). Also sets *updatedbuflen to the final size of the buffer. This
1413  * buffer is used by caller when WAL logging is required.
1414  */
1415 static char *
1416 _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
1417  OffsetNumber *updatedoffsets, Size *updatedbuflen,
1418  bool needswal)
1419 {
1420  char *updatedbuf = NULL;
1421  Size buflen = 0;
1422 
1423  /* Shouldn't be called unless there's something to do */
1424  Assert(nupdatable > 0);
1425 
1426  for (int i = 0; i < nupdatable; i++)
1427  {
1428  BTVacuumPosting vacposting = updatable[i];
1429  Size itemsz;
1430 
1431  /* Replace work area IndexTuple with updated version */
1432  _bt_update_posting(vacposting);
1433 
1434  /* Keep track of size of xl_btree_update for updatedbuf in passing */
1435  itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1436  buflen += itemsz;
1437 
1438  /* Build updatedoffsets buffer in passing */
1439  updatedoffsets[i] = vacposting->updatedoffset;
1440  }
1441 
1442  /* XLOG stuff */
1443  if (needswal)
1444  {
1445  Size offset = 0;
1446 
1447  /* Allocate, set final size for caller */
1448  updatedbuf = palloc(buflen);
1449  *updatedbuflen = buflen;
1450  for (int i = 0; i < nupdatable; i++)
1451  {
1452  BTVacuumPosting vacposting = updatable[i];
1453  Size itemsz;
1454  xl_btree_update update;
1455 
1456  update.ndeletedtids = vacposting->ndeletedtids;
1457  memcpy(updatedbuf + offset, &update.ndeletedtids,
 1458  sizeof(uint16));
 1459  offset += SizeOfBtreeUpdate;
1460 
1461  itemsz = update.ndeletedtids * sizeof(uint16);
1462  memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1463  offset += itemsz;
1464  }
1465  }
1466 
1467  return updatedbuf;
1468 }
1469 
1470 /*
1471  * Comparator used by _bt_delitems_delete_check() to restore deltids array
1472  * back to its original leaf-page-wise sort order
1473  */
1474 static int
1475 _bt_delitems_cmp(const void *a, const void *b)
1476 {
1477  TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1478  TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1479 
1480  if (indexdelete1->id > indexdelete2->id)
1481  return 1;
1482  if (indexdelete1->id < indexdelete2->id)
1483  return -1;
1484 
1485  Assert(false);
1486 
1487  return 0;
1488 }
1489 
1490 /*
1491  * Try to delete item(s) from a btree leaf page during single-page cleanup.
1492  *
1493  * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
1494  * tuples from caller's deltids array: those whose TIDs are found safe to
1495  * delete by the tableam (or already marked LP_DEAD in index, and so already
1496  * known to be deletable by our simple index deletion caller). We physically
1497  * delete index tuples from buf leaf page last of all (for index tuples where
1498  * that is known to be safe following our table_index_delete_tuples() call).
1499  *
1500  * Simple index deletion caller only includes TIDs from index tuples marked
1501  * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1502  * included without increasing the total number of distinct table blocks for
1503  * the deletion operation as a whole. This approach often allows us to delete
1504  * some extra index tuples that were practically free for tableam to check in
1505  * passing (when they actually turn out to be safe to delete). It probably
1506  * only makes sense for the tableam to go ahead with these extra checks when
1507  * it is block-oriented (otherwise the checks probably won't be practically
1508  * free, which we rely on). The tableam interface requires the tableam side
1509  * to handle the problem, though, so this is okay (we as an index AM are free
1510  * to make the simplifying assumption that all tableams must be block-based).
1511  *
1512  * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1513  * without expecting that tableam will check most of them. The tableam has
1514  * considerable discretion around which entries/blocks it checks. Our role in
1515  * costing the bottom-up deletion operation is strictly advisory.
1516  *
1517  * Note: Caller must have added deltids entries (i.e. entries that go in
1518  * delstate's main array) in leaf-page-wise order: page offset number order,
1519  * TID order among entries taken from the same posting list tuple (tiebreak on
1520  * TID). This order is convenient to work with here.
1521  *
1522  * Note: We also rely on the id field of each deltids element "capturing" this
1523  * original leaf-page-wise order. That is, we expect to be able to get back
1524  * to the original leaf-page-wise order just by sorting deltids on the id
1525  * field (tableam will sort deltids for its own reasons, so we'll need to put
1526  * it back in leaf-page-wise order afterwards).
1527  */
1528 void
 1529 _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
 1530  TM_IndexDeleteOp *delstate)
1531 {
1532  Page page = BufferGetPage(buf);
1533  TransactionId snapshotConflictHorizon;
1534  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1535  int ndeletable = 0,
1536  nupdatable = 0;
 1537  OffsetNumber deletable[MaxIndexTuplesPerPage];
 1538  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
 1539 
1540  /* Use tableam interface to determine which tuples to delete first */
1541  snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1542 
1543  /* Should not WAL-log snapshotConflictHorizon unless it's required */
1544  if (!XLogStandbyInfoActive())
1545  snapshotConflictHorizon = InvalidTransactionId;
1546 
1547  /*
1548  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1549  * needs to do to physically delete index tuples from the page.
1550  *
1551  * Must sort deltids array to restore leaf-page-wise order (original order
1552  * before call to tableam). This is the order that the loop expects.
1553  *
1554  * Note that deltids array might be a lot smaller now. It might even have
1555  * no entries at all (with bottom-up deletion caller), in which case there
1556  * is nothing left to do.
1557  */
1558  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
 1559  _bt_delitems_cmp);
 1560  if (delstate->ndeltids == 0)
1561  {
1562  Assert(delstate->bottomup);
1563  return;
1564  }
1565 
1566  /* We definitely have to delete at least one index tuple (or one TID) */
1567  for (int i = 0; i < delstate->ndeltids; i++)
1568  {
1569  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1570  OffsetNumber idxoffnum = dstatus->idxoffnum;
1571  ItemId itemid = PageGetItemId(page, idxoffnum);
1572  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1573  int nestedi,
1574  nitem;
1575  BTVacuumPosting vacposting;
1576 
1577  Assert(OffsetNumberIsValid(idxoffnum));
1578 
1579  if (idxoffnum == postingidxoffnum)
1580  {
1581  /*
1582  * This deltid entry is a TID from a posting list tuple that has
1583  * already been completely processed
1584  */
1585  Assert(BTreeTupleIsPosting(itup));
 1586  Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
 1587  &delstate->deltids[i].tid) < 0);
 1588  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
 1589  &delstate->deltids[i].tid) >= 0);
1590  continue;
1591  }
1592 
1593  if (!BTreeTupleIsPosting(itup))
1594  {
1595  /* Plain non-pivot tuple */
1596  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1597  if (dstatus->knowndeletable)
1598  deletable[ndeletable++] = idxoffnum;
1599  continue;
1600  }
1601 
1602  /*
1603  * itup is a posting list tuple whose lowest deltids entry (which may
1604  * or may not be for the first TID from itup) is considered here now.
1605  * We should process all of the deltids entries for the posting list
1606  * together now, though (not just the lowest). Remember to skip over
1607  * later itup-related entries during later iterations of outermost
1608  * loop.
1609  */
1610  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1611  nestedi = i; /* Initialize for first itup deltids entry */
1612  vacposting = NULL; /* Describes final action for itup */
1613  nitem = BTreeTupleGetNPosting(itup);
1614  for (int p = 0; p < nitem; p++)
1615  {
1616  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1617  int ptidcmp = -1;
1618 
1619  /*
1620  * This nested loop reuses work across ptid TIDs taken from itup.
1621  * We take advantage of the fact that both itup's TIDs and deltids
1622  * entries (within a single itup/posting list grouping) must both
1623  * be in ascending TID order.
1624  */
1625  for (; nestedi < delstate->ndeltids; nestedi++)
1626  {
1627  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1628  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1629 
1630  /* Stop once we get past all itup related deltids entries */
1631  Assert(tdstatus->idxoffnum >= idxoffnum);
1632  if (tdstatus->idxoffnum != idxoffnum)
1633  break;
1634 
1635  /* Skip past non-deletable itup related entries up front */
1636  if (!tdstatus->knowndeletable)
1637  continue;
1638 
1639  /* Entry is first partial ptid match (or an exact match)? */
1640  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1641  if (ptidcmp >= 0)
1642  {
1643  /* Greater than or equal (partial or exact) match... */
1644  break;
1645  }
1646  }
1647 
1648  /* ...exact ptid match to a deletable deltids entry? */
1649  if (ptidcmp != 0)
1650  continue;
1651 
1652  /* Exact match for deletable deltids entry -- ptid gets deleted */
1653  if (vacposting == NULL)
1654  {
1655  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1656  nitem * sizeof(uint16));
1657  vacposting->itup = itup;
1658  vacposting->updatedoffset = idxoffnum;
1659  vacposting->ndeletedtids = 0;
1660  }
1661  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1662  }
1663 
1664  /* Final decision on itup, a posting list tuple */
1665 
1666  if (vacposting == NULL)
1667  {
1668  /* No TIDs to delete from itup -- do nothing */
1669  }
1670  else if (vacposting->ndeletedtids == nitem)
1671  {
1672  /* Straight delete of itup (to delete all TIDs) */
1673  deletable[ndeletable++] = idxoffnum;
1674  /* Turns out we won't need granular information */
1675  pfree(vacposting);
1676  }
1677  else
1678  {
1679  /* Delete some (but not all) TIDs from itup */
1680  Assert(vacposting->ndeletedtids > 0 &&
1681  vacposting->ndeletedtids < nitem);
1682  updatable[nupdatable++] = vacposting;
1683  }
1684  }
1685 
1686  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1687  _bt_delitems_delete(rel, buf, snapshotConflictHorizon,
1688  deletable, ndeletable, updatable, nupdatable);
1689 
1690  /* be tidy */
1691  for (int i = 0; i < nupdatable; i++)
1692  pfree(updatable[i]);
1693 }
1694 
1695 /*
1696  * Check that leftsib page (the btpo_prev of target page) is not marked with
1697  * INCOMPLETE_SPLIT flag. Used during page deletion.
1698  *
1699  * Returning true indicates that page flag is set in leftsib (which is
1700  * definitely still the left sibling of target). When that happens, the
1701  * target doesn't have a downlink in parent, and the page deletion algorithm
1702  * isn't prepared to handle that. Deletion of the target page (or the whole
1703  * subtree that contains the target page) cannot take place.
1704  *
1705  * Caller should not have a lock on the target page itself, since pages on the
1706  * same level must always be locked left to right to avoid deadlocks.
1707  */
1708 static bool
 1709 _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
 1710 {
1711  Buffer buf;
1712  Page page;
1713  BTPageOpaque opaque;
1714  bool result;
1715 
1716  /* Easy case: No left sibling */
1717  if (leftsib == P_NONE)
1718  return false;
1719 
1720  buf = _bt_getbuf(rel, leftsib, BT_READ);
1721  page = BufferGetPage(buf);
1722  opaque = BTPageGetOpaque(page);
1723 
1724  /*
1725  * If the left sibling was concurrently split, so that its next-pointer
1726  * doesn't point to the current page anymore, the split that created
1727  * target must be completed. Caller can reasonably expect that there will
1728  * be a downlink to the target page that it can relocate using its stack.
1729  * (We don't allow splitting an incompletely split page again until the
1730  * previous split has been completed.)
1731  */
1732  result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1733  _bt_relbuf(rel, buf);
1734 
1735  return result;
1736 }
1737 
1738 /*
1739  * Check that leafrightsib page (the btpo_next of target leaf page) is not
1740  * marked with ISHALFDEAD flag. Used during page deletion.
1741  *
1742  * Returning true indicates that page flag is set in leafrightsib, so page
1743  * deletion cannot go ahead. Our caller is not prepared to deal with the case
1744  * where the parent page does not have a pivot tuple whose downlink points to
1745  * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1746  * seem worth going to the trouble of teaching our caller to deal with it.
1747  * The situation will be resolved after VACUUM finishes the deletion of the
1748  * half-dead page (when a future VACUUM operation reaches the target page
1749  * again).
1750  *
1751  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1752  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1753  * okay because of the restriction on deleting pages that are the rightmost
1754  * page of their parent (i.e. that such deletions can only take place when the
1755  * entire subtree must be deleted). The leaf level check made here will apply
1756  * to a right "cousin" leaf page rather than a simple right sibling leaf page
1757  * in cases where caller actually goes on to attempt deleting pages that are
1758  * above the leaf page. The right cousin leaf page is representative of the
1759  * left edge of the subtree to the right of the to-be-deleted subtree as a
1760  * whole, which is exactly the condition that our caller cares about.
1761  * (Besides, internal pages are never marked half-dead, so it isn't even
1762  * possible to _directly_ assess if an internal page is part of some other
1763  * to-be-deleted subtree.)
1764  */
1765 static bool
1766 _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1767 {
1768  Buffer buf;
1769  Page page;
1770  BTPageOpaque opaque;
1771  bool result;
1772 
1773  Assert(leafrightsib != P_NONE);
1774 
1775  buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1776  page = BufferGetPage(buf);
1777  opaque = BTPageGetOpaque(page);
1778 
1779  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1780  result = P_ISHALFDEAD(opaque);
1781  _bt_relbuf(rel, buf);
1782 
1783  return result;
1784 }
1785 
1786 /*
1787  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1788  *
1789  * This action unlinks the leaf page from the b-tree structure, removing all
1790  * pointers leading to it --- but not touching its own left and right links.
1791  * The page cannot be physically reclaimed right away, since other processes
1792  * may currently be trying to follow links leading to the page; they have to
1793  * be allowed to use its right-link to recover. See nbtree/README.
1794  *
1795  * On entry, the target buffer must be pinned and locked (either read or write
1796  * lock is OK). The page must be an empty leaf page, which may be half-dead
1797  * already (a half-dead page should only be passed to us when an earlier
1798  * VACUUM operation was interrupted, though). Note in particular that caller
1799  * should never pass a buffer containing an existing deleted page here. The
1800  * lock and pin on caller's buffer will be dropped before we return.
1801  *
1802  * Maintains bulk delete stats for caller, which are taken from vstate. We
1803  * need to cooperate closely with caller here so that whole VACUUM operation
1804  * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1805  * delete in passing. If such pages happen to be from a block number that is
1806  * ahead of the current scanblkno position, then caller is expected to count
1807  * them directly later on. It's simpler for us to understand caller's
1808  * requirements than it would be for caller to understand when or how a
1809  * deleted page became deleted after the fact.
1810  *
1811  * NOTE: this leaks memory. Rather than trying to clean up everything
1812  * carefully, it's better to run it in a temp context that can be reset
1813  * frequently.
1814  */
1815 void
1816 _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
1817 {
1818  BlockNumber rightsib;
1819  bool rightsib_empty;
1820  Page page;
1821  BTPageOpaque opaque;
1822 
1823  /*
1824  * Save original leafbuf block number from caller. Only deleted blocks
1825  * that are <= scanblkno are added to bulk delete stat's pages_deleted
1826  * count.
1827  */
1828  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1829 
1830  /*
1831  * "stack" is a search stack leading (approximately) to the target page.
1832  * It is initially NULL, but when iterating, we keep it to avoid
1833  * duplicated search effort.
1834  *
1835  * Also, when "stack" is not NULL, we have already checked that the
1836  * current page is not the right half of an incomplete split, i.e. the
1837  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1838  * when the current target page is to the right of caller's initial page
1839  * (the scanblkno page).
1840  */
1841  BTStack stack = NULL;
1842 
1843  for (;;)
1844  {
1845  page = BufferGetPage(leafbuf);
1846  opaque = BTPageGetOpaque(page);
1847 
1848  /*
1849  * Internal pages are never deleted directly, only as part of deleting
1850  * the whole subtree all the way down to leaf level.
1851  *
1852  * Also check for deleted pages here. Caller never passes us a fully
1853  * deleted page. Only VACUUM can delete pages, so there can't have
1854  * been a concurrent deletion. Assume that we reached any deleted
1855  * page encountered here by following a sibling link, and that the
1856  * index is corrupt.
1857  */
1858  Assert(!P_ISDELETED(opaque));
1859  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1860  {
1861  /*
1862  * Pre-9.4 page deletion only marked internal pages as half-dead,
1863  * but now we only use that flag on leaf pages. The old algorithm
1864  * was never supposed to leave half-dead pages in the tree; it was
1865  * just a transient state, but it was nevertheless possible in
1866  * error scenarios. We don't know how to deal with them here. They
1867  * are harmless as far as searches are concerned, but inserts
1868  * into the deleted keyspace could add out-of-order downlinks in
1869  * the upper levels. Log a notice; hopefully the admin will notice
1870  * and reindex.
1871  */
1872  if (P_ISHALFDEAD(opaque))
1873  ereport(LOG,
1874  (errcode(ERRCODE_INDEX_CORRUPTED),
1875  errmsg("index \"%s\" contains a half-dead internal page",
1876  RelationGetRelationName(rel)),
1877  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1878 
1879  if (P_ISDELETED(opaque))
1880  ereport(LOG,
1881  (errcode(ERRCODE_INDEX_CORRUPTED),
1882  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1883  BufferGetBlockNumber(leafbuf),
1884  scanblkno,
1885  RelationGetRelationName(rel))));
1886 
1887  _bt_relbuf(rel, leafbuf);
1888  return;
1889  }
1890 
1891  /*
1892  * We can never delete rightmost pages nor root pages. While at it,
1893  * check that page is empty, since it's possible that the leafbuf page
1894  * was empty a moment ago, but has since had some inserts.
1895  *
1896  * To keep the algorithm simple, we also never delete an incompletely
1897  * split page (they should be rare enough that this doesn't make any
1898  * meaningful difference to disk usage):
1899  *
1900  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1901  * left half of an incomplete split, but ensuring that it's not the
1902  * right half is more complicated. For that, we have to check that
1903  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1904  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1905  * release the lock on scanblkno/leafbuf, check the left sibling, and
1906  * construct a search stack to scanblkno. On subsequent iterations,
1907  * we know we stepped right from a page that passed these tests, so
1908  * it's OK.
1909  */
1910  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1911  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1912  P_INCOMPLETE_SPLIT(opaque))
1913  {
1914  /* Should never fail to delete a half-dead page */
1915  Assert(!P_ISHALFDEAD(opaque));
1916 
1917  _bt_relbuf(rel, leafbuf);
1918  return;
1919  }
1920 
1921  /*
1922  * First, remove downlink pointing to the page (or a parent of the
1923  * page, if we are going to delete a taller subtree), and mark the
1924  * leafbuf page half-dead
1925  */
1926  if (!P_ISHALFDEAD(opaque))
1927  {
1928  /*
1929  * We need an approximate pointer to the page's parent page. We
1930  * use a variant of the standard search mechanism to search for
1931  * the page's high key; this will give us a link to either the
1932  * current parent or someplace to its left (if there are multiple
1933  * equal high keys, which is possible with !heapkeyspace indexes).
1934  *
1935  * Also check if this is the right-half of an incomplete split
1936  * (see comment above).
1937  */
1938  if (!stack)
1939  {
1940  BTScanInsert itup_key;
1941  ItemId itemid;
1942  IndexTuple targetkey;
1943  BlockNumber leftsib,
1944  leafblkno;
1945  Buffer sleafbuf;
1946 
1947  itemid = PageGetItemId(page, P_HIKEY);
1948  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1949 
1950  leftsib = opaque->btpo_prev;
1951  leafblkno = BufferGetBlockNumber(leafbuf);
1952 
1953  /*
1954  * To avoid deadlocks, we'd better drop the leaf page lock
1955  * before going further.
1956  */
1957  _bt_unlockbuf(rel, leafbuf);
1958 
1959  /*
1960  * Check that the left sibling of leafbuf (if any) is not
1961  * marked with INCOMPLETE_SPLIT flag before proceeding
1962  */
1963  Assert(leafblkno == scanblkno);
1964  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1965  {
1966  ReleaseBuffer(leafbuf);
1967  return;
1968  }
1969 
1970  /* we need an insertion scan key for the search, so build one */
1971  itup_key = _bt_mkscankey(rel, targetkey);
1972  /* find the leftmost leaf page with matching pivot/high key */
1973  itup_key->pivotsearch = true;
1974  stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1975  /* won't need a second lock or pin on leafbuf */
1976  _bt_relbuf(rel, sleafbuf);
1977 
1978  /*
1979  * Re-lock the leaf page, and start over to use our stack
1980  * within _bt_mark_page_halfdead. We must do it that way
1981  * because it's possible that leafbuf can no longer be
1982  * deleted. We need to recheck.
1983  *
1984  * Note: We can't simply hold on to the sleafbuf lock instead,
1985  * because it's barely possible that sleafbuf is not the same
1986  * page as leafbuf. This happens when leafbuf split after our
1987  * original lock was dropped, but before _bt_search finished
1988  * its descent. We rely on the assumption that we'll find
1989  * leafbuf isn't safe to delete anymore in this scenario.
1990  * (Page deletion can cope with the stack being to the left of
1991  * leafbuf, but not to the right of leafbuf.)
1992  */
1993  _bt_lockbuf(rel, leafbuf, BT_WRITE);
1994  continue;
1995  }
1996 
1997  /*
1998  * See if it's safe to delete the leaf page, and determine how
1999  * many parent/internal pages above the leaf level will be
2000  * deleted. If it's safe then _bt_mark_page_halfdead will also
2001  * perform the first phase of deletion, which includes marking the
2002  * leafbuf page half-dead.
2003  */
2004  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2005  if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
2006  {
2007  _bt_relbuf(rel, leafbuf);
2008  return;
2009  }
2010  }
2011 
2012  /*
2013  * Then unlink it from its siblings. Each call to
2014  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2015  * making it shallower. Iterate until the leafbuf page is deleted.
2016  */
2017  rightsib_empty = false;
2018  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2019  while (P_ISHALFDEAD(opaque))
2020  {
2021  /* Check for interrupts in _bt_unlink_halfdead_page */
2022  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2023  &rightsib_empty, vstate))
2024  {
2025  /*
2026  * _bt_unlink_halfdead_page should never fail, since we
2027  * established that deletion is generally safe in
2028  * _bt_mark_page_halfdead -- index must be corrupt.
2029  *
2030  * Note that _bt_unlink_halfdead_page already released the
2031  * lock and pin on leafbuf for us.
2032  */
2033  Assert(false);
2034  return;
2035  }
2036  }
2037 
2038  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2039 
2040  rightsib = opaque->btpo_next;
2041 
2042  _bt_relbuf(rel, leafbuf);
2043 
2044  /*
2045  * Check here, as calling loops will have locks held, preventing
2046  * interrupts from being processed.
2047  */
2048  CHECK_FOR_INTERRUPTS();
2049 
2050  /*
2051  * The page has now been deleted. If its right sibling is completely
2052  * empty, it's possible that the reason we haven't deleted it earlier
2053  * is that it was the rightmost child of the parent. Now that we
2054  * removed the downlink for this page, the right sibling might now be
2055  * the only child of the parent, and could be removed. It would be
2056  * picked up by the next vacuum anyway, but might as well try to
2057  * remove it now, so loop back to process the right sibling.
2058  *
2059  * Note: This relies on the assumption that _bt_getstackbuf() will be
2060  * able to reuse our original descent stack with a different child
2061  * block (provided that the child block is to the right of the
2062  * original leaf page reached by _bt_search()). It will even update
2063  * the descent stack each time we loop around, avoiding repeated work.
2064  */
2065  if (!rightsib_empty)
2066  break;
2067 
2068  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2069  }
2070 }
2071 
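/*
 * Illustrative standalone sketch (not part of nbtpage.c): the "<= scanblkno"
 * accounting rule described in _bt_pagedel()'s header comment, in isolation.
 * A page deleted in passing that lies ahead of the ongoing scan is left for
 * the scan itself to count later, which avoids double counting.  Names here
 * are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
toy_count_now(unsigned deleted_blkno, unsigned scanblkno)
{
	/* count immediately only if the scan position has already passed it */
	return deleted_blkno <= scanblkno;
}

int
main(void)
{
	printf("%d\n", toy_count_now(10, 25));	/* 1: behind the scan, count now */
	printf("%d\n", toy_count_now(30, 25));	/* 0: ahead, counted later */
	return 0;
}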
2072 /*
2073  * First stage of page deletion.
2074  *
2075  * Establish the height of the to-be-deleted subtree with leafbuf at its
2076  * lowest level, remove the downlink to the subtree, and mark leafbuf
2077  * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
2078  * but may include additional internal pages (at most one per level of the
2079  * tree below the root).
2080  *
2081  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2082  * the rightmost child of its parent (and parent has more than one downlink).
2083  * Returns 'true' when the first stage of page deletion completed
2084  * successfully.
2085  */
2086 static bool
2087 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
2088 {
2089  BlockNumber leafblkno;
2090  BlockNumber leafrightsib;
2091  BlockNumber topparent;
2092  BlockNumber topparentrightsib;
2093  ItemId itemid;
2094  Page page;
2095  BTPageOpaque opaque;
2096  Buffer subtreeparent;
2097  OffsetNumber poffset;
2098  OffsetNumber nextoffset;
2099  IndexTuple itup;
2100  IndexTupleData trunctuple;
2101 
2102  page = BufferGetPage(leafbuf);
2103  opaque = BTPageGetOpaque(page);
2104 
2105  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2106  P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2107  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2108 
2109  /*
2110  * Save info about the leaf page.
2111  */
2112  leafblkno = BufferGetBlockNumber(leafbuf);
2113  leafrightsib = opaque->btpo_next;
2114 
2115  /*
2116  * Before attempting to lock the parent page, check that the right sibling
2117  * is not in half-dead state. A half-dead right sibling would have no
2118  * downlink in the parent, which would be highly confusing later when we
2119  * delete the downlink. It would fail the "right sibling of target page
2120  * is also the next child in parent page" cross-check below.
2121  */
2122  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2123  {
2124  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2125  leafblkno, leafrightsib);
2126  return false;
2127  }
2128 
2129  /*
2130  * We cannot delete a page that is the rightmost child of its immediate
2131  * parent, unless it is the only child --- in which case the parent has to
2132  * be deleted too, and the same condition applies recursively to it. We
2133  * have to check this condition all the way up before trying to delete,
2134  * and lock the parent of the root of the to-be-deleted subtree (the
2135  * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2136  * for us. We remove the downlink to the "top parent" page (subtree root
2137  * page) from the subtree parent page below.
2138  *
2139  * Initialize topparent to be leafbuf page now. The final to-be-deleted
2140  * subtree is often a degenerate one page subtree consisting only of the
2141  * leafbuf page. When that happens, the leafbuf page is the final subtree
2142  * root page/top parent page.
2143  */
2144  topparent = leafblkno;
2145  topparentrightsib = leafrightsib;
2146  if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
2147  &subtreeparent, &poffset,
2148  &topparent, &topparentrightsib))
2149  return false;
2150 
2151  /*
2152  * Check that the parent-page index items we're about to delete/overwrite
2153  * in subtree parent page contain what we expect. This can fail if the
2154  * index has become corrupt for some reason. We want to throw any error
2155  * before entering the critical section --- otherwise it'd be a PANIC.
2156  */
2157  page = BufferGetPage(subtreeparent);
2158  opaque = BTPageGetOpaque(page);
2159 
2160 #ifdef USE_ASSERT_CHECKING
2161 
2162  /*
2163  * This is just an assertion because _bt_lock_subtree_parent should have
2164  * guaranteed tuple has the expected contents
2165  */
2166  itemid = PageGetItemId(page, poffset);
2167  itup = (IndexTuple) PageGetItem(page, itemid);
2168  Assert(BTreeTupleGetDownLink(itup) == topparent);
2169 #endif
2170 
2171  nextoffset = OffsetNumberNext(poffset);
2172  itemid = PageGetItemId(page, nextoffset);
2173  itup = (IndexTuple) PageGetItem(page, itemid);
2174  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2175  ereport(ERROR,
2176  (errcode(ERRCODE_INDEX_CORRUPTED),
2177  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2178  topparentrightsib, topparent,
2179  BTreeTupleGetDownLink(itup),
2180  BufferGetBlockNumber(subtreeparent),
2181  RelationGetRelationName(rel))));
2182 
2183  /*
2184  * Any insert which would have gone on the leaf block will now go to its
2185  * right sibling. In other words, the key space moves right.
2186  */
2187  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2188 
2189  /* No ereport(ERROR) until changes are logged */
2190  START_CRIT_SECTION();
2191 
2192  /*
2193  * Update parent of subtree. We want to delete the downlink to the top
2194  * parent page/root of the subtree, and the *following* key. Easiest way
2195  * is to copy the right sibling's downlink over the downlink that points
2196  * to top parent page, and then delete the right sibling's original pivot
2197  * tuple.
2198  *
2199  * Lanin and Shasha make the key space move left when deleting a page,
2200  * whereas the key space moves right here. That's why we cannot simply
2201  * delete the pivot tuple with the downlink to the top parent page. See
2202  * nbtree/README.
2203  */
2204  page = BufferGetPage(subtreeparent);
2205  opaque = BTPageGetOpaque(page);
2206 
2207  itemid = PageGetItemId(page, poffset);
2208  itup = (IndexTuple) PageGetItem(page, itemid);
2209  BTreeTupleSetDownLink(itup, topparentrightsib);
2210 
2211  nextoffset = OffsetNumberNext(poffset);
2212  PageIndexTupleDelete(page, nextoffset);
2213 
2214  /*
2215  * Mark the leaf page as half-dead, and stamp it with a link to the top
2216  * parent page. When the leaf page is also the top parent page, the link
2217  * is set to InvalidBlockNumber.
2218  */
2219  page = BufferGetPage(leafbuf);
2220  opaque = BTPageGetOpaque(page);
2221  opaque->btpo_flags |= BTP_HALF_DEAD;
2222 
2223  Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
2224  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2225  trunctuple.t_info = sizeof(IndexTupleData);
2226  if (topparent != leafblkno)
2227  BTreeTupleSetTopParent(&trunctuple, topparent);
2228  else
2229  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2230 
2231  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2232  IndexTupleSize(&trunctuple)))
2233  elog(ERROR, "could not overwrite high key in half-dead page");
2234 
2235  /* Must mark buffers dirty before XLogInsert */
2236  MarkBufferDirty(subtreeparent);
2237  MarkBufferDirty(leafbuf);
2238 
2239  /* XLOG stuff */
2240  if (RelationNeedsWAL(rel))
2241  {
2242  xl_btree_mark_page_halfdead xlrec;
2243  XLogRecPtr recptr;
2244 
2245  xlrec.poffset = poffset;
2246  xlrec.leafblk = leafblkno;
2247  if (topparent != leafblkno)
2248  xlrec.topparent = topparent;
2249  else
2250  xlrec.topparent = InvalidBlockNumber;
2251 
2252  XLogBeginInsert();
2253  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2254  XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2255 
2256  page = BufferGetPage(leafbuf);
2257  opaque = BTPageGetOpaque(page);
2258  xlrec.leftblk = opaque->btpo_prev;
2259  xlrec.rightblk = opaque->btpo_next;
2260 
2261  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
2262 
2263  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2264 
2265  page = BufferGetPage(subtreeparent);
2266  PageSetLSN(page, recptr);
2267  page = BufferGetPage(leafbuf);
2268  PageSetLSN(page, recptr);
2269  }
2270 
2271  END_CRIT_SECTION();
2272 
2273  _bt_relbuf(rel, subtreeparent);
2274  return true;
2275 }
2276 
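/*
 * Illustrative standalone sketch (not part of nbtpage.c): the parent-page
 * update performed in _bt_mark_page_halfdead(), reduced to a plain array.
 * The separator key that pointed to the to-be-deleted child keeps its
 * position but is given the right sibling's downlink, and the right
 * sibling's own pivot entry is then removed -- the key space "moves right".
 * ToyPivot and toy_remove_downlink() are hypothetical names.
 */
#include <stdio.h>
#include <string.h>

typedef struct ToyPivot
{
	int			sepkey;		/* separator key */
	unsigned	downlink;	/* child block number */
} ToyPivot;

/*
 * 'pos' is the entry whose downlink points to the doomed child; pos + 1 must
 * be the entry for its right sibling.  Returns the new entry count.
 */
static int
toy_remove_downlink(ToyPivot *pivots, int npivots, int pos)
{
	pivots[pos].downlink = pivots[pos + 1].downlink;
	memmove(&pivots[pos + 1], &pivots[pos + 2],
			(npivots - pos - 2) * sizeof(ToyPivot));
	return npivots - 1;
}

int
main(void)
{
	ToyPivot	parent[] = {{10, 3}, {20, 4}, {30, 5}};
	int			n = toy_remove_downlink(parent, 3, 1);	/* drop child block 4 */

	for (int i = 0; i < n; i++)
		printf("sepkey %d -> block %u\n", parent[i].sepkey, parent[i].downlink);
	/* keys that used to route to block 4 now route to block 5 */
	return 0;
}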
2277 /*
2278  * Second stage of page deletion.
2279  *
2280  * Unlinks a single page (in the subtree undergoing deletion) from its
2281  * siblings. Also marks the page deleted.
2282  *
2283  * To get rid of the whole subtree, including the leaf page itself, call here
2284  * until the leaf page is deleted. The original "top parent" established in
2285  * the first stage of deletion is deleted in the first call here, while the
2286  * leaf page is deleted in the last call here. Note that the leaf page itself
2287  * is often the initial top parent page.
2288  *
2289  * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2290  * the right sibling of the current target page is empty, *rightsib_empty is
2291  * set to true, allowing caller to delete the target's right sibling page in
2292  * passing. Note that *rightsib_empty is only actually used by caller when
2293  * target page is leafbuf, following last call here for leafbuf/the subtree
2294  * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2295  * consistent.)
2296  *
2297  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2298  * On success exit, we'll be holding pin and write lock. On failure exit,
2299  * we'll release both pin and lock before returning (we define it that way
2300  * to avoid having to reacquire a lock we already released).
2301  */
2302 static bool
2303 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2304  bool *rightsib_empty, BTVacState *vstate)
2305 {
2306  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2307  IndexBulkDeleteResult *stats = vstate->stats;
2308  BlockNumber leafleftsib;
2309  BlockNumber leafrightsib;
2310  BlockNumber target;
2311  BlockNumber leftsib;
2312  BlockNumber rightsib;
2313  Buffer lbuf = InvalidBuffer;
2314  Buffer buf;
2315  Buffer rbuf;
2316  Buffer metabuf = InvalidBuffer;
2317  Page metapg = NULL;
2318  BTMetaPageData *metad = NULL;
2319  ItemId itemid;
2320  Page page;
2321  BTPageOpaque opaque;
2322  FullTransactionId safexid;
2323  bool rightsib_is_rightmost;
2324  uint32 targetlevel;
2325  IndexTuple leafhikey;
2326  BlockNumber leaftopparent;
2327 
2328  page = BufferGetPage(leafbuf);
2329  opaque = BTPageGetOpaque(page);
2330 
2331  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2332 
2333  /*
2334  * Remember some information about the leaf page.
2335  */
2336  itemid = PageGetItemId(page, P_HIKEY);
2337  leafhikey = (IndexTuple) PageGetItem(page, itemid);
2338  target = BTreeTupleGetTopParent(leafhikey);
2339  leafleftsib = opaque->btpo_prev;
2340  leafrightsib = opaque->btpo_next;
2341 
2342  _bt_unlockbuf(rel, leafbuf);
2343 
2344  /*
2345  * Check here, as calling loops will have locks held, preventing
2346  * interrupts from being processed.
2347  */
2348  CHECK_FOR_INTERRUPTS();
2349 
2350  /* Unlink the current top parent of the subtree */
2351  if (!BlockNumberIsValid(target))
2352  {
2353  /* Target is leaf page (or leaf page is top parent, if you prefer) */
2354  target = leafblkno;
2355 
2356  buf = leafbuf;
2357  leftsib = leafleftsib;
2358  targetlevel = 0;
2359  }
2360  else
2361  {
2362  /* Target is the internal page taken from leaf's top parent link */
2363  Assert(target != leafblkno);
2364 
2365  /* Fetch the block number of the target's left sibling */
2366  buf = _bt_getbuf(rel, target, BT_READ);
2367  page = BufferGetPage(buf);
2368  opaque = BTPageGetOpaque(page);
2369  leftsib = opaque->btpo_prev;
2370  targetlevel = opaque->btpo_level;
2371  Assert(targetlevel > 0);
2372 
2373  /*
2374  * To avoid deadlocks, we'd better drop the target page lock before
2375  * going further.
2376  */
2377  _bt_unlockbuf(rel, buf);
2378  }
2379 
2380  /*
2381  * We have to lock the pages we need to modify in the standard order:
2382  * moving right, then up. Else we will deadlock against other writers.
2383  *
2384  * So, first lock the leaf page, if it's not the target. Then find and
2385  * write-lock the current left sibling of the target page. The sibling
2386  * that was current a moment ago could have split, so we may have to move
2387  * right.
2388  */
2389  if (target != leafblkno)
2390  _bt_lockbuf(rel, leafbuf, BT_WRITE);
2391  if (leftsib != P_NONE)
2392  {
2393  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2394  page = BufferGetPage(lbuf);
2395  opaque = BTPageGetOpaque(page);
2396  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2397  {
2398  bool leftsibvalid = true;
2399 
2400  /*
2401  * Before we follow the link from the page that was the left
2402  * sibling mere moments ago, validate its right link. This
2403  * reduces the opportunities for the loop to fail to ever make any
2404  * progress in the presence of index corruption.
2405  *
2406  * Note: we rely on the assumption that there can only be one
2407  * vacuum process running at a time (against the same index).
2408  */
2409  if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2410  leftsib == opaque->btpo_next)
2411  leftsibvalid = false;
2412 
2413  leftsib = opaque->btpo_next;
2414  _bt_relbuf(rel, lbuf);
2415 
2416  if (!leftsibvalid)
2417  {
2418  if (target != leafblkno)
2419  {
2420  /* we have only a pin on target, but pin+lock on leafbuf */
2421  ReleaseBuffer(buf);
2422  _bt_relbuf(rel, leafbuf);
2423  }
2424  else
2425  {
2426  /* we have only a pin on leafbuf */
2427  ReleaseBuffer(leafbuf);
2428  }
2429 
2430  ereport(LOG,
2431  (errcode(ERRCODE_INDEX_CORRUPTED),
2432  errmsg_internal("valid left sibling for deletion target could not be located: "
2433  "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
2434  leftsib, target, leafblkno, scanblkno,
2435  RelationGetRelationName(rel))));
2436 
2437  return false;
2438  }
2439 
2440  CHECK_FOR_INTERRUPTS();
2441 
2442  /* step right one page */
2443  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2444  page = BufferGetPage(lbuf);
2445  opaque = BTPageGetOpaque(page);
2446  }
2447  }
2448  else
2449  lbuf = InvalidBuffer;
2450 
2451  /* Next write-lock the target page itself */
2452  _bt_lockbuf(rel, buf, BT_WRITE);
2453  page = BufferGetPage(buf);
2454  opaque = BTPageGetOpaque(page);
2455 
2456  /*
2457  * Check page is still empty etc, else abandon deletion. This is just for
2458  * paranoia's sake; a half-dead page cannot resurrect because there can be
2459  * only one vacuum process running at a time.
2460  */
2461  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2462  elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2463  target, RelationGetRelationName(rel));
2464 
2465  if (opaque->btpo_prev != leftsib)
2466  ereport(ERROR,
2467  (errcode(ERRCODE_INDEX_CORRUPTED),
2468  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2469  leftsib, opaque->btpo_prev, target,
2470  RelationGetRelationName(rel))));
2471 
2472  if (target == leafblkno)
2473  {
2474  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2475  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2476  elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2477  target, RelationGetRelationName(rel));
2478 
2479  /* Leaf page is also target page: don't set leaftopparent */
2480  leaftopparent = InvalidBlockNumber;
2481  }
2482  else
2483  {
2484  IndexTuple finaldataitem;
2485 
2486  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2487  P_ISLEAF(opaque))
2488  elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2489  targetlevel, target, RelationGetRelationName(rel));
2490 
2491  /* Target is internal: set leaftopparent for next call here... */
2492  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2493  finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2494  leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2495  /* ...except when it would be a redundant pointer-to-self */
2496  if (leaftopparent == leafblkno)
2497  leaftopparent = InvalidBlockNumber;
2498  }
2499 
2500  /* No leaftopparent for level 0 (leaf page) or level 1 target */
2501  Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2502 
2503  /*
2504  * And next write-lock the (current) right sibling.
2505  */
2506  rightsib = opaque->btpo_next;
2507  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2508  page = BufferGetPage(rbuf);
2509  opaque = BTPageGetOpaque(page);
2510  if (opaque->btpo_prev != target)
2511  ereport(ERROR,
2512  (errcode(ERRCODE_INDEX_CORRUPTED),
2513  errmsg_internal("right sibling's left-link doesn't match: "
2514  "block %u links to %u instead of expected %u in index \"%s\"",
2515  rightsib, opaque->btpo_prev, target,
2516  RelationGetRelationName(rel))));
2517  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2518  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2519 
2520  /*
2521  * If we are deleting the next-to-last page on the target's level, then
2522  * the rightsib is a candidate to become the new fast root. (In theory, it
2523  * might be possible to push the fast root even further down, but the odds
2524  * of doing so are slim, and the locking considerations daunting.)
2525  *
2526  * We can safely acquire a lock on the metapage here --- see comments for
2527  * _bt_newroot().
2528  */
2529  if (leftsib == P_NONE && rightsib_is_rightmost)
2530  {
2531  page = BufferGetPage(rbuf);
2532  opaque = BTPageGetOpaque(page);
2533  if (P_RIGHTMOST(opaque))
2534  {
2535  /* rightsib will be the only one left on the level */
2536  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2537  metapg = BufferGetPage(metabuf);
2538  metad = BTPageGetMeta(metapg);
2539 
2540  /*
2541  * The expected case here is btm_fastlevel == targetlevel+1; if
2542  * the fastlevel is <= targetlevel, something is wrong, and we
2543  * choose to overwrite it to fix it.
2544  */
2545  if (metad->btm_fastlevel > targetlevel + 1)
2546  {
2547  /* no update wanted */
2548  _bt_relbuf(rel, metabuf);
2549  metabuf = InvalidBuffer;
2550  }
2551  }
2552  }
2553 
2554  /*
2555  * Here we begin doing the deletion.
2556  */
2557 
2558  /* No ereport(ERROR) until changes are logged */
2559  START_CRIT_SECTION();
2560 
2561  /*
2562  * Update siblings' side-links. Note the target page's side-links will
2563  * continue to point to the siblings. Asserts here are just rechecking
2564  * things we already verified above.
2565  */
2566  if (BufferIsValid(lbuf))
2567  {
2568  page = BufferGetPage(lbuf);
2569  opaque = BTPageGetOpaque(page);
2570  Assert(opaque->btpo_next == target);
2571  opaque->btpo_next = rightsib;
2572  }
2573  page = BufferGetPage(rbuf);
2574  opaque = BTPageGetOpaque(page);
2575  Assert(opaque->btpo_prev == target);
2576  opaque->btpo_prev = leftsib;
2577 
2578  /*
2579  * If we deleted a parent of the targeted leaf page, instead of the leaf
2580  * itself, update the leaf to point to the next remaining child in the
2581  * subtree.
2582  *
2583  * Note: We rely on the fact that a buffer pin on the leaf page has been
2584  * held since leafhikey was initialized. This is safe, though only
2585  * because the page was already half-dead at that point. The leaf page
2586  * cannot have been modified by any other backend during the period when
2587  * no lock was held.
2588  */
2589  if (target != leafblkno)
2590  BTreeTupleSetTopParent(leafhikey, leaftopparent);
2591 
2592  /*
2593  * Mark the page itself deleted. It can be recycled when all current
2594  * transactions are gone. Storing GetTopTransactionId() would work, but
2595  * we're in VACUUM and would not otherwise have an XID. Having already
2596  * updated links to the target, ReadNextFullTransactionId() suffices as an
2597  * upper bound. Any scan having retained a now-stale link is advertising
2598  * in its PGPROC an xmin less than or equal to the value we read here. It
2599  * will continue to do so, holding back the xmin horizon, for the duration
2600  * of that scan.
2601  */
2602  page = BufferGetPage(buf);
2603  opaque = BTPageGetOpaque(page);
2604  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2605 
2606  /*
2607  * Store upper bound XID that's used to determine when deleted page is no
2608  * longer needed as a tombstone
2609  */
2610  safexid = ReadNextFullTransactionId();
2611  BTPageSetDeleted(page, safexid);
2612  opaque->btpo_cycleid = 0;
2613 
2614  /* And update the metapage, if needed */
2615  if (BufferIsValid(metabuf))
2616  {
2617  /* upgrade metapage if needed */
2618  if (metad->btm_version < BTREE_NOVAC_VERSION)
2619  _bt_upgrademetapage(metapg);
2620  metad->btm_fastroot = rightsib;
2621  metad->btm_fastlevel = targetlevel;
2622  MarkBufferDirty(metabuf);
2623  }
2624 
2625  /* Must mark buffers dirty before XLogInsert */
2626  MarkBufferDirty(rbuf);
2627  MarkBufferDirty(buf);
2628  if (BufferIsValid(lbuf))
2629  MarkBufferDirty(lbuf);
2630  if (target != leafblkno)
2631  MarkBufferDirty(leafbuf);
2632 
2633  /* XLOG stuff */
2634  if (RelationNeedsWAL(rel))
2635  {
2636  xl_btree_unlink_page xlrec;
2637  xl_btree_metadata xlmeta;
2638  uint8 xlinfo;
2639  XLogRecPtr recptr;
2640 
2641  XLogBeginInsert();
2642 
2643  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2644  if (BufferIsValid(lbuf))
2645  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2646  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2647  if (target != leafblkno)
2648  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2649 
2650  /* information stored on the target/to-be-unlinked block */
2651  xlrec.leftsib = leftsib;
2652  xlrec.rightsib = rightsib;
2653  xlrec.level = targetlevel;
2654  xlrec.safexid = safexid;
2655 
2656  /* information needed to recreate the leaf block (if not the target) */
2657  xlrec.leafleftsib = leafleftsib;
2658  xlrec.leafrightsib = leafrightsib;
2659  xlrec.leaftopparent = leaftopparent;
2660 
2661  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2662 
2663  if (BufferIsValid(metabuf))
2664  {
2665  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2666 
2667  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2668  xlmeta.version = metad->btm_version;
2669  xlmeta.root = metad->btm_root;
2670  xlmeta.level = metad->btm_level;
2671  xlmeta.fastroot = metad->btm_fastroot;
2672  xlmeta.fastlevel = metad->btm_fastlevel;
2673  xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2674  xlmeta.allequalimage = metad->btm_allequalimage;
2675 
2676  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2677  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2678  }
2679  else
2680  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2681 
2682  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2683 
2684  if (BufferIsValid(metabuf))
2685  {
2686  PageSetLSN(metapg, recptr);
2687  }
2688  page = BufferGetPage(rbuf);
2689  PageSetLSN(page, recptr);
2690  page = BufferGetPage(buf);
2691  PageSetLSN(page, recptr);
2692  if (BufferIsValid(lbuf))
2693  {
2694  page = BufferGetPage(lbuf);
2695  PageSetLSN(page, recptr);
2696  }
2697  if (target != leafblkno)
2698  {
2699  page = BufferGetPage(leafbuf);
2700  PageSetLSN(page, recptr);
2701  }
2702  }
2703 
2704  END_CRIT_SECTION();
2705 
2706  /* release metapage */
2707  if (BufferIsValid(metabuf))
2708  _bt_relbuf(rel, metabuf);
2709 
2710  /* release siblings */
2711  if (BufferIsValid(lbuf))
2712  _bt_relbuf(rel, lbuf);
2713  _bt_relbuf(rel, rbuf);
2714 
2715  /* If the target is not leafbuf, we're done with it now -- release it */
2716  if (target != leafblkno)
2717  _bt_relbuf(rel, buf);
2718 
2719  /*
2720  * Maintain pages_newly_deleted, which is simply the number of pages
2721  * deleted by the ongoing VACUUM operation.
2722  *
2723  * Maintain pages_deleted in a way that takes into account how
2724  * btvacuumpage() will count deleted pages that have yet to become
2725  * scanblkno -- only count the page when it's not going to get that treatment
2726  * later on.
2727  */
2728  stats->pages_newly_deleted++;
2729  if (target <= scanblkno)
2730  stats->pages_deleted++;
2731 
2732  /*
2733  * Remember information about the target page (now a newly deleted page)
2734  * in dedicated vstate space for later. The page will be considered as a
2735  * candidate to place in the FSM at the end of the current btvacuumscan()
2736  * call.
2737  */
2738  _bt_pendingfsm_add(vstate, target, safexid);
2739 
2740  return true;
2741 }
2742 
2743 /*
2744  * Establish how tall the to-be-deleted subtree will be during the first stage
2745  * of page deletion.
2746  *
2747  * Caller's child argument is the block number of the page caller wants to
2748  * delete (this is leafbuf's block number, except when we're called
2749  * recursively). stack is a search stack leading to it. Note that we will
2750  * update the stack entry(s) to reflect current downlink positions --- this is
2751  * similar to the corresponding point in page split handling.
2752  *
2753  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2754  * false. Returns true on success, in which case caller can use certain
2755  * details established here to perform the first stage of deletion. This
2756  * function is the last point at which page deletion may be deemed unsafe
2757  * (barring index corruption, or unexpected concurrent page deletions).
2758  *
2759  * We write lock the parent of the root of the to-be-deleted subtree for
2760  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2761  * caller). Caller will have to remove a downlink from *subtreeparent. We
2762  * also set a *subtreeparent offset number in *poffset, to indicate the
2763  * location of the pivot tuple that contains the relevant downlink.
2764  *
2765  * The root of the to-be-deleted subtree is called the "top parent". Note
2766  * that the leafbuf page is often the final "top parent" page (you can think
2767  * of the leafbuf page as a degenerate single page subtree when that happens).
2768  * Caller should initialize *topparent to the target leafbuf page block number
2769  * (while *topparentrightsib should be set to leafbuf's right sibling block
2770  * number). We will update *topparent (and *topparentrightsib) for caller
2771  * here, though only when it turns out that caller will delete at least one
2772  * internal page (i.e. only when caller needs to store a valid link to the top
2773  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2774  */
2775 static bool
2776 _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
2777  Buffer *subtreeparent, OffsetNumber *poffset,
2778  BlockNumber *topparent, BlockNumber *topparentrightsib)
2779 {
2780  BlockNumber parent,
2781  leftsibparent;
2782  OffsetNumber parentoffset,
2783  maxoff;
2784  Buffer pbuf;
2785  Page page;
2786  BTPageOpaque opaque;
2787 
2788  /*
2789  * Locate the pivot tuple whose downlink points to "child". Write lock
2790  * the parent page itself.
2791  */
2792  pbuf = _bt_getstackbuf(rel, stack, child);
2793  if (pbuf == InvalidBuffer)
2794  {
2795  /*
2796  * Failed to "re-find" a pivot tuple whose downlink matched our child
2797  * block number on the parent level -- the index must be corrupt.
2798  * Don't even try to delete the leafbuf subtree. Just report the
2799  * issue and press on with vacuuming the index.
2800  *
2801  * Note: _bt_getstackbuf() recovers from concurrent page splits that
2802  * take place on the parent level. Its approach is a near-exhaustive
2803  * linear search. This also gives it a surprisingly good chance of
2804  * recovering in the event of a buggy or inconsistent opclass. But we
2805  * don't rely on that here.
2806  */
2807  ereport(LOG,
2808  (errcode(ERRCODE_INDEX_CORRUPTED),
2809  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2810  RelationGetRelationName(rel), child)));
2811  return false;
2812  }
2813 
2814  parent = stack->bts_blkno;
2815  parentoffset = stack->bts_offset;
2816 
2817  page = BufferGetPage(pbuf);
2818  opaque = BTPageGetOpaque(page);
2819  maxoff = PageGetMaxOffsetNumber(page);
2820  leftsibparent = opaque->btpo_prev;
2821 
2822  /*
2823  * _bt_getstackbuf() completes page splits on returned parent buffer when
2824  * required.
2825  *
2826  * In general it's a bad idea for VACUUM to use up more disk space, which
2827  * is why page deletion does not finish incomplete page splits most of the
2828  * time. We allow this limited exception because the risk is much lower,
2829  * and the potential downside of not proceeding is much higher: A single
2830  * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2831  * prevent us from deleting hundreds of empty leaf pages from one level
2832  * down.
2833  */
2834  Assert(!P_INCOMPLETE_SPLIT(opaque));
2835 
2836  if (parentoffset < maxoff)
2837  {
2838  /*
2839  * Child is not the rightmost child in parent, so it's safe to delete
2840  * the subtree whose root/topparent is child page
2841  */
2842  *subtreeparent = pbuf;
2843  *poffset = parentoffset;
2844  return true;
2845  }
2846 
2847  /*
2848  * Child is the rightmost child of parent.
2849  *
2850  * Since it's the rightmost child of parent, deleting the child (or
2851  * deleting the subtree whose root/topparent is the child page) is only
2852  * safe when it's also possible to delete the parent.
2853  */
2854  Assert(parentoffset == maxoff);
2855  if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2856  {
2857  /*
2858  * Child isn't parent's only child, or parent is rightmost on its
2859  * entire level. Definitely cannot delete any pages.
2860  */
2861  _bt_relbuf(rel, pbuf);
2862  return false;
2863  }
2864 
2865  /*
2866  * Now make sure that the parent deletion is itself safe by examining the
2867  * child's grandparent page. Recurse, passing the parent page as the
2868  * child page (child's grandparent is the parent on the next level up). If
2869  * parent deletion is unsafe, then child deletion must also be unsafe (in
2870  * which case caller cannot delete any pages at all).
2871  */
2872  *topparent = parent;
2873  *topparentrightsib = opaque->btpo_next;
2874 
2875  /*
2876  * Release lock on parent before recursing.
2877  *
2878  * It's OK to release page locks on parent before recursive call locks
2879  * grandparent. An internal page can only acquire an entry if the child
2880  * is split, but that cannot happen as long as we still hold a lock on the
2881  * leafbuf page.
2882  */
2883  _bt_relbuf(rel, pbuf);
2884 
2885  /*
2886  * Before recursing, check that the left sibling of parent (if any) is not
2887  * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2888  * parent lock).
2889  *
2890  * Note: We deliberately avoid completing incomplete splits here.
2891  */
2892  if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2893  return false;
2894 
2895  /* Recurse to examine child page's grandparent page */
2896  return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2897  subtreeparent, poffset,
2898  topparent, topparentrightsib);
2899 }
2900 
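/*
 * Illustrative standalone sketch (not part of nbtpage.c): the decision made
 * by _bt_lock_subtree_parent(), stripped of locking and buffers.  A child
 * may be deleted if it is not its parent's rightmost downlink; a rightmost
 * child may only go if it is the parent's sole downlink, the parent is not
 * the rightmost page of its level, and the parent can (recursively) be
 * deleted too.  ToyLevel and toy_subtree_deletable() are hypothetical names.
 */
#include <stdio.h>

typedef struct ToyLevel
{
	int			pos;		/* 0-based position of downlink within parent */
	int			nentries;	/* number of downlinks in that parent */
	int			rightmost;	/* is that parent rightmost on its level? */
} ToyLevel;

/* levels[0] describes the leaf's parent, levels[1] the grandparent, ... */
static int
toy_subtree_deletable(const ToyLevel *levels, int nlevels, int i)
{
	if (i >= nlevels)
		return 0;				/* ran out of ancestors: give up */
	if (levels[i].pos < levels[i].nentries - 1)
		return 1;				/* not the rightmost downlink: safe */
	if (levels[i].nentries > 1 || levels[i].rightmost)
		return 0;				/* rightmost but not only child, or parent
								 * is rightmost on its own level */
	return toy_subtree_deletable(levels, nlevels, i + 1);	/* climb up */
}

int
main(void)
{
	ToyLevel	a[] = {{2, 5, 0}};				/* middle child: deletable */
	ToyLevel	b[] = {{4, 5, 0}};				/* rightmost child: not */
	ToyLevel	c[] = {{0, 1, 0}, {1, 3, 0}};	/* only child, grandparent ok */

	printf("%d %d %d\n",
		   toy_subtree_deletable(a, 1, 0),
		   toy_subtree_deletable(b, 1, 0),
		   toy_subtree_deletable(c, 2, 0));		/* prints: 1 0 1 */
	return 0;
}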
2901 /*
2902  * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
2903  * optimization.
2904  *
2905  * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
2906  * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
2907  *
2908  * We expect to allocate memory inside VACUUM's top-level memory context here.
2909  * The working buffer is subject to a limit based on work_mem. Our strategy
2910  * when the array can no longer grow within the bounds of that limit is to
2911  * stop saving additional newly deleted pages, while proceeding as usual with
2912  * the pages that we can fit.
2913  */
2914 void
2915 _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
2916 {
2917  int64 maxbufsize;
2918 
2919  /*
2920  * Don't bother with optimization in cleanup-only case -- we don't expect
2921  * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2922  * can only take place because this optimization didn't work out during
2923  * the last VACUUM.
2924  */
2925  if (cleanuponly)
2926  return;
2927 
2928  /*
2929  * Cap maximum size of array so that we always respect work_mem. Avoid
2930  * int overflow here.
2931  */
2932  vstate->bufsize = 256;
2933  maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
2934  maxbufsize = Min(maxbufsize, INT_MAX);
2935  maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2936  /* Stay sane with small work_mem */
2937  maxbufsize = Max(maxbufsize, vstate->bufsize);
2938  vstate->maxbufsize = maxbufsize;
2939 
2940  /* Allocate buffer, indicate that there are currently 0 pending pages */
2941  vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2942  vstate->npendingpages = 0;
2943 }
2944 
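/*
 * Illustrative standalone sketch (not part of nbtpage.c): the work_mem cap
 * computed in _bt_pendingfsm_init(), with hypothetical numbers.  With
 * work_mem = 4096 kB and a 16-byte pending-page entry, at most
 * 4096 * 1024 / 16 = 262144 entries can be remembered, further clamped by
 * INT_MAX and by a ~1 GB allocation ceiling, and never below the initial
 * 256-entry buffer.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t		work_mem_kb = 4096;			/* hypothetical work_mem */
	int64_t		entry_size = 16;			/* hypothetical sizeof(entry) */
	int64_t		max_alloc = 0x3fffffff;		/* ~1 GB allocation ceiling */
	int64_t		maxbufsize;

	maxbufsize = (work_mem_kb * 1024) / entry_size;
	if (maxbufsize > INT_MAX)
		maxbufsize = INT_MAX;
	if (maxbufsize > max_alloc / entry_size)
		maxbufsize = max_alloc / entry_size;
	if (maxbufsize < 256)					/* stay sane with small work_mem */
		maxbufsize = 256;

	printf("max pending pages: %lld\n", (long long) maxbufsize);	/* 262144 */
	return 0;
}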
2945 /*
2946  * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
2947  * the ongoing VACUUM operation) into the free space map -- though only when
2948  * it is actually safe to do so by now.
2949  *
2950  * Called at the end of a btvacuumscan(), just before free space map vacuuming
2951  * takes place.
2952  *
2953  * Frees memory allocated by _bt_pendingfsm_init(), if any.
2954  */
2955 void
2956 _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
2957 {
2958  IndexBulkDeleteResult *stats = vstate->stats;
2959 
2960  Assert(stats->pages_newly_deleted >= vstate->npendingpages);
2961 
2962  if (vstate->npendingpages == 0)
2963  {
2964  /* Just free memory when nothing to do */
2965  if (vstate->pendingpages)
2966  pfree(vstate->pendingpages);
2967 
2968  return;
2969  }
2970 
2971 #ifdef DEBUG_BTREE_PENDING_FSM
2972 
2973  /*
2974  * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
2975  * placing pending pages in the FSM. Note that the optimization will
2976  * never be effective without some other backend concurrently consuming an
2977  * XID.
2978  */
2979  pg_usleep(5000000L);
2980 #endif
2981 
2982  /*
2983  * Recompute VACUUM XID boundaries.
2984  *
2985  * We don't actually care about the oldest non-removable XID. Computing
2986  * the oldest such XID has a useful side-effect that we rely on: it
2987  * forcibly updates the XID horizon state for this backend. This step is
2988  * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
2989  * that it is now safe to recycle newly deleted pages without this step.
2990  */
2991  GetOldestNonRemovableTransactionId(NULL);
2992 
2993  for (int i = 0; i < vstate->npendingpages; i++)
2994  {
2995  BlockNumber target = vstate->pendingpages[i].target;
2996  FullTransactionId safexid = vstate->pendingpages[i].safexid;
2997 
2998  /*
2999  * Do the equivalent of checking BTPageIsRecyclable(), but without
3000  * accessing the page again a second time.
3001  *
3002  * Give up on finding the first non-recyclable page -- all later pages
3003  * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3004  * to the array in safexid order.
3005  */
3006  if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
3007  break;
3008 
3009  RecordFreeIndexPage(rel, target);
3010  stats->pages_free++;
3011  }
3012 
3013  pfree(vstate->pendingpages);
3014 }
3015 
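/*
 * Illustrative standalone sketch (not part of nbtpage.c): why the loop in
 * _bt_pendingfsm_finalize() may stop at the first non-recyclable entry.
 * Pages are appended in ascending safexid order, so once one entry's safexid
 * is not yet behind the oldest still-running transaction, no later entry can
 * be recyclable either.  toy_recyclable() is a hypothetical stand-in for
 * GlobalVisCheckRemovableFullXid().
 */
#include <stdint.h>
#include <stdio.h>

static int
toy_recyclable(uint64_t safexid, uint64_t oldest_running_xid)
{
	/* safe once every transaction that could still need the page is gone */
	return safexid < oldest_running_xid;
}

int
main(void)
{
	uint64_t	pending_safexid[] = {100, 150, 210, 300};	/* ascending */
	uint64_t	oldest_running = 200;
	int			freed = 0;

	for (int i = 0; i < 4; i++)
	{
		if (!toy_recyclable(pending_safexid[i], oldest_running))
			break;				/* every later entry is at least as new */
		freed++;				/* would call RecordFreeIndexPage() here */
	}
	printf("pages placed in FSM: %d\n", freed);		/* prints 2 */
	return 0;
}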
3016 /*
3017  * Maintain array of pages that were deleted during current btvacuumscan()
3018  * call, for use in _bt_pendingfsm_finalize()
3019  */
3020 static void
3021 _bt_pendingfsm_add(BTVacState *vstate,
3022  BlockNumber target,
3023  FullTransactionId safexid)
3024 {
3025  Assert(vstate->npendingpages <= vstate->bufsize);
3026  Assert(vstate->bufsize <= vstate->maxbufsize);
3027 
3028 #ifdef USE_ASSERT_CHECKING
3029 
3030  /*
3031  * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3032  * array will always be in safexid order (since that is the order that we
3033  * save them in here)
3034  */
3035  if (vstate->npendingpages > 0)
3036  {
3037  FullTransactionId lastsafexid =
3038  vstate->pendingpages[vstate->npendingpages - 1].safexid;
3039 
3040  Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3041  }
3042 #endif
3043 
3044  /*
3045  * If the temp buffer reaches maxbufsize/work_mem capacity then we discard
3046  * information about this page.
3047  *
3048  * Note that this also covers the case where we opted to not use the
3049  * optimization in _bt_pendingfsm_init().
3050  */
3051  if (vstate->npendingpages == vstate->maxbufsize)
3052  return;
3053 
3054  /* Consider enlarging buffer */
3055  if (vstate->npendingpages == vstate->bufsize)
3056  {
3057  int newbufsize = vstate->bufsize * 2;
3058 
3059  /* Respect work_mem */
3060  if (newbufsize > vstate->maxbufsize)
3061  newbufsize = vstate->maxbufsize;
3062 
3063  vstate->bufsize = newbufsize;
3064  vstate->pendingpages =
3065  repalloc(vstate->pendingpages,
3066  sizeof(BTPendingFSM) * vstate->bufsize);
3067  }
3068 
3069  /* Save metadata for newly deleted page */
3070  vstate->pendingpages[vstate->npendingpages].target = target;
3071  vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3072  vstate->npendingpages++;
3073 }
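/*
 * Illustrative standalone sketch (not part of nbtpage.c): the append
 * strategy used by _bt_pendingfsm_add() -- double the buffer when it fills
 * up, never grow past a fixed cap, and silently forget entries once the cap
 * is hit.  Standalone C with hypothetical names; the backend uses repalloc()
 * and a work_mem-derived cap instead.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct ToyPending
{
	int		   *items;
	int			nitems;
	int			bufsize;
	int			maxbufsize;
} ToyPending;

static void
toy_pending_add(ToyPending *p, int item)
{
	if (p->nitems == p->maxbufsize)
		return;					/* cap reached: forget this entry */

	if (p->nitems == p->bufsize)
	{
		int			newbufsize = p->bufsize * 2;

		if (newbufsize > p->maxbufsize)
			newbufsize = p->maxbufsize;
		p->items = realloc(p->items, sizeof(int) * newbufsize);
		p->bufsize = newbufsize;
	}
	p->items[p->nitems++] = item;
}

int
main(void)
{
	ToyPending	p = {malloc(sizeof(int) * 2), 0, 2, 5};

	for (int i = 0; i < 10; i++)
		toy_pending_add(&p, i);
	printf("kept %d of 10 entries\n", p.nitems);	/* prints 5 */
	free(p.items);
	return 0;
}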
Definition: nbtree.h:493
static void BTPageSetDeleted(Page page, FullTransactionId safexid)
Definition: nbtree.h:239
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define BT_WRITE
Definition: nbtree.h:721
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:96
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:228
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:260
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
#define SizeOfBtreeDelete
Definition: nbtxlog.h:245
#define SizeOfBtreeMarkPageHalfDead
Definition: nbtxlog.h:283
#define XLOG_BTREE_UNLINK_PAGE
Definition: nbtxlog.h:35
#define XLOG_BTREE_UNLINK_PAGE_META
Definition: nbtxlog.h:36
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:339
#define XLOG_BTREE_MARK_PAGE_HALFDEAD
Definition: nbtxlog.h:38
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeUnlinkPage
Definition: nbtxlog.h:320
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:190
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:34
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:67
#define qsort(a, b, c, d)
Definition: port.h:445
void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3163
short access
Definition: preproc-type.c:36
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:2034
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4300
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:648
#define RelationGetRelationName(relation)
Definition: rel.h:537
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
void pg_usleep(long microsec)
Definition: signal.c:53
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105
BlockNumber btm_root
Definition: nbtree.h:107
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint16 btpo_flags
Definition: nbtree.h:67
uint32 btpo_level
Definition: nbtree.h:66
BTCycleId btpo_cycleid
Definition: nbtree.h:68
FullTransactionId safexid
Definition: nbtree.h:328
BlockNumber target
Definition: nbtree.h:327
bool pivotsearch
Definition: nbtree.h:796
BlockNumber bts_blkno
Definition: nbtree.h:735
struct BTStackData * bts_parent
Definition: nbtree.h:737
OffsetNumber bts_offset
Definition: nbtree.h:736
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:917
uint16 ndeletedtids
Definition: nbtree.h:916
IndexTuple itup
Definition: nbtree.h:912
OffsetNumber updatedoffset
Definition: nbtree.h:913
BlockNumber pages_deleted
Definition: genam.h:81
BlockNumber pages_newly_deleted
Definition: genam.h:80
BlockNumber pages_free
Definition: genam.h:82
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
void * rd_amcache
Definition: rel.h:227
MemoryContext rd_indexcxt
Definition: rel.h:202
RelFileLocator rd_locator
Definition: rel.h:56
TM_IndexStatus * status
Definition: tableam.h:247
TM_IndexDelete * deltids
Definition: tableam.h:246
ItemPointerData tid
Definition: tableam.h:205
bool knowndeletable
Definition: tableam.h:212
OffsetNumber idxoffnum
Definition: tableam.h:211
TransactionId snapshotConflictHorizon
Definition: nbtxlog.h:232
uint16 ndeleted
Definition: nbtxlog.h:233
uint16 nupdated
Definition: nbtxlog.h:234
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:336
BlockNumber rootblk
Definition: nbtxlog.h:335
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
uint16 ndeletedtids
Definition: nbtxlog.h:255
uint16 ndeleted
Definition: nbtxlog.h:220
uint16 nupdated
Definition: nbtxlog.h:221
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1353
#define InvalidTransactionId
Definition: transam.h:31
#define FullTransactionIdFollowsOrEquals(a, b)
Definition: transam.h:54
FullTransactionId ReadNextFullTransactionId(void)
Definition: varsup.c:261
#define XLogStandbyInfoActive()
Definition: xlog.h:118
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:351
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:451
void XLogRegisterBufData(uint8 block_id, char *data, uint32 len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33