/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *    BTree-specific page management code for the Postgres btree access
 *    method.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtpage.c
 *
 * NOTES
 *    Postgres btree pages look like ordinary relation pages.  The opaque
 *    data at high addresses includes pointers to left and right siblings
 *    and flag data describing page state.  The first page in a btree, page
 *    zero, is special -- it stores meta-information describing the tree.
 *    Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/memdebug.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               TransactionId latestRemovedXid);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                                     OffsetNumber *deletable, int ndeletable);
static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
                                   BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     BlockNumber scanblkno,
                                     bool *rightsib_empty,
                                     TransactionId *oldestBtpoXact,
                                     uint32 *ndeleted);
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
                                    BTStack stack,
                                    Buffer *subtreeparent,
                                    OffsetNumber *poffset,
                                    BlockNumber *topparent,
                                    BlockNumber *topparentrightsib);

/*
 *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                 bool allequalimage)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    metad->btm_allequalimage = allequalimage;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

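/*
 * Illustrative sketch (not part of the original file): a typical caller
 * builds the metapage image in local memory and only then writes and
 * WAL-logs it, roughly as the empty-index build path does.  The function
 * name below is hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static void
example_build_empty_metapage(Relation index)
{
    Page        metapage;

    /* Construct a metapage image with no root page yet (P_NONE, level 0) */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0,
                     _bt_allequalimage(index, false));

    /* Caller would now write the image to block 0 and WAL-log it */
    pfree(metapage);
}
#endif
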
/*
 *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
 *		3, the last version that can be updated without broadly affecting
 *		on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 *
 *		This routine performs a purely in-memory image upgrade.  Caller is
 *		responsible for locking, WAL-logging etc.
 */
void
_bt_upgrademetapage(Page page)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    metad = BTPageGetMeta(page);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* It must really be a meta page of an upgradable version */
    Assert(metaopaque->btpo_flags & BTP_META);
    Assert(metad->btm_version < BTREE_NOVAC_VERSION);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);

    /* Set version number and fill extra fields added into version 3 */
    metad->btm_version = BTREE_NOVAC_VERSION;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    /* Only a REINDEX can set this field */
    Assert(!metad->btm_allequalimage);
    metad->btm_allequalimage = false;

    /* Adjust pd_lower (see _bt_initmetapage() for details) */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 *	Get metadata from share-locked buffer containing metapage, while performing
 *	standard sanity checks.
 *
 *	Callers that cache data returned here in local cache should note that an
 *	on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 *	and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page        metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    return metad;
}

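/*
 * Illustrative sketch (not part of the original file): the standard caller
 * pattern is to pin and share-lock the metapage with _bt_getbuf(), read the
 * fields of interest through _bt_getmeta(), and copy them out before
 * releasing the buffer.  The function name is hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static BlockNumber
example_read_fastroot(Relation rel)
{
    Buffer      metabuf;
    BTMetaPageData *metad;
    BlockNumber fastroot;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);
    fastroot = metad->btm_fastroot; /* copy out while lock is held */
    _bt_relbuf(rel, metabuf);

    return fastroot;
}
#endif
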
/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *		the metapage.
 *
 *		This routine checks whether the provided cleanup-related information
 *		matches what is already written in the metapage.  On mismatch, the
 *		metapage is overwritten.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
                             float8 numHeapTuples)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;
    bool        needsRewrite = false;
    XLogRecPtr  recptr;

    /* read the metapage and check if it needs rewrite */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /* outdated version of metapage always needs rewrite */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        needsRewrite = true;
    else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
             metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
        needsRewrite = true;

    if (!needsRewrite)
    {
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    _bt_unlockbuf(rel, metabuf);
    _bt_lockbuf(rel, metabuf, BT_WRITE);

    START_CRIT_SECTION();

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_oldest_btpo_xact = oldestBtpoXact;
    metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.oldest_btpo_xact = oldestBtpoXact;
        md.last_cleanup_num_heap_tuples = numHeapTuples;
        md.allequalimage = metad->btm_allequalimage;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();

    _bt_relbuf(rel, metabuf);
}

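/*
 * Illustrative sketch (not part of the original file): VACUUM's cleanup
 * phase is the expected caller, passing along the oldest btpo.xact observed
 * among deleted pages and the heap tuple count from the last scan, along
 * the lines of
 *
 *     _bt_update_meta_cleanup_info(info->index, vstate.oldestBtpoXact,
 *                                  info->num_heap_tuples);
 *
 * (The variable sources shown are assumptions about the caller's locals.)
 */
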
/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(!metad->btm_allequalimage ||
               metad->btm_version > BTREE_NOVAC_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page        metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        _bt_unlockbuf(rel, metabuf);
        _bt_lockbuf(rel, metabuf, BT_WRITE);

        /*
         * Race condition:  if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_oldest_btpo_xact = InvalidTransactionId;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.oldest_btpo_xact = InvalidTransactionId;
            md.last_cleanup_num_heap_tuples = -1.0;
            md.allequalimage = metad->btm_allequalimage;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        _bt_unlockbuf(rel, rootbuf);
        _bt_lockbuf(rel, rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}

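/*
 * Illustrative sketch (not part of the original file): read-only callers
 * must be prepared for an empty index, where BT_READ returns InvalidBuffer.
 * The function name is hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static bool
example_index_has_root(Relation rel)
{
    Buffer      rootbuf;

    rootbuf = _bt_getroot(rel, BT_READ);
    if (!BufferIsValid(rootbuf))
        return false;           /* empty index: no root page yet */

    /* ... a real caller would descend from rootbuf here ... */
    _bt_relbuf(rel, rootbuf);
    return true;
}
#endif
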
/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 *	_bt_getrootheight() -- Get the height of the btree search tree.
 *
 *		We return the level (counting from zero) of the current fast root.
 *		This represents the number of tree levels we'd have to descend through
 *		to start any btree index search.
 *
 *		This is used by the planner for cost-estimation purposes.  Since it's
 *		only an estimate, slightly-stale data is fine, hence we don't worry
 *		about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}

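/*
 * Illustrative sketch (not part of the original file): the planner consults
 * this when assembling per-index statistics, along the lines of
 *
 *     if (info->relam == BTREE_AM_OID)
 *         info->tree_height = _bt_getrootheight(indexRelation);
 *
 * (The surrounding names are assumptions about the planner's code, shown
 * only to indicate the intended call site.)
 */
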
/*
 *	_bt_metaversion() -- Get version/status info from metapage.
 *
 *		Sets caller's *heapkeyspace and *allequalimage arguments using data
 *		from the B-Tree metapage (could be locally-cached version).  This
 *		information needs to be stashed in insertion scankey, so we provide a
 *		single function that fetches both at once.
 *
 *		This is used to determine the rules that must be used to descend a
 *		btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *		pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *		performance when inserting a new BTScanInsert-wise duplicate tuple
 *		among many leaf pages already full of such duplicates.
 *
 *		Also sets allequalimage field, which indicates whether or not it is
 *		safe to apply deduplication.  We rely on the assumption that
 *		btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 *		pg_upgrade'd from Postgres 12.
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}

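/*
 * Illustrative sketch (not part of the original file): insertion scankey
 * construction is the typical consumer, stashing both flags in the key
 * being built, roughly
 *
 *     _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
 *
 * (Here "key" stands in for a BTScanInsert under construction.)
 */
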
/*
 *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page.  This record only exists to provide a
     * conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}

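/*
 * Illustrative sketch (not part of the original file): on a standby, the
 * REDO routine for this record type resolves the conflict instead of
 * changing any page, roughly
 *
 *     if (InHotStandby)
 *         ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
 *                                             xlrec->node);
 *
 * (A sketch of the replay side, included to show why no buffer needs to be
 * registered with the record.)
 */
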
/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		The general rule in nbtree is that it's never okay to access a
 *		page without holding both a buffer pin and a buffer lock on
 *		the page's buffer.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case),
 *		and perform Valgrind client requests that help Valgrind detect
 *		unsafe page accesses.
 *
 *		Note: raw LockBuffer() calls are disallowed in nbtree; all
 *		buffer lock requests need to go through wrapper functions such
 *		as _bt_lockbuf().
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        _bt_lockbuf(rel, buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (_bt_conditionallockbuf(rel, buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than btpo.xact.  This can only apply if the page does
                     * have a valid btpo.xact value, ie not if it's new.  (We
                     * must check that because an all-zero page has no special
                     * space.)
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
                        !PageIsNew(page))
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        _bt_lockbuf(rel, buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}

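/*
 * Illustrative sketch (not part of the original file): the canonical
 * pin-lock-use-release cycle that every nbtree page access follows.  The
 * function name is hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static void
example_read_page_opaque(Relation rel, BlockNumber blkno,
                         BTPageOpaqueData *out)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

    buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin + share lock */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    *out = *opaque;             /* copy while the lock is still held */
    _bt_relbuf(rel, buf);       /* drop lock and pin together */
}
#endif
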
/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        _bt_unlockbuf(rel, obuf);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    _bt_lockbuf(rel, buf, access);

    _bt_checkpage(rel, buf);
    return buf;
}

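/*
 * Illustrative sketch (not part of the original file): the typical use is a
 * rightward walk along one level, releasing each page before locking the
 * next (the same pattern _bt_getroot uses above).  The function name is
 * hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static Buffer
example_step_to_rightmost(Relation rel, Buffer buf)
{
    for (;;)
    {
        Page        page = BufferGetPage(buf);
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        if (P_RIGHTMOST(opaque))
            break;
        /* release current page, then pin and lock its right sibling */
        buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);
    }
    return buf;                 /* still pinned and read-locked */
}
#endif
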
/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    _bt_unlockbuf(rel, buf);
    ReleaseBuffer(buf);
}

/*
 *	_bt_lockbuf() -- lock a pinned buffer.
 *
 * Lock is acquired without acquiring another pin.  This is like a raw
 * LockBuffer() call, but performs extra steps needed by Valgrind.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
void
_bt_lockbuf(Relation rel, Buffer buf, int access)
{
    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, access);

    /*
     * It doesn't matter that _bt_unlockbuf() won't get called in the
     * event of an nbtree error (e.g. a unique violation error).  That
     * won't cause Valgrind false positives.
     *
     * The nbtree client requests are superimposed on top of the
     * bufmgr.c buffer pin client requests.  In the event of an nbtree
     * error the buffer will certainly get marked as defined when the
     * backend once again acquires its first pin on the buffer. (Of
     * course, if the backend never touches the buffer again then it
     * doesn't matter that it remains non-accessible to Valgrind.)
     *
     * Note: When an IndexTuple C pointer gets computed using an
     * ItemId read from a page while a lock was held, the C pointer
     * becomes unsafe to dereference forever as soon as the lock is
     * released.  Valgrind can only detect cases where the pointer
     * gets dereferenced with no _current_ lock/pin held, though.
     */
    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
}

/*
 *	_bt_unlockbuf() -- unlock a pinned buffer.
 */
void
_bt_unlockbuf(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
}

/*
 *	_bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
 *	buffer.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
bool
_bt_conditionallockbuf(Relation rel, Buffer buf)
{
    /* ConditionalLockBuffer() asserts that pin is held by this backend */
    if (!ConditionalLockBuffer(buf))
        return false;

    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

    return true;
}

/*
 *	_bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
 *	lock.
 */
void
_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    LockBufferForCleanup(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *	_bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.  But note that _bt_getbuf
 * knows enough to distinguish the PageIsNew condition from the other one.
 * At some point it might be appropriate to redesign this to have a three-way
 * result value.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page.  If we find a
     * zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact))
        return true;
    return false;
}

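/*
 * Illustrative sketch (not part of the original file): btvacuumscan applies
 * the same test when it encounters a deleted page, recording recyclable
 * pages in the FSM, roughly
 *
 *     else if (_bt_page_recyclable(page))
 *     {
 *         RecordFreeIndexPage(rel, blkno);
 *         vstate->totFreePages++;
 *         stats->pages_deleted++;
 *     }
 *
 * (Quoted from memory of the VACUUM-side caller; treat the surrounding
 * variable names as assumptions.)
 */
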
/*
 * Delete item(s) from a btree leaf page during VACUUM.
 *
 * This routine assumes that the caller has a super-exclusive write lock on
 * the buffer.  Also, the given deletable and updatable arrays *must* be
 * sorted in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed by VACUUM.  This works
 * by updating/overwriting an existing item with caller's new version of the
 * item (a version that lacks the TIDs that are to be deleted).
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
 * generate their own latestRemovedXid by accessing the heap directly, whereas
 * VACUUMs rely on the initial heap scan taking care of it indirectly.  Also,
 * only VACUUM can perform granular deletes of individual TIDs in posting list
 * tuples.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    Size        itemsz;
    char       *updatedbuf = NULL;
    Size        updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    for (int i = 0; i < nupdatable; i++)
    {
        /* Replace work area IndexTuple with updated version */
        _bt_update_posting(updatable[i]);

        /* Maintain array of updatable page offsets for WAL record */
        updatedoffsets[i] = updatable[i]->updatedoffset;
    }

    /* XLOG stuff -- allocate and fill buffer before critical section */
    if (nupdatable > 0 && RelationNeedsWAL(rel))
    {
        Size        offset = 0;

        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];

            itemsz = SizeOfBtreeUpdate +
                vacposting->ndeletedtids * sizeof(uint16);
            updatedbuflen += itemsz;
        }

        updatedbuf = palloc(updatedbuflen);
        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];
            xl_btree_update update;

            update.ndeletedtids = vacposting->ndeletedtids;
            memcpy(updatedbuf + offset, &update.ndeletedtids,
                   SizeOfBtreeUpdate);
            offset += SizeOfBtreeUpdate;

            itemsz = update.ndeletedtids * sizeof(uint16);
            memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
            offset += itemsz;
        }
    }

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Handle posting tuple updates.
     *
     * Deliberately do this before handling simple deletes.  If we did it the
     * other way around (i.e. WAL record order -- simple deletes before
     * updates) then we'd have to make compensating changes to the 'updatable'
     * array of offset numbers.
     *
     * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
     * happens to already be set.  Although we unset the BTP_HAS_GARBAGE page
     * level flag, unsetting individual LP_DEAD bits should still be avoided.
     */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple  itup;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    /* Now handle simple deletes of entire tuples */
    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't targeted by VACUUM's heap scan), but it will be true often
     * enough.  VACUUM does not delete items purely because they have their
     * LP_DEAD bit set, since doing so would necessitate explicitly logging a
     * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
     *
     * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
     * limited, since we never falsely unset an LP_DEAD bit.  Workloads that
     * are particularly dependent on LP_DEAD bits being set quickly will
     * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
     * again anyway.  Furthermore, attempting a deduplication pass will remove
     * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
     * is set or not.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.ndeleted = ndeletable;
        xlrec_vacuum.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples generated by calling _bt_update_posting() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}

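/*
 * Illustrative sketch (not part of the original file): a BTVacuumPosting
 * describes one posting list tuple plus the subset of its TIDs that VACUUM
 * wants gone.  A caller might build one like this before passing it in the
 * "updatable" array; the layout follows the BTVacuumPostingData definition
 * in access/nbtree.h.  The function name is hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static BTVacuumPosting
example_make_vacposting(IndexTuple origtuple, OffsetNumber offnum,
                        uint16 *deletetids, uint16 ndeletedtids)
{
    BTVacuumPosting vacposting;

    vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                        ndeletedtids * sizeof(uint16));
    vacposting->itup = origtuple;   /* _bt_update_posting replaces this */
    vacposting->updatedoffset = offnum;
    vacposting->ndeletedtids = ndeletedtids;
    memcpy(vacposting->deletetids, deletetids,
           ndeletedtids * sizeof(uint16));

    return vacposting;
}
#endif
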
/*
 * Delete item(s) from a btree leaf page during single-page cleanup.
 *
 * This routine assumes that the caller has pinned and write locked the
 * buffer.  Also, the given deletable array *must* be sorted in ascending
 * order.
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but it needs to generate its own latestRemovedXid by accessing
 * the heap.  This is used by the REDO routine to generate recovery conflicts.
 * Also, it doesn't handle posting list tuples unless the entire tuple can be
 * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
 */
void
_bt_delitems_delete(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    TransactionId latestRemovedXid = InvalidTransactionId;

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0);

    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
        latestRemovedXid =
            _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.  Just clear the BTP_HAS_GARBAGE
     * page flag, since we deleted all items with their LP_DEAD bit set.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.ndeleted = ndeletable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        /*
         * The deletable array is not in the buffer, but pretend that it is.
         * When XLogInsert stores the whole buffer, the array need not be
         * stored too.
         */
        XLogRegisterBufData(0, (char *) deletable,
                            ndeletable * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

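/*
 * Illustrative sketch (not part of the original file): the expected caller
 * collects the offsets of line pointers that earlier index scans marked
 * LP_DEAD, then deletes them all in one call.  The function name is
 * hypothetical.
 */
#ifdef ILLUSTRATIVE_EXAMPLE
static void
example_delete_lp_dead_items(Relation rel, Buffer buf, Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    OffsetNumber deletable[MaxIndexTuplesPerPage];
    int         ndeletable = 0;
    OffsetNumber offnum;
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    /* collect LP_DEAD offsets in ascending order, as required */
    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
        _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
}
#endif
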
/*
 * Get the latestRemovedXid from the table entries pointed to by the non-pivot
 * tuples being deleted.
 *
 * This is a specialized version of index_compute_xid_horizon_for_tuples().
 * It's needed because btree tuples don't always store table TID using the
 * standard index tuple header field.
 */
static TransactionId
_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                OffsetNumber *deletable, int ndeletable)
{
    TransactionId latestRemovedXid = InvalidTransactionId;
    int         spacenhtids;
    int         nhtids;
    ItemPointer htids;

    /* Array will grow iff there are posting list tuples to consider */
    spacenhtids = ndeletable;
    nhtids = 0;
    htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
    for (int i = 0; i < ndeletable; i++)
    {
        ItemId      itemid;
        IndexTuple  itup;

        itemid = PageGetItemId(page, deletable[i]);
        itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(ItemIdIsDead(itemid));
        Assert(!BTreeTupleIsPivot(itup));

        if (!BTreeTupleIsPosting(itup))
        {
            if (nhtids + 1 > spacenhtids)
            {
                spacenhtids *= 2;
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            Assert(ItemPointerIsValid(&itup->t_tid));
            ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
            nhtids++;
        }
        else
        {
            int         nposting = BTreeTupleGetNPosting(itup);

            if (nhtids + nposting > spacenhtids)
            {
                spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            for (int j = 0; j < nposting; j++)
            {
                ItemPointer htid = BTreeTupleGetPostingN(itup, j);

                Assert(ItemPointerIsValid(htid));
                ItemPointerCopy(htid, &htids[nhtids]);
                nhtids++;
            }
        }
    }

    Assert(nhtids >= ndeletable);

    latestRemovedXid =
        table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);

    pfree(htids);

    return latestRemovedXid;
}

/*
 * Check that leftsib page (the btpo_prev of target page) is not marked with
 * INCOMPLETE_SPLIT flag.  Used during page deletion.
 *
 * Returning true indicates that page flag is set in leftsib (which is
 * definitely still the left sibling of target).  When that happens, the
 * target doesn't have a downlink in parent, and the page deletion algorithm
 * isn't prepared to handle that.  Deletion of the target page (or the whole
 * subtree that contains the target page) cannot take place.
 *
 * Caller should not have a lock on the target page itself, since pages on the
 * same level must always be locked left to right to avoid deadlocks.
 */
static bool
_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    bool        result;

    /* Easy case: No left sibling */
    if (leftsib == P_NONE)
        return false;

    buf = _bt_getbuf(rel, leftsib, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * If the left sibling was concurrently split, so that its next-pointer
     * doesn't point to the current page anymore, the split that created
     * target must be completed.  Caller can reasonably expect that there will
     * be a downlink to the target page that it can relocate using its stack.
     * (We don't allow splitting an incompletely split page again until the
     * previous split has been completed.)
     */
    result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
    _bt_relbuf(rel, buf);

    return result;
}

/*
 * Check that leafrightsib page (the btpo_next of target leaf page) is not
 * marked with ISHALFDEAD flag.  Used during page deletion.
 *
 * Returning true indicates that page flag is set in leafrightsib, so page
 * deletion cannot go ahead.  Our caller is not prepared to deal with the case
 * where the parent page does not have a pivot tuple whose downlink points to
 * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
 * seem worth going to the trouble of teaching our caller to deal with it.
 * The situation will be resolved after VACUUM finishes the deletion of the
 * half-dead page (when a future VACUUM operation reaches the target page
 * again).
 *
 * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
 * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
 * okay because of the restriction on deleting pages that are the rightmost
 * page of their parent (i.e. that such deletions can only take place when the
 * entire subtree must be deleted).  The leaf level check made here will apply
 * to a right "cousin" leaf page rather than a simple right sibling leaf page
 * in cases where caller actually goes on to attempt deleting pages that are
 * above the leaf page.  The right cousin leaf page is representative of the
 * left edge of the subtree to the right of the to-be-deleted subtree as a
 * whole, which is exactly the condition that our caller cares about.
 * (Besides, internal pages are never marked half-dead, so it isn't even
 * possible to _directly_ assess if an internal page is part of some other
 * to-be-deleted subtree.)
 */
static bool
_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    bool        result;

    Assert(leafrightsib != P_NONE);

    buf = _bt_getbuf(rel, leafrightsib, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
    result = P_ISHALFDEAD(opaque);
    _bt_relbuf(rel, buf);

    return result;
}

1518 /*
1519  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1520  *
1521  * This action unlinks the leaf page from the b-tree structure, removing all
1522  * pointers leading to it --- but not touching its own left and right links.
1523  * The page cannot be physically reclaimed right away, since other processes
1524  * may currently be trying to follow links leading to the page; they have to
1525  * be allowed to use its right-link to recover. See nbtree/README.
1526  *
1527  * On entry, the target buffer must be pinned and locked (either read or write
1528  * lock is OK). The page must be an empty leaf page, which may be half-dead
1529  * already (a half-dead page should only be passed to us when an earlier
1530  * VACUUM operation was interrupted, though). Note in particular that caller
1531  * should never pass a buffer containing an existing deleted page here. The
1532  * lock and pin on caller's buffer will be dropped before we return.
1533  *
1534  * Returns the number of pages successfully deleted (zero if page cannot
1535  * be deleted now; could be more than one if parent or right sibling pages
1536  * were deleted too). Note that this does not include pages that we delete
1537  * that the btvacuumscan scan has yet to reach; they'll get counted later
1538  * instead.
1539  *
1540  * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1541  * responsible for maintaining *oldestBtpoXact in the case of pages that were
1542  * deleted by a previous VACUUM.
1543  *
1544  * NOTE: this leaks memory. Rather than trying to clean up everything
1545  * carefully, it's better to run it in a temp context that can be reset
1546  * frequently.
1547  */
1548 uint32
1549 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1550 {
1551  uint32 ndeleted = 0;
1552  BlockNumber rightsib;
1553  bool rightsib_empty;
1554  Page page;
1555  BTPageOpaque opaque;
1556 
1557  /*
1558  * Save original leafbuf block number from caller. Only deleted blocks
1559  * that are <= scanblkno get counted in ndeleted return value.
1560  */
1561  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1562 
1563  /*
1564  * "stack" is a search stack leading (approximately) to the target page.
1565  * It is initially NULL, but when iterating, we keep it to avoid
1566  * duplicated search effort.
1567  *
1568  * Also, when "stack" is not NULL, we have already checked that the
1569  * current page is not the right half of an incomplete split, i.e. the
1570  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1571  * when the current target page is to the right of caller's initial page
1572  * (the scanblkno page).
1573  */
1574  BTStack stack = NULL;
1575 
1576  for (;;)
1577  {
1578  page = BufferGetPage(leafbuf);
1579  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1580 
1581  /*
1582  * Internal pages are never deleted directly, only as part of deleting
1583  * the whole subtree all the way down to leaf level.
1584  *
1585  * Also check for deleted pages here. Caller never passes us a fully
1586  * deleted page. Only VACUUM can delete pages, so there can't have
1587  * been a concurrent deletion. Assume that we reached any deleted
1588  * page encountered here by following a sibling link, and that the
1589  * index is corrupt.
1590  */
1591  Assert(!P_ISDELETED(opaque));
1592  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1593  {
1594  /*
1595  * Pre-9.4 page deletion only marked internal pages as half-dead,
1596  * but now we only use that flag on leaf pages. The old algorithm
1597  * was never supposed to leave half-dead pages in the tree, it was
1598  * just a transient state, but it was nevertheless possible in
1599  * error scenarios. We don't know how to deal with them here. They
1600  * are harmless as far as searches are considered, but inserts
1601  * into the deleted keyspace could add out-of-order downlinks in
1602  * the upper levels. Log a notice, hopefully the admin will notice
1603  * and reindex.
1604  */
1605  if (P_ISHALFDEAD(opaque))
1606  ereport(LOG,
1607  (errcode(ERRCODE_INDEX_CORRUPTED),
1608  errmsg("index \"%s\" contains a half-dead internal page",
1610  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1611 
1612  if (P_ISDELETED(opaque))
1613  ereport(LOG,
1614  (errcode(ERRCODE_INDEX_CORRUPTED),
1615  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1616  BufferGetBlockNumber(leafbuf),
1617  scanblkno,
1618  RelationGetRelationName(rel))));
1619 
1620  _bt_relbuf(rel, leafbuf);
1621  return ndeleted;
1622  }
1623 
1624  /*
1625  * We can never delete rightmost pages nor root pages. While at it,
1626  * check that page is empty, since it's possible that the leafbuf page
1627  * was empty a moment ago, but has since had some inserts.
1628  *
1629  * To keep the algorithm simple, we also never delete an incompletely
1630  * split page (they should be rare enough that this doesn't make any
1631  * meaningful difference to disk usage):
1632  *
1633  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1634  * left half of an incomplete split, but ensuring that it's not the
1635  * right half is more complicated. For that, we have to check that
1636  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1637  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1638  * release the lock on scanblkno/leafbuf, check the left sibling, and
1639  * construct a search stack to scanblkno. On subsequent iterations,
1640  * we know we stepped right from a page that passed these tests, so
1641  * it's OK.
1642  */
1643  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1644  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1645  P_INCOMPLETE_SPLIT(opaque))
1646  {
1647  /* Should never fail to delete a half-dead page */
1648  Assert(!P_ISHALFDEAD(opaque));
1649 
1650  _bt_relbuf(rel, leafbuf);
1651  return ndeleted;
1652  }
1653 
1654  /*
1655  * First, remove downlink pointing to the page (or a parent of the
1656  * page, if we are going to delete a taller subtree), and mark the
1657  * leafbuf page half-dead
1658  */
1659  if (!P_ISHALFDEAD(opaque))
1660  {
1661  /*
1662  * We need an approximate pointer to the page's parent page. We
1663  * use a variant of the standard search mechanism to search for
1664  * the page's high key; this will give us a link to either the
1665  * current parent or someplace to its left (if there are multiple
1666  * equal high keys, which is possible with !heapkeyspace indexes).
1667  *
1668  * Also check if this is the right-half of an incomplete split
1669  * (see comment above).
1670  */
1671  if (!stack)
1672  {
1673  BTScanInsert itup_key;
1674  ItemId itemid;
1675  IndexTuple targetkey;
1676  BlockNumber leftsib,
1677  leafblkno;
1678  Buffer sleafbuf;
1679 
1680  itemid = PageGetItemId(page, P_HIKEY);
1681  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1682 
1683  leftsib = opaque->btpo_prev;
1684  leafblkno = BufferGetBlockNumber(leafbuf);
1685 
1686  /*
1687  * To avoid deadlocks, we'd better drop the leaf page lock
1688  * before going further.
1689  */
1690  _bt_unlockbuf(rel, leafbuf);
1691 
1692  /*
1693  * Check that the left sibling of leafbuf (if any) is not
1694  * marked with INCOMPLETE_SPLIT flag before proceeding
1695  */
1696  Assert(leafblkno == scanblkno);
1697  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1698  {
1699  ReleaseBuffer(leafbuf);
1700  Assert(ndeleted == 0);
1701  return ndeleted;
1702  }
1703 
1704  /* we need an insertion scan key for the search, so build one */
1705  itup_key = _bt_mkscankey(rel, targetkey);
1706  /* find the leftmost leaf page with matching pivot/high key */
1707  itup_key->pivotsearch = true;
1708  stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1709  /* won't need a second lock or pin on leafbuf */
1710  _bt_relbuf(rel, sleafbuf);
1711 
1712  /*
1713  * Re-lock the leaf page, and start over to use our stack
1714  * within _bt_mark_page_halfdead. We must do it that way
1715  * because it's possible that leafbuf can no longer be
1716  * deleted. We need to recheck.
1717  *
1718  * Note: We can't simply hold on to the sleafbuf lock instead,
1719  * because it's barely possible that sleafbuf is not the same
1720  * page as leafbuf. This happens when leafbuf split after our
1721  * original lock was dropped, but before _bt_search finished
1722  * its descent. We rely on the assumption that we'll find
1723  * leafbuf isn't safe to delete anymore in this scenario.
1724  * (Page deletion can cope with the stack being to the left of
1725  * leafbuf, but not to the right of leafbuf.)
1726  */
1727  _bt_lockbuf(rel, leafbuf, BT_WRITE);
1728  continue;
1729  }
1730 
1731  /*
1732  * See if it's safe to delete the leaf page, and determine how
1733  * many parent/internal pages above the leaf level will be
1734  * deleted. If it's safe then _bt_mark_page_halfdead will also
1735  * perform the first phase of deletion, which includes marking the
1736  * leafbuf page half-dead.
1737  */
1738  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
1739  if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
1740  {
1741  _bt_relbuf(rel, leafbuf);
1742  return ndeleted;
1743  }
1744  }
1745 
1746  /*
1747  * Then unlink it from its siblings. Each call to
1748  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
1749  * making it shallower. Iterate until the leafbuf page is deleted.
1750  *
1751  * _bt_unlink_halfdead_page should never fail, since we established
1752  * that deletion is generally safe in _bt_mark_page_halfdead.
1753  */
1754  rightsib_empty = false;
1755  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
1756  while (P_ISHALFDEAD(opaque))
1757  {
1758  /* Check for interrupts in _bt_unlink_halfdead_page */
1759  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
1760  &rightsib_empty, oldestBtpoXact,
1761  &ndeleted))
1762  {
1763  /* _bt_unlink_halfdead_page failed, released buffer */
1764  return ndeleted;
1765  }
1766  }
1767 
1768  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
1769  Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
1770  *oldestBtpoXact));
1771 
1772  rightsib = opaque->btpo_next;
1773 
1774  _bt_relbuf(rel, leafbuf);
1775 
1776  /*
1777  * Check here, as calling loops will have locks held, preventing
1778  * interrupts from being processed.
1779  */
1780  CHECK_FOR_INTERRUPTS();
1781 
1782  /*
1783  * The page has now been deleted. If its right sibling is completely
1784  * empty, it's possible that the reason we haven't deleted it earlier
1785  * is that it was the rightmost child of the parent. Now that we
1786  * removed the downlink for this page, the right sibling might now be
1787  * the only child of the parent, and could be removed. It would be
1788  * picked up by the next vacuum anyway, but might as well try to
1789  * remove it now, so loop back to process the right sibling.
1790  *
1791  * Note: This relies on the assumption that _bt_getstackbuf() will be
1792  * able to reuse our original descent stack with a different child
1793  * block (provided that the child block is to the right of the
1794  * original leaf page reached by _bt_search()). It will even update
1795  * the descent stack each time we loop around, avoiding repeated work.
1796  */
1797  if (!rightsib_empty)
1798  break;
1799 
1800  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
1801  }
1802 
1803  return ndeleted;
1804 }
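For orientation, here is a minimal sketch, not part of nbtpage.c, of how a VACUUM-side caller might drive _bt_pagedel() when it finds an apparently-empty leaf page. The wrapper name and the pages_deleted counter are hypothetical; the real call site is btvacuumpage() in nbtree.c.

/* Hypothetical wrapper illustrating the _bt_pagedel() contract only */
static void
vacuum_one_empty_leaf(Relation rel, Buffer leafbuf,
                      TransactionId *oldestBtpoXact,
                      uint32 *pages_deleted)
{
    /* caller must hold a pin and write lock on leafbuf at this point */
    uint32      ndeleted = _bt_pagedel(rel, leafbuf, oldestBtpoXact);

    /* _bt_pagedel() has already dropped leafbuf's pin and lock */
    *pages_deleted += ndeleted;
}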
1805 
1806 /*
1807  * First stage of page deletion.
1808  *
1809  * Establish the height of the to-be-deleted subtree with leafbuf at its
1810  * lowest level, remove the downlink to the subtree, and mark leafbuf
1811  * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
1812  * but may include additional internal pages (at most one per level of the
1813  * tree below the root).
1814  *
1815  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
1816  * the rightmost child of its parent (and parent has more than one downlink).
1817  * Returns 'true' when the first stage of page deletion completed
1818  * successfully.
1819  */
1820 static bool
1821 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1822 {
1823  BlockNumber leafblkno;
1824  BlockNumber leafrightsib;
1825  BlockNumber topparent;
1826  BlockNumber topparentrightsib;
1827  ItemId itemid;
1828  Page page;
1829  BTPageOpaque opaque;
1830  Buffer subtreeparent;
1831  OffsetNumber poffset;
1832  OffsetNumber nextoffset;
1833  IndexTuple itup;
1834  IndexTupleData trunctuple;
1835 
1836  page = BufferGetPage(leafbuf);
1837  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1838 
1839  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
1840  P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
1841  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1842 
1843  /*
1844  * Save info about the leaf page.
1845  */
1846  leafblkno = BufferGetBlockNumber(leafbuf);
1847  leafrightsib = opaque->btpo_next;
1848 
1849  /*
1850  * Before attempting to lock the parent page, check that the right sibling
1851  * is not in half-dead state. A half-dead right sibling would have no
1852  * downlink in the parent, which would be highly confusing later when we
1853  * delete the downlink. It would fail the "right sibling of target page
1854  * is also the next child in parent page" cross-check below.
1855  */
1856  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
1857  {
1858  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
1859  leafblkno, leafrightsib);
1860  return false;
1861  }
1862 
1863  /*
1864  * We cannot delete a page that is the rightmost child of its immediate
1865  * parent, unless it is the only child --- in which case the parent has to
1866  * be deleted too, and the same condition applies recursively to it. We
1867  * have to check this condition all the way up before trying to delete,
1868  * and lock the parent of the root of the to-be-deleted subtree (the
1869  * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
1870  * for us. We remove the downlink to the "top parent" page (subtree root
1871  * page) from the subtree parent page below.
1872  *
1873  * Initialize topparent to be leafbuf page now. The final to-be-deleted
1874  * subtree is often a degenerate one page subtree consisting only of the
1875  * leafbuf page. When that happens, the leafbuf page is the final subtree
1876  * root page/top parent page.
1877  */
1878  topparent = leafblkno;
1879  topparentrightsib = leafrightsib;
1880  if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
1881  &subtreeparent, &poffset,
1882  &topparent, &topparentrightsib))
1883  return false;
1884 
1885  /*
1886  * Check that the parent-page index items we're about to delete/overwrite
1887  * in subtree parent page contain what we expect. This can fail if the
1888  * index has become corrupt for some reason. We want to throw any error
1889  * before entering the critical section --- otherwise it'd be a PANIC.
1890  */
1891  page = BufferGetPage(subtreeparent);
1892  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1893 
1894 #ifdef USE_ASSERT_CHECKING
1895 
1896  /*
1897  * This is just an assertion because _bt_lock_subtree_parent should have
1898  * guaranteed tuple has the expected contents
1899  */
1900  itemid = PageGetItemId(page, poffset);
1901  itup = (IndexTuple) PageGetItem(page, itemid);
1902  Assert(BTreeTupleGetDownLink(itup) == topparent);
1903 #endif
1904 
1905  nextoffset = OffsetNumberNext(poffset);
1906  itemid = PageGetItemId(page, nextoffset);
1907  itup = (IndexTuple) PageGetItem(page, itemid);
1908  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
1909  ereport(ERROR,
1910  (errcode(ERRCODE_INDEX_CORRUPTED),
1911  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1912  topparentrightsib, topparent,
1913  BTreeTupleGetDownLink(itup),
1914  BufferGetBlockNumber(subtreeparent),
1915  RelationGetRelationName(rel))));
1916 
1917  /*
1918  * Any insert which would have gone on the leaf block will now go to its
1919  * right sibling. In other words, the key space moves right.
1920  */
1921  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
1922 
1923  /* No ereport(ERROR) until changes are logged */
1924  START_CRIT_SECTION();
1925 
1926  /*
1927  * Update parent of subtree. We want to delete the downlink to the top
1928  * parent page/root of the subtree, and the *following* key. Easiest way
1929  * is to copy the right sibling's downlink over the downlink that points
1930  * to top parent page, and then delete the right sibling's original pivot
1931  * tuple.
1932  *
1933  * Lanin and Shasha make the key space move left when deleting a page,
1934  * whereas the key space moves right here. That's why we cannot simply
1935  * delete the pivot tuple with the downlink to the top parent page. See
1936  * nbtree/README.
1937  */
1938  page = BufferGetPage(subtreeparent);
1939  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1940 
1941  itemid = PageGetItemId(page, poffset);
1942  itup = (IndexTuple) PageGetItem(page, itemid);
1943  BTreeTupleSetDownLink(itup, topparentrightsib);
1944 
1945  nextoffset = OffsetNumberNext(poffset);
1946  PageIndexTupleDelete(page, nextoffset);
1947 
1948  /*
1949  * Mark the leaf page as half-dead, and stamp it with a link to the top
1950  * parent page. When the leaf page is also the top parent page, the link
1951  * is set to InvalidBlockNumber.
1952  */
1953  page = BufferGetPage(leafbuf);
1954  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1955  opaque->btpo_flags |= BTP_HALF_DEAD;
1956 
1957  Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
1958  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
1959  trunctuple.t_info = sizeof(IndexTupleData);
1960  if (topparent != leafblkno)
1961  BTreeTupleSetTopParent(&trunctuple, topparent);
1962  else
1963  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
1964 
1965  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
1966  IndexTupleSize(&trunctuple)))
1967  elog(ERROR, "could not overwrite high key in half-dead page");
1968 
1969  /* Must mark buffers dirty before XLogInsert */
1970  MarkBufferDirty(subtreeparent);
1971  MarkBufferDirty(leafbuf);
1972 
1973  /* XLOG stuff */
1974  if (RelationNeedsWAL(rel))
1975  {
1976  xl_btree_mark_page_halfdead xlrec;
1977  XLogRecPtr recptr;
1978 
1979  xlrec.poffset = poffset;
1980  xlrec.leafblk = leafblkno;
1981  if (topparent != leafblkno)
1982  xlrec.topparent = topparent;
1983  else
1984  xlrec.topparent = InvalidBlockNumber;
1985 
1986  XLogBeginInsert();
1987  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
1988  XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
1989 
1990  page = BufferGetPage(leafbuf);
1991  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1992  xlrec.leftblk = opaque->btpo_prev;
1993  xlrec.rightblk = opaque->btpo_next;
1994 
1995  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
1996 
1997  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
1998 
1999  page = BufferGetPage(subtreeparent);
2000  PageSetLSN(page, recptr);
2001  page = BufferGetPage(leafbuf);
2002  PageSetLSN(page, recptr);
2003  }
2004 
2005  END_CRIT_SECTION();
2006 
2007  _bt_relbuf(rel, subtreeparent);
2008  return true;
2009 }
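The top parent link stamped into the leaf page's high key above is exactly what the second stage reads back. A hedged sketch of that read-back, with a hypothetical helper name, mirroring the opening of _bt_unlink_halfdead_page() below:

/* Hypothetical helper mirroring how _bt_unlink_halfdead_page() starts */
static BlockNumber
halfdead_leaf_top_parent(Page page)
{
    ItemId      itemid = PageGetItemId(page, P_HIKEY);
    IndexTuple  hikey = (IndexTuple) PageGetItem(page, itemid);

    /* InvalidBlockNumber: the half-dead leaf is itself the subtree root */
    return BTreeTupleGetTopParent(hikey);
}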
2010 
2011 /*
2012  * Second stage of page deletion.
2013  *
2014  * Unlinks a single page (in the subtree undergoing deletion) from its
2015  * siblings. Also marks the page deleted.
2016  *
2017  * To get rid of the whole subtree, including the leaf page itself, call here
2018  * until the leaf page is deleted. The original "top parent" established in
2019  * the first stage of deletion is deleted in the first call here, while the
2020  * leaf page is deleted in the last call here. Note that the leaf page itself
2021  * is often the initial top parent page.
2022  *
2023  * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2024  * the right sibling of the current target page is empty, *rightsib_empty is
2025  * set to true, allowing caller to delete the target's right sibling page in
2026  * passing. Note that *rightsib_empty is only actually used by caller when
2027  * target page is leafbuf, following last call here for leafbuf/the subtree
2028  * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2029  * consistent.)
2030  *
2031  * We maintain *oldestBtpoXact for pages that are deleted by the current
2032  * VACUUM operation here. This must be handled here because we conservatively
2033  * assume that there needs to be a new call to ReadNewTransactionId() each
2034  * time a page gets deleted. See comments about the underlying assumption
2035  * below.
2036  *
2037  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2038  * On success exit, we'll be holding pin and write lock. On failure exit,
2039  * we'll release both pin and lock before returning (we define it that way
2040  * to avoid having to reacquire a lock we already released).
2041  */
2042 static bool
2043 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2044  bool *rightsib_empty, TransactionId *oldestBtpoXact,
2045  uint32 *ndeleted)
2046 {
2047  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2048  BlockNumber leafleftsib;
2049  BlockNumber leafrightsib;
2050  BlockNumber target;
2051  BlockNumber leftsib;
2052  BlockNumber rightsib;
2053  Buffer lbuf = InvalidBuffer;
2054  Buffer buf;
2055  Buffer rbuf;
2056  Buffer metabuf = InvalidBuffer;
2057  Page metapg = NULL;
2058  BTMetaPageData *metad = NULL;
2059  ItemId itemid;
2060  Page page;
2061  PageHeader header;
2062  BTPageOpaque opaque;
2063  bool rightsib_is_rightmost;
2064  int targetlevel;
2065  IndexTuple leafhikey;
2066  BlockNumber nextchild;
2067 
2068  page = BufferGetPage(leafbuf);
2069  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2070 
2071  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2072 
2073  /*
2074  * Remember some information about the leaf page.
2075  */
2076  itemid = PageGetItemId(page, P_HIKEY);
2077  leafhikey = (IndexTuple) PageGetItem(page, itemid);
2078  target = BTreeTupleGetTopParent(leafhikey);
2079  leafleftsib = opaque->btpo_prev;
2080  leafrightsib = opaque->btpo_next;
2081 
2082  _bt_unlockbuf(rel, leafbuf);
2083 
2084  /*
2085  * Check here, as calling loops will have locks held, preventing
2086  * interrupts from being processed.
2087  */
2088  CHECK_FOR_INTERRUPTS();
2089 
2090  /* Unlink the current top parent of the subtree */
2091  if (!BlockNumberIsValid(target))
2092  {
2093  /* Target is leaf page (or leaf page is top parent, if you prefer) */
2094  target = leafblkno;
2095 
2096  buf = leafbuf;
2097  leftsib = leafleftsib;
2098  targetlevel = 0;
2099  }
2100  else
2101  {
2102  /* Target is the internal page taken from leaf's top parent link */
2103  Assert(target != leafblkno);
2104 
2105  /* Fetch the block number of the target's left sibling */
2106  buf = _bt_getbuf(rel, target, BT_READ);
2107  page = BufferGetPage(buf);
2108  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2109  leftsib = opaque->btpo_prev;
2110  targetlevel = opaque->btpo.level;
2111  Assert(targetlevel > 0);
2112 
2113  /*
2114  * To avoid deadlocks, we'd better drop the target page lock before
2115  * going further.
2116  */
2117  _bt_unlockbuf(rel, buf);
2118  }
2119 
2120  /*
2121  * We have to lock the pages we need to modify in the standard order:
2122  * moving right, then up. Else we will deadlock against other writers.
2123  *
2124  * So, first lock the leaf page, if it's not the target. Then find and
2125  * write-lock the current left sibling of the target page. The sibling
2126  * that was current a moment ago could have split, so we may have to move
2127  * right. This search could fail if either the sibling or the target page
2128  * was deleted by someone else meanwhile; if so, give up. (Right now,
2129  * that should never happen, since page deletion is only done in VACUUM
2130  * and there shouldn't be multiple VACUUMs concurrently on the same
2131  * table.)
2132  */
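 /*
  * In short, the locking sequence below is: leafbuf (when distinct
  * from the target), then the target's current left sibling (stepping
  * right as needed), then the target itself, then the right sibling,
  * and finally the metapage when the fast root may need to change.
  */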
2133  if (target != leafblkno)
2134  _bt_lockbuf(rel, leafbuf, BT_WRITE);
2135  if (leftsib != P_NONE)
2136  {
2137  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2138  page = BufferGetPage(lbuf);
2139  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2140  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2141  {
2142  /* step right one page */
2143  leftsib = opaque->btpo_next;
2144  _bt_relbuf(rel, lbuf);
2145 
2146  /*
2147  * It'd be good to check for interrupts here, but it's not easy to
2148  * do so because a lock is always held. This block isn't
2149  * frequently reached, so hopefully the consequences of not
2150  * checking interrupts aren't too bad.
2151  */
2152 
2153  if (leftsib == P_NONE)
2154  {
2155  elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
2156  target,
2157  RelationGetRelationName(rel));
2158  if (target != leafblkno)
2159  {
2160  /* we have only a pin on target, but pin+lock on leafbuf */
2161  ReleaseBuffer(buf);
2162  _bt_relbuf(rel, leafbuf);
2163  }
2164  else
2165  {
2166  /* we have only a pin on leafbuf */
2167  ReleaseBuffer(leafbuf);
2168  }
2169  return false;
2170  }
2171  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2172  page = BufferGetPage(lbuf);
2173  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2174  }
2175  }
2176  else
2177  lbuf = InvalidBuffer;
2178 
2179  /*
2180  * Next write-lock the target page itself. It's okay to take a write lock
2181  * rather than a superexclusive lock, since no scan will stop on an empty
2182  * page.
2183  */
2184  _bt_lockbuf(rel, buf, BT_WRITE);
2185  page = BufferGetPage(buf);
2186  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2187 
2188  /*
2189  * Check page is still empty etc, else abandon deletion. This is just for
2190  * paranoia's sake; a half-dead page cannot resurrect because there can be
2191  * only one vacuum process running at a time.
2192  */
2193  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2194  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2195  target, RelationGetRelationName(rel));
2196 
2197  if (opaque->btpo_prev != leftsib)
2198  ereport(ERROR,
2199  (errcode(ERRCODE_INDEX_CORRUPTED),
2200  errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
2201  target, RelationGetRelationName(rel))));
2202 
2203  if (target == leafblkno)
2204  {
2205  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2206  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2207  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2208  target, RelationGetRelationName(rel));
2209  nextchild = InvalidBlockNumber;
2210  }
2211  else
2212  {
2213  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2214  P_ISLEAF(opaque))
2215  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2216  target, RelationGetRelationName(rel));
2217 
2218  /* Remember the next non-leaf child down in the subtree */
2219  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2220  nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
2221  if (nextchild == leafblkno)
2222  nextchild = InvalidBlockNumber;
2223  }
2224 
2225  /*
2226  * And next write-lock the (current) right sibling.
2227  */
2228  rightsib = opaque->btpo_next;
2229  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2230  page = BufferGetPage(rbuf);
2231  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2232  if (opaque->btpo_prev != target)
2233  ereport(ERROR,
2234  (errcode(ERRCODE_INDEX_CORRUPTED),
2235  errmsg_internal("right sibling's left-link doesn't match: "
2236  "block %u links to %u instead of expected %u in index \"%s\"",
2237  rightsib, opaque->btpo_prev, target,
2238  RelationGetRelationName(rel))));
2239  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2240  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2241 
2242  /*
2243  * If we are deleting the next-to-last page on the target's level, then
2244  * the rightsib is a candidate to become the new fast root. (In theory, it
2245  * might be possible to push the fast root even further down, but the odds
2246  * of doing so are slim, and the locking considerations daunting.)
2247  *
2248  * We can safely acquire a lock on the metapage here --- see comments for
2249  * _bt_newroot().
2250  */
2251  if (leftsib == P_NONE && rightsib_is_rightmost)
2252  {
2253  page = BufferGetPage(rbuf);
2254  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2255  if (P_RIGHTMOST(opaque))
2256  {
2257  /* rightsib will be the only one left on the level */
2258  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2259  metapg = BufferGetPage(metabuf);
2260  metad = BTPageGetMeta(metapg);
2261 
2262  /*
2263  * The expected case here is btm_fastlevel == targetlevel+1; if
2264  * the fastlevel is <= targetlevel, something is wrong, and we
2265  * choose to overwrite it to fix it.
2266  */
2267  if (metad->btm_fastlevel > targetlevel + 1)
2268  {
2269  /* no update wanted */
2270  _bt_relbuf(rel, metabuf);
2271  metabuf = InvalidBuffer;
2272  }
2273  }
2274  }
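 /*
  * For example, with a level-0 target whose right sibling is about to
  * become the only page on the level and btm_fastlevel == 1, the
  * metapage lock is kept and the sibling becomes the new fast root
  * below; had btm_fastlevel already exceeded targetlevel + 1, the
  * metapage was just released unchanged.
  */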
2275 
2276  /*
2277  * Here we begin doing the deletion.
2278  */
2279 
2280  /* No ereport(ERROR) until changes are logged */
2281  START_CRIT_SECTION();
2282 
2283  /*
2284  * Update siblings' side-links. Note the target page's side-links will
2285  * continue to point to the siblings. Asserts here are just rechecking
2286  * things we already verified above.
2287  */
2288  if (BufferIsValid(lbuf))
2289  {
2290  page = BufferGetPage(lbuf);
2291  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2292  Assert(opaque->btpo_next == target);
2293  opaque->btpo_next = rightsib;
2294  }
2295  page = BufferGetPage(rbuf);
2296  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2297  Assert(opaque->btpo_prev == target);
2298  opaque->btpo_prev = leftsib;
2299 
2300  /*
2301  * If we deleted a parent of the targeted leaf page, instead of the leaf
2302  * itself, update the leaf to point to the next remaining child in the
2303  * subtree.
2304  *
2305  * Note: We rely on the fact that a buffer pin on the leaf page has been
2306  * held since leafhikey was initialized. This is safe, though only
2307  * because the page was already half-dead at that point. The leaf page
2308  * cannot have been modified by any other backend during the period when
2309  * no lock was held.
2310  */
2311  if (target != leafblkno)
2312  BTreeTupleSetTopParent(leafhikey, nextchild);
2313 
2314  /*
2315  * Mark the page itself deleted. It can be recycled when all current
2316  * transactions are gone. Storing GetTopTransactionId() would work, but
2317  * we're in VACUUM and would not otherwise have an XID. Having already
2318  * updated links to the target, ReadNewTransactionId() suffices as an
2319  * upper bound. Any scan having retained a now-stale link is advertising
2320  * in its PGPROC an xmin less than or equal to the value we read here. It
2321  * will continue to do so, holding back the xmin horizon, for the duration
2322  * of that scan.
2323  */
2324  page = BufferGetPage(buf);
2325  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2326  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2327  opaque->btpo_flags &= ~BTP_HALF_DEAD;
2328  opaque->btpo_flags |= BTP_DELETED;
2329  opaque->btpo.xact = ReadNewTransactionId();
2330 
2331  /*
2332  * Remove the remaining tuples on the page. This keeps things simple for
2333  * WAL consistency checking.
2334  */
2335  header = (PageHeader) page;
2336  header->pd_lower = SizeOfPageHeaderData;
2337  header->pd_upper = header->pd_special;
2338 
2339  /* And update the metapage, if needed */
2340  if (BufferIsValid(metabuf))
2341  {
2342  /* upgrade metapage if needed */
2343  if (metad->btm_version < BTREE_NOVAC_VERSION)
2344  _bt_upgrademetapage(metapg);
2345  metad->btm_fastroot = rightsib;
2346  metad->btm_fastlevel = targetlevel;
2347  MarkBufferDirty(metabuf);
2348  }
2349 
2350  /* Must mark buffers dirty before XLogInsert */
2351  MarkBufferDirty(rbuf);
2352  MarkBufferDirty(buf);
2353  if (BufferIsValid(lbuf))
2354  MarkBufferDirty(lbuf);
2355  if (target != leafblkno)
2356  MarkBufferDirty(leafbuf);
2357 
2358  /* XLOG stuff */
2359  if (RelationNeedsWAL(rel))
2360  {
2361  xl_btree_unlink_page xlrec;
2362  xl_btree_metadata xlmeta;
2363  uint8 xlinfo;
2364  XLogRecPtr recptr;
2365 
2366  XLogBeginInsert();
2367 
2368  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2369  if (BufferIsValid(lbuf))
2370  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2371  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2372  if (target != leafblkno)
2373  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2374 
2375  /* information on the unlinked block */
2376  xlrec.leftsib = leftsib;
2377  xlrec.rightsib = rightsib;
2378  xlrec.btpo_xact = opaque->btpo.xact;
2379 
2380  /* information needed to recreate the leaf block (if not the target) */
2381  xlrec.leafleftsib = leafleftsib;
2382  xlrec.leafrightsib = leafrightsib;
2383  xlrec.topparent = nextchild;
2384 
2385  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2386 
2387  if (BufferIsValid(metabuf))
2388  {
2389  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2390 
2391  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2392  xlmeta.version = metad->btm_version;
2393  xlmeta.root = metad->btm_root;
2394  xlmeta.level = metad->btm_level;
2395  xlmeta.fastroot = metad->btm_fastroot;
2396  xlmeta.fastlevel = metad->btm_fastlevel;
2397  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2398  xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2399  xlmeta.allequalimage = metad->btm_allequalimage;
2400 
2401  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2402  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2403  }
2404  else
2405  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2406 
2407  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2408 
2409  if (BufferIsValid(metabuf))
2410  {
2411  PageSetLSN(metapg, recptr);
2412  }
2413  page = BufferGetPage(rbuf);
2414  PageSetLSN(page, recptr);
2415  page = BufferGetPage(buf);
2416  PageSetLSN(page, recptr);
2417  if (BufferIsValid(lbuf))
2418  {
2419  page = BufferGetPage(lbuf);
2420  PageSetLSN(page, recptr);
2421  }
2422  if (target != leafblkno)
2423  {
2424  page = BufferGetPage(leafbuf);
2425  PageSetLSN(page, recptr);
2426  }
2427  }
2428 
2429  END_CRIT_SECTION();
2430 
2431  /* release metapage */
2432  if (BufferIsValid(metabuf))
2433  _bt_relbuf(rel, metabuf);
2434 
2435  /* release siblings */
2436  if (BufferIsValid(lbuf))
2437  _bt_relbuf(rel, lbuf);
2438  _bt_relbuf(rel, rbuf);
2439 
2440  if (!TransactionIdIsValid(*oldestBtpoXact) ||
2441  TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
2442  *oldestBtpoXact = opaque->btpo.xact;
2443 
2444  /*
2445  * If btvacuumscan won't revisit this page in a future btvacuumpage call
2446  * and count it as deleted then, we count it as deleted by current
2447  * btvacuumpage call
2448  */
2449  if (target <= scanblkno)
2450  (*ndeleted)++;
2451 
2452  /* If the target is not leafbuf, we're done with it now -- release it */
2453  if (target != leafblkno)
2454  _bt_relbuf(rel, buf);
2455 
2456  return true;
2457 }
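The *oldestBtpoXact value maintained above is what ultimately tells a later VACUUM whether deleted pages have become recyclable. A hedged sketch of the follow-up step a caller could take once its index scan completes; num_heap_tuples is a hypothetical stand-in for the caller's heap tuple estimate (cf. _bt_update_meta_cleanup_info() earlier in this file):

/* Hypothetical post-scan step in the VACUUM caller */
if (TransactionIdIsValid(oldestBtpoXact))
    _bt_update_meta_cleanup_info(rel, oldestBtpoXact, num_heap_tuples);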
2458 
2459 /*
2460  * Establish how tall the to-be-deleted subtree will be during the first stage
2461  * of page deletion.
2462  *
2463  * Caller's child argument is the block number of the page caller wants to
2464  * delete (this is leafbuf's block number, except when we're called
2465  * recursively). stack is a search stack leading to it. Note that we will
2466  * update the stack entry(s) to reflect current downlink positions --- this is
2467  * similar to the corresponding point in page split handling.
2468  *
2469  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2470  * false. Returns true on success, in which case caller can use certain
2471  * details established here to perform the first stage of deletion. This
2472  * function is the last point at which page deletion may be deemed unsafe
2473  * (barring index corruption, or unexpected concurrent page deletions).
2474  *
2475  * We write lock the parent of the root of the to-be-deleted subtree for
2476  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2477  * caller). Caller will have to remove a downlink from *subtreeparent. We
2478  * also set a *subtreeparent offset number in *poffset, to indicate the
2479  * location of the pivot tuple that contains the relevant downlink.
2480  *
2481  * The root of the to-be-deleted subtree is called the "top parent". Note
2482  * that the leafbuf page is often the final "top parent" page (you can think
2483  * of the leafbuf page as a degenerate single page subtree when that happens).
2484  * Caller should initialize *topparent to the target leafbuf page block number
2485  * (while *topparentrightsib should be set to leafbuf's right sibling block
2486  * number). We will update *topparent (and *topparentrightsib) for caller
2487  * here, though only when it turns out that caller will delete at least one
2488  * internal page (i.e. only when caller needs to store a valid link to the top
2489  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2490  */
2491 static bool
2492 _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
2493  Buffer *subtreeparent, OffsetNumber *poffset,
2494  BlockNumber *topparent, BlockNumber *topparentrightsib)
2495 {
2496  BlockNumber parent,
2497  leftsibparent;
2498  OffsetNumber parentoffset,
2499  maxoff;
2500  Buffer pbuf;
2501  Page page;
2502  BTPageOpaque opaque;
2503 
2504  /*
2505  * Locate the pivot tuple whose downlink points to "child". Write lock
2506  * the parent page itself.
2507  */
2508  pbuf = _bt_getstackbuf(rel, stack, child);
2509  if (pbuf == InvalidBuffer)
2510  ereport(ERROR,
2511  (errcode(ERRCODE_INDEX_CORRUPTED),
2512  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2513  RelationGetRelationName(rel), child)));
2514  parent = stack->bts_blkno;
2515  parentoffset = stack->bts_offset;
2516 
2517  page = BufferGetPage(pbuf);
2518  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2519  maxoff = PageGetMaxOffsetNumber(page);
2520  leftsibparent = opaque->btpo_prev;
2521 
2522  /*
2523  * _bt_getstackbuf() completes page splits on returned parent buffer when
2524  * required.
2525  *
2526  * In general it's a bad idea for VACUUM to use up more disk space, which
2527  * is why page deletion does not finish incomplete page splits most of the
2528  * time. We allow this limited exception because the risk is much lower,
2529  * and the potential downside of not proceeding is much higher: A single
2530  * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2531  * prevent us from deleting hundreds of empty leaf pages from one level
2532  * down.
2533  */
2534  Assert(!P_INCOMPLETE_SPLIT(opaque));
2535 
2536  if (parentoffset < maxoff)
2537  {
2538  /*
2539  * Child is not the rightmost child in parent, so it's safe to delete
2540  * the subtree whose root/topparent is child page
2541  */
2542  *subtreeparent = pbuf;
2543  *poffset = parentoffset;
2544  return true;
2545  }
2546 
2547  /*
2548  * Child is the rightmost child of parent.
2549  *
2550  * Since it's the rightmost child of parent, deleting the child (or
2551  * deleting the subtree whose root/topparent is the child page) is only
2552  * safe when it's also possible to delete the parent.
2553  */
2554  Assert(parentoffset == maxoff);
2555  if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2556  {
2557  /*
2558  * Child isn't parent's only child, or parent is rightmost on its
2559  * entire level. Definitely cannot delete any pages.
2560  */
2561  _bt_relbuf(rel, pbuf);
2562  return false;
2563  }
2564 
2565  /*
2566  * Now make sure that the parent deletion is itself safe by examining the
2567  * child's grandparent page. Recurse, passing the parent page as the
2568  * child page (child's grandparent is the parent on the next level up). If
2569  * parent deletion is unsafe, then child deletion must also be unsafe (in
2570  * which case caller cannot delete any pages at all).
2571  */
2572  *topparent = parent;
2573  *topparentrightsib = opaque->btpo_next;
2574 
2575  /*
2576  * Release lock on parent before recursing.
2577  *
2578  * It's OK to release page locks on parent before recursive call locks
2579  * grandparent. An internal page can only acquire an entry if the child
2580  * is split, but that cannot happen as long as we still hold a lock on the
2581  * leafbuf page.
2582  */
2583  _bt_relbuf(rel, pbuf);
2584 
2585  /*
2586  * Before recursing, check that the left sibling of parent (if any) is not
2587  * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2588  * parent lock).
2589  *
2590  * Note: We deliberately avoid completing incomplete splits here.
2591  */
2592  if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2593  return false;
2594 
2595  /* Recurse to examine child page's grandparent page */
2596  return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2597  subtreeparent, poffset,
2598  topparent, topparentrightsib);
2599 }
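A worked example of the recursion, using hypothetical block names: suppose the descent path is root -> G -> P -> L, where L is P's only child and P is not G's rightmost child. The outer call finds L rightmost (and only) in P, so it recurses on P; the recursive call finds P non-rightmost in G and succeeds. The result is *subtreeparent = G, *poffset pointing at G's downlink to P, *topparent = P, and *topparentrightsib set to P's right sibling, so the first-stage caller deletes the two-page subtree consisting of P and L.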