PostgreSQL Source Code  git master
nbtpage.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtpage.c
4  * BTree-specific page management code for the Postgres btree access
5  * method.
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  *
11  * IDENTIFICATION
12  * src/backend/access/nbtree/nbtpage.c
13  *
14  * NOTES
15  * Postgres btree pages look like ordinary relation pages. The opaque
16  * data at high addresses includes pointers to left and right siblings
17  * and flag data describing page state. The first page in a btree, page
18  * zero, is special -- it stores meta-information describing the tree.
19  * Pages one and higher store the actual tree data.
20  *
21  *-------------------------------------------------------------------------
22  */
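Illustrative sketch (an editor's addition, not part of nbtpage.c): the NOTES above say that btree-specific state lives in the "special space" at the high end of each page; given a pinned buffer buf, the access pattern used throughout this file looks like this:

    Page		page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* sibling links and state flags live in the special space */
    if (P_ISLEAF(opaque) && !P_IGNORE(opaque))
        elog(DEBUG2, "live leaf page; right sibling is block %u",
             opaque->btpo_next);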
23 #include "postgres.h"
24 
25 #include "access/nbtree.h"
26 #include "access/nbtxlog.h"
27 #include "access/transam.h"
28 #include "access/xlog.h"
29 #include "access/xloginsert.h"
30 #include "miscadmin.h"
31 #include "storage/indexfsm.h"
32 #include "storage/lmgr.h"
33 #include "storage/predicate.h"
34 #include "utils/snapmgr.h"
35 
36 static void _bt_cachemetadata(Relation rel, BTMetaPageData *input);
37 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
38 static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
39 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
40  bool *rightsib_empty);
41 static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
42  BTStack stack, Buffer *topparent, OffsetNumber *topoff,
43  BlockNumber *target, BlockNumber *rightsib);
44 static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
45  TransactionId latestRemovedXid);
46 
47 /*
48  * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
49  */
50 void
51 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
52 {
53  BTMetaPageData *metad;
54  BTPageOpaque metaopaque;
55 
56  _bt_pageinit(page, BLCKSZ);
57 
58  metad = BTPageGetMeta(page);
59  metad->btm_magic = BTREE_MAGIC;
60  metad->btm_version = BTREE_VERSION;
61  metad->btm_root = rootbknum;
62  metad->btm_level = level;
63  metad->btm_fastroot = rootbknum;
64  metad->btm_fastlevel = level;
65  metad->btm_oldest_btpo_xact = InvalidTransactionId;
66  metad->btm_last_cleanup_num_heap_tuples = -1.0;
67 
68  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
69  metaopaque->btpo_flags = BTP_META;
70 
71  /*
72  * Set pd_lower just past the end of the metadata. This is essential,
73  * because without doing so, metadata will be lost if xlog.c compresses
74  * the page.
75  */
76  ((PageHeader) page)->pd_lower =
77  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
78 }
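Illustrative usage (editor's sketch, not part of this file): a caller builds the metapage image in private memory and then writes it out itself, along the lines of btbuildempty() in nbtree.c:

    Page		metapage = (Page) palloc(BLCKSZ);

    /* construct metapage for an empty index: no root page yet, level 0 */
    _bt_initmetapage(metapage, P_NONE, 0);
    /* the caller is responsible for writing the page out and WAL-logging it */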
79 
80 /*
81  * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
82  * 3, the last version that can be updated without broadly affecting
83  * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
84  *
85  * This routine does purely in-memory image upgrade. Caller is
86  * responsible for locking, WAL-logging etc.
87  */
88 void
89 _bt_upgrademetapage(Page page)
90 {
91  BTMetaPageData *metad;
92  BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
93 
94  metad = BTPageGetMeta(page);
95  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
96 
97  /* It must really be a meta page of an upgradable version */
98  Assert(metaopaque->btpo_flags & BTP_META);
99  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
100  Assert(metad->btm_version >= BTREE_MIN_VERSION);
101 
102  /* Set version number and fill extra fields added into version 3 */
103  metad->btm_version = BTREE_NOVAC_VERSION;
104  metad->btm_oldest_btpo_xact = InvalidTransactionId;
105  metad->btm_last_cleanup_num_heap_tuples = -1.0;
106 
107  /* Adjust pd_lower (see _bt_initmetapage() for details) */
108  ((PageHeader) page)->pd_lower =
109  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
110 }
111 
112 /*
113  * Cache metadata from input meta page to rel->rd_amcache.
114  */
115 static void
116 _bt_cachemetadata(Relation rel, BTMetaPageData *input)
117 {
118  BTMetaPageData *cached_metad;
119 
120  /* We assume rel->rd_amcache was already freed by caller */
121  Assert(rel->rd_amcache == NULL);
122  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
123  sizeof(BTMetaPageData));
124 
125  /* Meta page should be of supported version */
126  Assert(input->btm_version >= BTREE_MIN_VERSION &&
127  input->btm_version <= BTREE_VERSION);
128 
129  cached_metad = (BTMetaPageData *) rel->rd_amcache;
130  if (input->btm_version >= BTREE_NOVAC_VERSION)
131  {
132  /* Version with compatible meta-data, no need to upgrade */
133  memcpy(cached_metad, input, sizeof(BTMetaPageData));
134  }
135  else
136  {
137  /*
138  * Upgrade meta-data: copy available information from meta-page and
139  * fill new fields with default values.
140  *
141  * Note that we cannot upgrade to version 4+ without a REINDEX, since
142  * extensive on-disk changes are required.
143  */
144  memcpy(cached_metad, input, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
145  cached_metad->btm_version = BTREE_NOVAC_VERSION;
146  cached_metad->btm_oldest_btpo_xact = InvalidTransactionId;
147  cached_metad->btm_last_cleanup_num_heap_tuples = -1.0;
148  }
149 }
150 
151 /*
152  * Get metadata from share-locked buffer containing metapage, while performing
153  * standard sanity checks. Sanity checks here must match _bt_getroot().
154  */
155 static BTMetaPageData *
156 _bt_getmeta(Relation rel, Buffer metabuf)
157 {
158  Page metapg;
159  BTPageOpaque metaopaque;
160  BTMetaPageData *metad;
161 
162  metapg = BufferGetPage(metabuf);
163  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
164  metad = BTPageGetMeta(metapg);
165 
166  /* sanity-check the metapage */
167  if (!P_ISMETA(metaopaque) ||
168  metad->btm_magic != BTREE_MAGIC)
169  ereport(ERROR,
170  (errcode(ERRCODE_INDEX_CORRUPTED),
171  errmsg("index \"%s\" is not a btree",
172  RelationGetRelationName(rel))));
173 
174  if (metad->btm_version < BTREE_MIN_VERSION ||
175  metad->btm_version > BTREE_VERSION)
176  ereport(ERROR,
177  (errcode(ERRCODE_INDEX_CORRUPTED),
178  errmsg("version mismatch in index \"%s\": file version %d, "
179  "current version %d, minimal supported version %d",
180  RelationGetRelationName(rel),
181  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
182 
183  return metad;
184 }
185 
186 /*
187  * _bt_update_meta_cleanup_info() -- Update cleanup-related information in
188  * the metapage.
189  *
190  * This routine checks whether the provided cleanup-related information
191  * matches what is stored in the metapage. On mismatch, the metapage is overwritten.
192  */
193 void
194 _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
195  float8 numHeapTuples)
196 {
197  Buffer metabuf;
198  Page metapg;
199  BTMetaPageData *metad;
200  bool needsRewrite = false;
201  XLogRecPtr recptr;
202 
203  /* read the metapage and check if it needs rewrite */
204  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
205  metapg = BufferGetPage(metabuf);
206  metad = BTPageGetMeta(metapg);
207 
208  /* outdated version of metapage always needs rewrite */
209  if (metad->btm_version < BTREE_NOVAC_VERSION)
210  needsRewrite = true;
211  else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
212  metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
213  needsRewrite = true;
214 
215  if (!needsRewrite)
216  {
217  _bt_relbuf(rel, metabuf);
218  return;
219  }
220 
221  /* trade in our read lock for a write lock */
222  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
223  LockBuffer(metabuf, BT_WRITE);
224 
225  START_CRIT_SECTION();
226 
227  /* upgrade meta-page if needed */
228  if (metad->btm_version < BTREE_NOVAC_VERSION)
229  _bt_upgrademetapage(metapg);
230 
231  /* update cleanup-related information */
232  metad->btm_oldest_btpo_xact = oldestBtpoXact;
233  metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
234  MarkBufferDirty(metabuf);
235 
236  /* write wal record if needed */
237  if (RelationNeedsWAL(rel))
238  {
239  xl_btree_metadata md;
240 
241  XLogBeginInsert();
242  XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
243 
244  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
245  md.version = metad->btm_version;
246  md.root = metad->btm_root;
247  md.level = metad->btm_level;
248  md.fastroot = metad->btm_fastroot;
249  md.fastlevel = metad->btm_fastlevel;
250  md.oldest_btpo_xact = oldestBtpoXact;
251  md.last_cleanup_num_heap_tuples = numHeapTuples;
252 
253  XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
254 
255  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
256 
257  PageSetLSN(metapg, recptr);
258  }
259 
260  END_CRIT_SECTION();
261  _bt_relbuf(rel, metabuf);
262 }
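Illustrative usage (editor's sketch): VACUUM is the expected caller, recording the oldest btpo.xact among deleted pages and the heap tuple count it saw, roughly as btvacuumscan() in nbtree.c does (variable names here are illustrative):

    _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
                                 info->num_heap_tuples);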
263 
264 /*
265  * _bt_getroot() -- Get the root page of the btree.
266  *
267  * Since the root page can move around the btree file, we have to read
268  * its location from the metadata page, and then read the root page
269  * itself. If no root page exists yet, we have to create one.
270  *
271  * The access type parameter (BT_READ or BT_WRITE) controls whether
272  * a new root page will be created or not. If access = BT_READ,
273  * and no root page exists, we just return InvalidBuffer. For
274  * BT_WRITE, we try to create the root page if it doesn't exist.
275  * NOTE that the returned root page will have only a read lock set
276  * on it even if access = BT_WRITE!
277  *
278  * The returned page is not necessarily the true root --- it could be
279  * a "fast root" (a page that is alone in its level due to deletions).
280  * Also, if the root page is split while we are "in flight" to it,
281  * what we will return is the old root, which is now just the leftmost
282  * page on a probably-not-very-wide level. For most purposes this is
283  * as good as or better than the true root, so we do not bother to
284  * insist on finding the true root. We do, however, guarantee to
285  * return a live (not deleted or half-dead) page.
286  *
287  * On successful return, the root page is pinned and read-locked.
288  * The metadata page is not locked or pinned on exit.
289  */
290 Buffer
291 _bt_getroot(Relation rel, int access)
292 {
293  Buffer metabuf;
294  Page metapg;
295  BTPageOpaque metaopaque;
296  Buffer rootbuf;
297  Page rootpage;
298  BTPageOpaque rootopaque;
299  BlockNumber rootblkno;
300  uint32 rootlevel;
301  BTMetaPageData *metad;
302 
303  /*
304  * Try to use previously-cached metapage data to find the root. This
305  * normally saves one buffer access per index search, which is a very
306  * helpful savings in bufmgr traffic and hence contention.
307  */
308  if (rel->rd_amcache != NULL)
309  {
310  metad = (BTMetaPageData *) rel->rd_amcache;
311  /* We shouldn't have cached it if any of these fail */
312  Assert(metad->btm_magic == BTREE_MAGIC);
313  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
314  Assert(metad->btm_version <= BTREE_VERSION);
315  Assert(metad->btm_root != P_NONE);
316 
317  rootblkno = metad->btm_fastroot;
318  Assert(rootblkno != P_NONE);
319  rootlevel = metad->btm_fastlevel;
320 
321  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
322  rootpage = BufferGetPage(rootbuf);
323  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
324 
325  /*
326  * Since the cache might be stale, we check the page more carefully
327  * here than normal. We *must* check that it's not deleted. If it's
328  * not alone on its level, then we reject too --- this may be overly
329  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
330  * because that's not set in a "fast root".
331  */
332  if (!P_IGNORE(rootopaque) &&
333  rootopaque->btpo.level == rootlevel &&
334  P_LEFTMOST(rootopaque) &&
335  P_RIGHTMOST(rootopaque))
336  {
337  /* OK, accept cached page as the root */
338  return rootbuf;
339  }
340  _bt_relbuf(rel, rootbuf);
341  /* Cache is stale, throw it away */
342  if (rel->rd_amcache)
343  pfree(rel->rd_amcache);
344  rel->rd_amcache = NULL;
345  }
346 
347  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
348  metapg = BufferGetPage(metabuf);
349  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
350  metad = BTPageGetMeta(metapg);
351 
352  /* sanity-check the metapage */
353  if (!P_ISMETA(metaopaque) ||
354  metad->btm_magic != BTREE_MAGIC)
355  ereport(ERROR,
356  (errcode(ERRCODE_INDEX_CORRUPTED),
357  errmsg("index \"%s\" is not a btree",
358  RelationGetRelationName(rel))));
359 
360  if (metad->btm_version < BTREE_MIN_VERSION ||
361  metad->btm_version > BTREE_VERSION)
362  ereport(ERROR,
363  (errcode(ERRCODE_INDEX_CORRUPTED),
364  errmsg("version mismatch in index \"%s\": file version %d, "
365  "current version %d, minimal supported version %d",
366  RelationGetRelationName(rel),
367  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
368 
369  /* if no root page initialized yet, do it */
370  if (metad->btm_root == P_NONE)
371  {
372  /* If access = BT_READ, caller doesn't want us to create root yet */
373  if (access == BT_READ)
374  {
375  _bt_relbuf(rel, metabuf);
376  return InvalidBuffer;
377  }
378 
379  /* trade in our read lock for a write lock */
380  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
381  LockBuffer(metabuf, BT_WRITE);
382 
383  /*
384  * Race condition: if someone else initialized the metadata between
385  * the time we released the read lock and acquired the write lock, we
386  * must avoid doing it again.
387  */
388  if (metad->btm_root != P_NONE)
389  {
390  /*
391  * Metadata initialized by someone else. In order to guarantee no
392  * deadlocks, we have to release the metadata page and start all
393  * over again. (Is that really true? But it's hardly worth trying
394  * to optimize this case.)
395  */
396  _bt_relbuf(rel, metabuf);
397  return _bt_getroot(rel, access);
398  }
399 
400  /*
401  * Get, initialize, write, and leave a lock of the appropriate type on
402  * the new root page. Since this is the first page in the tree, it's
403  * a leaf as well as the root.
404  */
405  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
406  rootblkno = BufferGetBlockNumber(rootbuf);
407  rootpage = BufferGetPage(rootbuf);
408  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
409  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
410  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
411  rootopaque->btpo.level = 0;
412  rootopaque->btpo_cycleid = 0;
413 
414  /* NO ELOG(ERROR) till meta is updated */
415  START_CRIT_SECTION();
416 
417  /* upgrade metapage if needed */
418  if (metad->btm_version < BTREE_NOVAC_VERSION)
419  _bt_upgrademetapage(metapg);
420 
421  metad->btm_root = rootblkno;
422  metad->btm_level = 0;
423  metad->btm_fastroot = rootblkno;
424  metad->btm_fastlevel = 0;
425  metad->btm_oldest_btpo_xact = InvalidTransactionId;
426  metad->btm_last_cleanup_num_heap_tuples = -1.0;
427 
428  MarkBufferDirty(rootbuf);
429  MarkBufferDirty(metabuf);
430 
431  /* XLOG stuff */
432  if (RelationNeedsWAL(rel))
433  {
434  xl_btree_newroot xlrec;
435  XLogRecPtr recptr;
436  xl_btree_metadata md;
437 
438  XLogBeginInsert();
439  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
440  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
441 
442  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
443  md.version = metad->btm_version;
444  md.root = rootblkno;
445  md.level = 0;
446  md.fastroot = rootblkno;
447  md.fastlevel = 0;
448  md.oldest_btpo_xact = InvalidTransactionId;
449  md.last_cleanup_num_heap_tuples = -1.0;
450 
451  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
452 
453  xlrec.rootblk = rootblkno;
454  xlrec.level = 0;
455 
456  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
457 
458  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
459 
460  PageSetLSN(rootpage, recptr);
461  PageSetLSN(metapg, recptr);
462  }
463 
464  END_CRIT_SECTION();
465 
466  /*
467  * swap root write lock for read lock. There is no danger of anyone
468  * else accessing the new root page while it's unlocked, since no one
469  * else knows where it is yet.
470  */
471  LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
472  LockBuffer(rootbuf, BT_READ);
473 
474  /* okay, metadata is correct, release lock on it */
475  _bt_relbuf(rel, metabuf);
476  }
477  else
478  {
479  rootblkno = metad->btm_fastroot;
480  Assert(rootblkno != P_NONE);
481  rootlevel = metad->btm_fastlevel;
482 
483  /*
484  * Cache the metapage data for next time
485  */
486  _bt_cachemetadata(rel, metad);
487 
488  /*
489  * We are done with the metapage; arrange to release it via first
490  * _bt_relandgetbuf call
491  */
492  rootbuf = metabuf;
493 
494  for (;;)
495  {
496  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
497  rootpage = BufferGetPage(rootbuf);
498  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
499 
500  if (!P_IGNORE(rootopaque))
501  break;
502 
503  /* it's dead, Jim. step right one page */
504  if (P_RIGHTMOST(rootopaque))
505  elog(ERROR, "no live root page found in index \"%s\"",
506  RelationGetRelationName(rel));
507  rootblkno = rootopaque->btpo_next;
508  }
509 
510  /* Note: can't check btpo.level on deleted pages */
511  if (rootopaque->btpo.level != rootlevel)
512  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
513  rootblkno, RelationGetRelationName(rel),
514  rootopaque->btpo.level, rootlevel);
515  }
516 
517  /*
518  * By here, we have a pin and read lock on the root page, and no lock set
519  * on the metadata page. Return the root page's buffer.
520  */
521  return rootbuf;
522 }
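Illustrative usage (editor's sketch): a descent routine must cope with the empty-index case under BT_READ, since no root page is created then; compare the top of _bt_search() in nbtsearch.c:

    Buffer		rootbuf = _bt_getroot(rel, BT_READ);

    if (!BufferIsValid(rootbuf))
        return NULL;			/* index is empty; nothing to descend */

    /* ... descend from the (fast) root, which is pinned and read-locked ... */
    _bt_relbuf(rel, rootbuf);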
523 
524 /*
525  * _bt_gettrueroot() -- Get the true root page of the btree.
526  *
527  * This is the same as the BT_READ case of _bt_getroot(), except
528  * we follow the true-root link not the fast-root link.
529  *
530  * By the time we acquire lock on the root page, it might have been split and
531  * not be the true root anymore. This is okay for the present uses of this
532  * routine; we only really need to be able to move up at least one tree level
533  * from whatever non-root page we were at. If we ever do need to lock the
534  * one true root page, we could loop here, re-reading the metapage on each
535  * failure. (Note that it wouldn't do to hold the lock on the metapage while
536  * moving to the root --- that'd deadlock against any concurrent root split.)
537  */
538 Buffer
539 _bt_gettrueroot(Relation rel)
540 {
541  Buffer metabuf;
542  Page metapg;
543  BTPageOpaque metaopaque;
544  Buffer rootbuf;
545  Page rootpage;
546  BTPageOpaque rootopaque;
547  BlockNumber rootblkno;
548  uint32 rootlevel;
549  BTMetaPageData *metad;
550 
551  /*
552  * We don't try to use cached metapage data here, since (a) this path is
553  * not performance-critical, and (b) if we are here it suggests our cache
554  * is out-of-date anyway. In light of point (b), it's probably safest to
555  * actively flush any cached metapage info.
556  */
557  if (rel->rd_amcache)
558  pfree(rel->rd_amcache);
559  rel->rd_amcache = NULL;
560 
561  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
562  metapg = BufferGetPage(metabuf);
563  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
564  metad = BTPageGetMeta(metapg);
565 
566  if (!P_ISMETA(metaopaque) ||
567  metad->btm_magic != BTREE_MAGIC)
568  ereport(ERROR,
569  (errcode(ERRCODE_INDEX_CORRUPTED),
570  errmsg("index \"%s\" is not a btree",
571  RelationGetRelationName(rel))));
572 
573  if (metad->btm_version < BTREE_MIN_VERSION ||
574  metad->btm_version > BTREE_VERSION)
575  ereport(ERROR,
576  (errcode(ERRCODE_INDEX_CORRUPTED),
577  errmsg("version mismatch in index \"%s\": file version %d, "
578  "current version %d, minimal supported version %d",
579  RelationGetRelationName(rel),
580  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
581 
582  /* if no root page initialized yet, fail */
583  if (metad->btm_root == P_NONE)
584  {
585  _bt_relbuf(rel, metabuf);
586  return InvalidBuffer;
587  }
588 
589  rootblkno = metad->btm_root;
590  rootlevel = metad->btm_level;
591 
592  /*
593  * We are done with the metapage; arrange to release it via first
594  * _bt_relandgetbuf call
595  */
596  rootbuf = metabuf;
597 
598  for (;;)
599  {
600  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
601  rootpage = BufferGetPage(rootbuf);
602  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
603 
604  if (!P_IGNORE(rootopaque))
605  break;
606 
607  /* it's dead, Jim. step right one page */
608  if (P_RIGHTMOST(rootopaque))
609  elog(ERROR, "no live root page found in index \"%s\"",
610  RelationGetRelationName(rel));
611  rootblkno = rootopaque->btpo_next;
612  }
613 
614  /* Note: can't check btpo.level on deleted pages */
615  if (rootopaque->btpo.level != rootlevel)
616  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
617  rootblkno, RelationGetRelationName(rel),
618  rootopaque->btpo.level, rootlevel);
619 
620  return rootbuf;
621 }
622 
623 /*
624  * _bt_getrootheight() -- Get the height of the btree search tree.
625  *
626  * We return the level (counting from zero) of the current fast root.
627  * This represents the number of tree levels we'd have to descend through
628  * to start any btree index search.
629  *
630  * This is used by the planner for cost-estimation purposes. Since it's
631  * only an estimate, slightly-stale data is fine, hence we don't worry
632  * about updating previously cached data.
633  */
634 int
635 _bt_getrootheight(Relation rel)
636 {
637  BTMetaPageData *metad;
638 
639  if (rel->rd_amcache == NULL)
640  {
641  Buffer metabuf;
642 
643  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
644  metad = _bt_getmeta(rel, metabuf);
645 
646  /*
647  * If there's no root page yet, _bt_getroot() doesn't expect a cache
648  * to be made, so just stop here and report the index height is zero.
649  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
650  */
651  if (metad->btm_root == P_NONE)
652  {
653  _bt_relbuf(rel, metabuf);
654  return 0;
655  }
656 
657  /*
658  * Cache the metapage data for next time
659  */
660  _bt_cachemetadata(rel, metad);
661  /* We shouldn't have cached it if any of these fail */
662  Assert(metad->btm_magic == BTREE_MAGIC);
663  Assert(metad->btm_version >= BTREE_MIN_VERSION);
664  Assert(metad->btm_fastroot != P_NONE);
665  _bt_relbuf(rel, metabuf);
666  }
667 
668  /* Get cached page */
669  metad = (BTMetaPageData *) rel->rd_amcache;
670 
671  return metad->btm_fastlevel;
672 }
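Illustrative usage (editor's sketch): the planner records this during index-path setup, along the lines of get_relation_info() in plancat.c:

    /* estimate of descent cost: number of levels above the leaf level */
    info->tree_height = _bt_getrootheight(indexRelation);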
673 
674 /*
675  * _bt_heapkeyspace() -- is heap TID being treated as a key?
676  *
677  * This is used to determine the rules that must be used to descend a
678  * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
679  * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
680  * performance when inserting a new BTScanInsert-wise duplicate tuple
681  * among many leaf pages already full of such duplicates.
682  */
683 bool
684 _bt_heapkeyspace(Relation rel)
685 {
686  BTMetaPageData *metad;
687 
688  if (rel->rd_amcache == NULL)
689  {
690  Buffer metabuf;
691 
692  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
693  metad = _bt_getmeta(rel, metabuf);
694 
695  /*
696  * If there's no root page yet, _bt_getroot() doesn't expect a cache
697  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
698  * be changed to allow this case.)
699  */
700  if (metad->btm_root == P_NONE)
701  {
702  uint32 btm_version = metad->btm_version;
703 
704  _bt_relbuf(rel, metabuf);
705  return btm_version > BTREE_NOVAC_VERSION;
706  }
707 
708  /*
709  * Cache the metapage data for next time
710  */
711  _bt_cachemetadata(rel, metad);
712  /* We shouldn't have cached it if any of these fail */
713  Assert(metad->btm_magic == BTREE_MAGIC);
714  Assert(metad->btm_version >= BTREE_MIN_VERSION);
715  Assert(metad->btm_fastroot != P_NONE);
716  _bt_relbuf(rel, metabuf);
717  }
718 
719  /* Get cached page */
720  metad = (BTMetaPageData *) rel->rd_amcache;
721 
722  return metad->btm_version > BTREE_NOVAC_VERSION;
723 }
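Illustrative usage (editor's sketch): insertion scan key construction records this so later comparisons know whether heap TID acts as a tiebreaker column; compare _bt_mkscankey() in nbtutils.c (the variable key is a BTScanInsert being filled in):

    key->heapkeyspace = _bt_heapkeyspace(rel);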
724 
725 /*
726  * _bt_checkpage() -- Verify that a freshly-read page looks sane.
727  */
728 void
729 _bt_checkpage(Relation rel, Buffer buf)
730 {
731  Page page = BufferGetPage(buf);
732 
733  /*
734  * ReadBuffer verifies that every newly-read page passes
735  * PageHeaderIsValid, which means it either contains a reasonably sane
736  * page header or is all-zero. We have to defend against the all-zero
737  * case, however.
738  */
739  if (PageIsNew(page))
740  ereport(ERROR,
741  (errcode(ERRCODE_INDEX_CORRUPTED),
742  errmsg("index \"%s\" contains unexpected zero page at block %u",
743  RelationGetRelationName(rel),
744  BufferGetBlockNumber(buf)),
745  errhint("Please REINDEX it.")));
746 
747  /*
748  * Additionally check that the special area looks sane.
749  */
750  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
751  ereport(ERROR,
752  (errcode(ERRCODE_INDEX_CORRUPTED),
753  errmsg("index \"%s\" contains corrupted page at block %u",
754  RelationGetRelationName(rel),
755  BufferGetBlockNumber(buf)),
756  errhint("Please REINDEX it.")));
757 }
758 
759 /*
760  * Log the reuse of a page from the FSM.
761  */
762 static void
763 _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
764 {
765  xl_btree_reuse_page xlrec_reuse;
766 
767  /*
768  * Note that we don't register the buffer with the record, because this
769  * operation doesn't modify the page. This record only exists to provide a
770  * conflict point for Hot Standby.
771  */
772 
773  /* XLOG stuff */
774  xlrec_reuse.node = rel->rd_node;
775  xlrec_reuse.block = blkno;
776  xlrec_reuse.latestRemovedXid = latestRemovedXid;
777 
778  XLogBeginInsert();
779  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
780 
781  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
782 }
783 
784 /*
785  * _bt_getbuf() -- Get a buffer by block number for read or write.
786  *
787  * blkno == P_NEW means to get an unallocated index page. The page
788  * will be initialized before returning it.
789  *
790  * When this routine returns, the appropriate lock is set on the
791  * requested buffer and its reference count has been incremented
792  * (ie, the buffer is "locked and pinned"). Also, we apply
793  * _bt_checkpage to sanity-check the page (except in P_NEW case).
794  */
795 Buffer
796 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
797 {
798  Buffer buf;
799 
800  if (blkno != P_NEW)
801  {
802  /* Read an existing block of the relation */
803  buf = ReadBuffer(rel, blkno);
804  LockBuffer(buf, access);
805  _bt_checkpage(rel, buf);
806  }
807  else
808  {
809  bool needLock;
810  Page page;
811 
812  Assert(access == BT_WRITE);
813 
814  /*
815  * First see if the FSM knows of any free pages.
816  *
817  * We can't trust the FSM's report unreservedly; we have to check that
818  * the page is still free. (For example, an already-free page could
819  * have been re-used between the time the last VACUUM scanned it and
820  * the time the VACUUM made its FSM updates.)
821  *
822  * In fact, it's worse than that: we can't even assume that it's safe
823  * to take a lock on the reported page. If somebody else has a lock
824  * on it, or even worse our own caller does, we could deadlock. (The
825  * own-caller scenario is actually not improbable. Consider an index
826  * on a serial or timestamp column. Nearly all splits will be at the
827  * rightmost page, so it's entirely likely that _bt_split will call us
828  * while holding a lock on the page most recently acquired from FSM. A
829  * VACUUM running concurrently with the previous split could well have
830  * placed that page back in FSM.)
831  *
832  * To get around that, we ask for only a conditional lock on the
833  * reported page. If we fail, then someone else is using the page,
834  * and we may reasonably assume it's not free. (If we happen to be
835  * wrong, the worst consequence is the page will be lost to use till
836  * the next VACUUM, which is no big problem.)
837  */
838  for (;;)
839  {
840  blkno = GetFreeIndexPage(rel);
841  if (blkno == InvalidBlockNumber)
842  break;
843  buf = ReadBuffer(rel, blkno);
844  if (ConditionalLockBuffer(buf))
845  {
846  page = BufferGetPage(buf);
847  if (_bt_page_recyclable(page))
848  {
849  /*
850  * If we are generating WAL for Hot Standby then create a
851  * WAL record that will allow us to conflict with queries
852  * running on standby, in case they have snapshots older
853  * than btpo.xact. This can only apply if the page does
854  * have a valid btpo.xact value, ie not if it's new. (We
855  * must check that because an all-zero page has no special
856  * space.)
857  */
858  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
859  !PageIsNew(page))
860  {
861  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
862 
863  _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
864  }
865 
866  /* Okay to use page. Re-initialize and return it */
867  _bt_pageinit(page, BufferGetPageSize(buf));
868  return buf;
869  }
870  elog(DEBUG2, "FSM returned nonrecyclable page");
871  _bt_relbuf(rel, buf);
872  }
873  else
874  {
875  elog(DEBUG2, "FSM returned nonlockable page");
876  /* couldn't get lock, so just drop pin */
877  ReleaseBuffer(buf);
878  }
879  }
880 
881  /*
882  * Extend the relation by one page.
883  *
884  * We have to use a lock to ensure no one else is extending the rel at
885  * the same time, else we will both try to initialize the same new
886  * page. We can skip locking for new or temp relations, however,
887  * since no one else could be accessing them.
888  */
889  needLock = !RELATION_IS_LOCAL(rel);
890 
891  if (needLock)
892  LockRelationForExtension(rel, ExclusiveLock);
893 
894  buf = ReadBuffer(rel, P_NEW);
895 
896  /* Acquire buffer lock on new page */
897  LockBuffer(buf, BT_WRITE);
898 
899  /*
900  * Release the file-extension lock; it's now OK for someone else to
901  * extend the relation some more. Note that we cannot release this
902  * lock before we have buffer lock on the new page, or we risk a race
903  * condition against btvacuumscan --- see comments therein.
904  */
905  if (needLock)
906  UnlockRelationForExtension(rel, ExclusiveLock);
907 
908  /* Initialize the new page before returning it */
909  page = BufferGetPage(buf);
910  Assert(PageIsNew(page));
911  _bt_pageinit(page, BufferGetPageSize(buf));
912  }
913 
914  /* ref count and lock type are correct */
915  return buf;
916 }
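Illustrative usage (editor's sketch): the canonical read pattern pairs _bt_getbuf() with _bt_relbuf(), while P_NEW yields a write-locked, freshly initialized page:

    Buffer		buf;
    Page		page;

    buf = _bt_getbuf(rel, blkno, BT_READ);	/* pinned + read-locked */
    page = BufferGetPage(buf);
    /* ... examine the page while the lock is held ... */
    _bt_relbuf(rel, buf);					/* drops lock and pin */

    /* allocating: always BT_WRITE; page comes back already _bt_pageinit'd */
    buf = _bt_getbuf(rel, P_NEW, BT_WRITE);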
917 
918 /*
919  * _bt_relandgetbuf() -- release a locked buffer and get another one.
920  *
921  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
922  * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
923  * then it reduces to just _bt_getbuf; allowing this case simplifies some
924  * callers.
925  *
926  * The original motivation for using this was to avoid two entries to the
927  * bufmgr when one would do. However, now it's mainly just a notational
928  * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
929  * is when the target page is the same one already in the buffer.
930  */
931 Buffer
932 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
933 {
934  Buffer buf;
935 
936  Assert(blkno != P_NEW);
937  if (BufferIsValid(obuf))
938  LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
939  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
940  LockBuffer(buf, access);
941  _bt_checkpage(rel, buf);
942  return buf;
943 }
944 
945 /*
946  * _bt_relbuf() -- release a locked buffer.
947  *
948  * Lock and pin (refcount) are both dropped.
949  */
950 void
951 _bt_relbuf(Relation rel, Buffer buf)
952 {
953  UnlockReleaseBuffer(buf);
954 }
955 
956 /*
957  * _bt_pageinit() -- Initialize a new page.
958  *
959  * On return, the page header is initialized; data space is empty;
960  * special space is zeroed out.
961  */
962 void
963 _bt_pageinit(Page page, Size size)
964 {
965  PageInit(page, size, sizeof(BTPageOpaqueData));
966 }
967 
968 /*
969  * _bt_page_recyclable() -- Is an existing page recyclable?
970  *
971  * This exists to make sure _bt_getbuf and btvacuumscan have the same
972  * policy about whether a page is safe to re-use. But note that _bt_getbuf
973  * knows enough to distinguish the PageIsNew condition from the other one.
974  * At some point it might be appropriate to redesign this to have a three-way
975  * result value.
976  */
977 bool
978 _bt_page_recyclable(Page page)
979 {
980  BTPageOpaque opaque;
981 
982  /*
983  * It's possible to find an all-zeroes page in an index --- for example, a
984  * backend might successfully extend the relation one page and then crash
985  * before it is able to make a WAL entry for adding the page. If we find a
986  * zeroed page then reclaim it.
987  */
988  if (PageIsNew(page))
989  return true;
990 
991  /*
992  * Otherwise, recycle if deleted and too old to have any processes
993  * interested in it.
994  */
995  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
996  if (P_ISDELETED(opaque) &&
997  TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
998  return true;
999  return false;
1000 }
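Illustrative usage (editor's sketch): btvacuumpage() in nbtree.c applies the same test and, when it passes, hands the page to the free space map (the counter shown is illustrative):

    if (_bt_page_recyclable(page))
    {
        /* deleted and safely old (or never initialized): reusable */
        RecordFreeIndexPage(rel, blkno);
        vstate->totFreePages++;
    }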
1001 
1002 /*
1003  * Delete item(s) from a btree page during VACUUM.
1004  *
1005  * This must only be used for deleting leaf items. Deleting an item on a
1006  * non-leaf page has to be done as part of an atomic action that includes
1007  * deleting the page it points to.
1008  *
1009  * This routine assumes that the caller has pinned and locked the buffer.
1010  * Also, the given itemnos *must* appear in increasing order in the array.
1011  *
1012  * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby
1013  * we need to be able to pin all of the blocks in the btree in physical
1014  * order when replaying the effects of a VACUUM, just as we do for the
1015  * original VACUUM itself. lastBlockVacuumed allows us to tell whether an
1016  * intermediate range of blocks has had no changes at all by VACUUM,
1017  * and so must be scanned anyway during replay. We always write a WAL record
1018  * for the last block in the index, whether or not it contained any items
1019  * to be removed. This allows us to scan right up to end of index to
1020  * ensure correct locking.
1021  */
1022 void
1023 _bt_delitems_vacuum(Relation rel, Buffer buf,
1024  OffsetNumber *itemnos, int nitems,
1025  BlockNumber lastBlockVacuumed)
1026 {
1027  Page page = BufferGetPage(buf);
1028  BTPageOpaque opaque;
1029 
1030  /* No ereport(ERROR) until changes are logged */
1031  START_CRIT_SECTION();
1032 
1033  /* Fix the page */
1034  if (nitems > 0)
1035  PageIndexMultiDelete(page, itemnos, nitems);
1036 
1037  /*
1038  * We can clear the vacuum cycle ID since this page has certainly been
1039  * processed by the current vacuum scan.
1040  */
1041  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1042  opaque->btpo_cycleid = 0;
1043 
1044  /*
1045  * Mark the page as not containing any LP_DEAD items. This is not
1046  * certainly true (there might be some that have recently been marked, but
1047  * weren't included in our target-item list), but it will almost always be
1048  * true and it doesn't seem worth an additional page scan to check it.
1049  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
1050  */
1051  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1052 
1053  MarkBufferDirty(buf);
1054 
1055  /* XLOG stuff */
1056  if (RelationNeedsWAL(rel))
1057  {
1058  XLogRecPtr recptr;
1059  xl_btree_vacuum xlrec_vacuum;
1060 
1061  xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
1062 
1063  XLogBeginInsert();
1064  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1065  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1066 
1067  /*
1068  * The target-offsets array is not in the buffer, but pretend that it
1069  * is. When XLogInsert stores the whole buffer, the offsets array
1070  * need not be stored too.
1071  */
1072  if (nitems > 0)
1073  XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
1074 
1075  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1076 
1077  PageSetLSN(page, recptr);
1078  }
1079 
1080  END_CRIT_SECTION();
1081 }
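Illustrative usage (editor's sketch): VACUUM's page callback collects deletable offsets in ascending order and makes one call per page, roughly as btvacuumpage() does:

    if (ndeletable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            vstate->lastBlockVacuumed);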
1082 
1083 /*
1084  * Delete item(s) from a btree page during single-page cleanup.
1085  *
1086  * As above, must only be used on leaf pages.
1087  *
1088  * This routine assumes that the caller has pinned and locked the buffer.
1089  * Also, the given itemnos *must* appear in increasing order in the array.
1090  *
1091  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1092  * the page, but the WAL logging considerations are quite different. See
1093  * comments for _bt_delitems_vacuum.
1094  */
1095 void
1096 _bt_delitems_delete(Relation rel, Buffer buf,
1097  OffsetNumber *itemnos, int nitems,
1098  Relation heapRel)
1099 {
1100  Page page = BufferGetPage(buf);
1101  BTPageOpaque opaque;
1102  TransactionId latestRemovedXid = InvalidTransactionId;
1103 
1104  /* Shouldn't be called unless there's something to do */
1105  Assert(nitems > 0);
1106 
1107  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
1108  latestRemovedXid =
1109  index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
1110  itemnos, nitems);
1111 
1112  /* No ereport(ERROR) until changes are logged */
1113  START_CRIT_SECTION();
1114 
1115  /* Fix the page */
1116  PageIndexMultiDelete(page, itemnos, nitems);
1117 
1118  /*
1119  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
1120  * because this is not called by VACUUM.
1121  */
1122 
1123  /*
1124  * Mark the page as not containing any LP_DEAD items. This is not
1125  * certainly true (there might be some that have recently been marked, but
1126  * weren't included in our target-item list), but it will almost always be
1127  * true and it doesn't seem worth an additional page scan to check it.
1128  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
1129  */
1130  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1131  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1132 
1133  MarkBufferDirty(buf);
1134 
1135  /* XLOG stuff */
1136  if (RelationNeedsWAL(rel))
1137  {
1138  XLogRecPtr recptr;
1139  xl_btree_delete xlrec_delete;
1140 
1141  xlrec_delete.latestRemovedXid = latestRemovedXid;
1142  xlrec_delete.nitems = nitems;
1143 
1144  XLogBeginInsert();
1145  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1146  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1147 
1148  /*
1149  * We need the target-offsets array whether or not we store the whole
1150  * buffer, to allow us to find the latestRemovedXid on a standby
1151  * server.
1152  */
1153  XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
1154 
1155  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1156 
1157  PageSetLSN(page, recptr);
1158  }
1159 
1160  END_CRIT_SECTION();
1161 }
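Illustrative usage (editor's sketch): opportunistic cleanup during insertion gathers the LP_DEAD items on a leaf page and deletes them in one call, roughly as _bt_vacuum_one_page() in nbtinsert.c does:

    if (ndeletable > 0)
        _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);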
1162 
1163 /*
1164  * Returns true if the given block has the half-dead flag set.
1165  */
1166 static bool
1167 _bt_is_page_halfdead(Relation rel, BlockNumber blk)
1168 {
1169  Buffer buf;
1170  Page page;
1171  BTPageOpaque opaque;
1172  bool result;
1173 
1174  buf = _bt_getbuf(rel, blk, BT_READ);
1175  page = BufferGetPage(buf);
1176  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1177 
1178  result = P_ISHALFDEAD(opaque);
1179  _bt_relbuf(rel, buf);
1180 
1181  return result;
1182 }
1183 
1184 /*
1185  * Subroutine to find the parent of the branch we're deleting. This climbs
1186  * up the tree until it finds a page with more than one child, i.e. a page
1187  * that will not be totally emptied by the deletion. The chain of pages below
1188  * it, with one downlink each, will form the branch that we need to delete.
1189  *
1190  * If we cannot remove the downlink from the parent, because it's the
1191  * rightmost entry, returns false. On success, *topparent and *topoff are set
1192  * to the buffer holding the parent, and the offset of the downlink in it.
1193  * *topparent is write-locked, the caller is responsible for releasing it when
1194  * done. *target is set to the topmost page in the branch to-be-deleted, i.e.
1195  * the page whose downlink *topparent / *topoff point to, and *rightsib to its
1196  * right sibling.
1197  *
1198  * "child" is the leaf page we wish to delete, and "stack" is a search stack
1199  * leading to it (it actually leads to the leftmost leaf page with a high key
1200  * matching that of the page to be deleted in !heapkeyspace indexes). Note
1201  * that we will update the stack entry(s) to reflect current downlink
1202  * positions --- this is essentially the same as the corresponding step of
1203  * splitting, and is not expected to affect caller. The caller should
1204  * initialize *target and *rightsib to the leaf page and its right sibling.
1205  *
1206  * Note: it's OK to release page locks on any internal pages between the leaf
1207  * and *topparent, because a safe deletion can't become unsafe due to
1208  * concurrent activity. An internal page can only acquire an entry if the
1209  * child is split, but that cannot happen as long as we hold a lock on the
1210  * leaf.
1211  */
1212 static bool
1213 _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1214  Buffer *topparent, OffsetNumber *topoff,
1215  BlockNumber *target, BlockNumber *rightsib)
1216 {
1217  BlockNumber parent;
1218  OffsetNumber poffset,
1219  maxoff;
1220  Buffer pbuf;
1221  Page page;
1222  BTPageOpaque opaque;
1223  BlockNumber leftsib;
1224 
1225  /*
1226  * Locate the downlink of "child" in the parent, updating the stack entry
1227  * if needed. This is how !heapkeyspace indexes deal with having
1228  * non-unique high keys in leaf level pages. Even heapkeyspace indexes
1229  * can have a stale stack due to insertions into the parent.
1230  */
1231  stack->bts_btentry = child;
1232  pbuf = _bt_getstackbuf(rel, stack);
1233  if (pbuf == InvalidBuffer)
1234  elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
1235  RelationGetRelationName(rel), child);
1236  parent = stack->bts_blkno;
1237  poffset = stack->bts_offset;
1238 
1239  page = BufferGetPage(pbuf);
1240  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1241  maxoff = PageGetMaxOffsetNumber(page);
1242 
1243  /*
1244  * If the target is the rightmost child of its parent, then we can't
1245  * delete, unless it's also the only child.
1246  */
1247  if (poffset >= maxoff)
1248  {
1249  /* It's the rightmost child... */
1250  if (poffset == P_FIRSTDATAKEY(opaque))
1251  {
1252  /*
1253  * It's the only child, so safe if the parent would itself be removable.
1254  * We have to check the parent itself, and then recurse to test
1255  * the conditions at the parent's parent.
1256  */
1257  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1258  P_INCOMPLETE_SPLIT(opaque))
1259  {
1260  _bt_relbuf(rel, pbuf);
1261  return false;
1262  }
1263 
1264  *target = parent;
1265  *rightsib = opaque->btpo_next;
1266  leftsib = opaque->btpo_prev;
1267 
1268  _bt_relbuf(rel, pbuf);
1269 
1270  /*
1271  * Like in _bt_pagedel, check that the left sibling is not marked
1272  * with INCOMPLETE_SPLIT flag. That would mean that there is no
1273  * downlink to the page to be deleted, and the page deletion
1274  * algorithm isn't prepared to handle that.
1275  */
1276  if (leftsib != P_NONE)
1277  {
1278  Buffer lbuf;
1279  Page lpage;
1280  BTPageOpaque lopaque;
1281 
1282  lbuf = _bt_getbuf(rel, leftsib, BT_READ);
1283  lpage = BufferGetPage(lbuf);
1284  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
1285 
1286  /*
1287  * If the left sibling was concurrently split, so that its
1288  * next-pointer doesn't point to the current page anymore, the
1289  * split that created the current page must be completed. (We
1290  * don't allow splitting an incompletely split page again
1291  * until the previous split has been completed)
1292  */
1293  if (lopaque->btpo_next == parent &&
1294  P_INCOMPLETE_SPLIT(lopaque))
1295  {
1296  _bt_relbuf(rel, lbuf);
1297  return false;
1298  }
1299  _bt_relbuf(rel, lbuf);
1300  }
1301 
1302  return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
1303  topparent, topoff, target, rightsib);
1304  }
1305  else
1306  {
1307  /* Unsafe to delete */
1308  _bt_relbuf(rel, pbuf);
1309  return false;
1310  }
1311  }
1312  else
1313  {
1314  /* Not rightmost child, so safe to delete */
1315  *topparent = pbuf;
1316  *topoff = poffset;
1317  return true;
1318  }
1319 }
1320 
1321 /*
1322  * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
1323  *
1324  * This action unlinks the page from the b-tree structure, removing all
1325  * pointers leading to it --- but not touching its own left and right links.
1326  * The page cannot be physically reclaimed right away, since other processes
1327  * may currently be trying to follow links leading to the page; they have to
1328  * be allowed to use its right-link to recover. See nbtree/README.
1329  *
1330  * On entry, the target buffer must be pinned and locked (either read or write
1331  * lock is OK). This lock and pin will be dropped before exiting.
1332  *
1333  * Returns the number of pages successfully deleted (zero if page cannot
1334  * be deleted now; could be more than one if parent or sibling pages were
1335  * deleted too).
1336  *
1337  * NOTE: this leaks memory. Rather than trying to clean up everything
1338  * carefully, it's better to run it in a temp context that can be reset
1339  * frequently.
1340  */
1341 int
1342 _bt_pagedel(Relation rel, Buffer buf)
1343 {
1344  int ndeleted = 0;
1345  BlockNumber rightsib;
1346  bool rightsib_empty;
1347  Page page;
1348  BTPageOpaque opaque;
1349 
1350  /*
1351  * "stack" is a search stack leading (approximately) to the target page.
1352  * It is initially NULL, but when iterating, we keep it to avoid
1353  * duplicated search effort.
1354  *
1355  * Also, when "stack" is not NULL, we have already checked that the
1356  * current page is not the right half of an incomplete split, i.e. the
1357  * left sibling does not have its INCOMPLETE_SPLIT flag set.
1358  */
1359  BTStack stack = NULL;
1360 
1361  for (;;)
1362  {
1363  page = BufferGetPage(buf);
1364  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1365 
1366  /*
1367  * Internal pages are never deleted directly, only as part of deleting
1368  * the whole branch all the way down to leaf level.
1369  */
1370  if (!P_ISLEAF(opaque))
1371  {
1372  /*
1373  * Pre-9.4 page deletion only marked internal pages as half-dead,
1374  * but now we only use that flag on leaf pages. The old algorithm
1375  * was never supposed to leave half-dead pages in the tree, it was
1376  * just a transient state, but it was nevertheless possible in
1377  * error scenarios. We don't know how to deal with them here. They
1378  * are harmless as far as searches are concerned, but inserts
1379  * into the deleted keyspace could add out-of-order downlinks in
1380  * the upper levels. Log a notice, hopefully the admin will notice
1381  * and reindex.
1382  */
1383  if (P_ISHALFDEAD(opaque))
1384  ereport(LOG,
1385  (errcode(ERRCODE_INDEX_CORRUPTED),
1386  errmsg("index \"%s\" contains a half-dead internal page",
1387  RelationGetRelationName(rel)),
1388  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1389  _bt_relbuf(rel, buf);
1390  return ndeleted;
1391  }
1392 
1393  /*
1394  * We can never delete rightmost pages nor root pages. While at it,
1395  * check that page is not already deleted and is empty.
1396  *
1397  * To keep the algorithm simple, we also never delete an incompletely
1398  * split page (they should be rare enough that this doesn't make any
1399  * meaningful difference to disk usage):
1400  *
1401  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1402  * left half of an incomplete split, but ensuring that it's not the
1403  * right half is more complicated. For that, we have to check that
1404  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
1405  * the first iteration, we temporarily release the lock on the current
1406  * page, and check the left sibling and also construct a search stack
1407  * to it. On subsequent iterations, we know we stepped right from a page
1408  * that passed these tests, so it's OK.
1409  */
1410  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
1411  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1412  P_INCOMPLETE_SPLIT(opaque))
1413  {
1414  /* Should never fail to delete a half-dead page */
1415  Assert(!P_ISHALFDEAD(opaque));
1416 
1417  _bt_relbuf(rel, buf);
1418  return ndeleted;
1419  }
1420 
1421  /*
1422  * First, remove downlink pointing to the page (or a parent of the
1423  * page, if we are going to delete a taller branch), and mark the page
1424  * as half-dead.
1425  */
1426  if (!P_ISHALFDEAD(opaque))
1427  {
1428  /*
1429  * We need an approximate pointer to the page's parent page. We
1430  * use a variant of the standard search mechanism to search for
1431  * the page's high key; this will give us a link to either the
1432  * current parent or someplace to its left (if there are multiple
1433  * equal high keys, which is possible with !heapkeyspace indexes).
1434  *
1435  * Also check if this is the right-half of an incomplete split
1436  * (see comment above).
1437  */
1438  if (!stack)
1439  {
1440  BTScanInsert itup_key;
1441  ItemId itemid;
1442  IndexTuple targetkey;
1443  Buffer lbuf;
1444  BlockNumber leftsib;
1445 
1446  itemid = PageGetItemId(page, P_HIKEY);
1447  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1448 
1449  leftsib = opaque->btpo_prev;
1450 
1451  /*
1452  * To avoid deadlocks, we'd better drop the leaf page lock
1453  * before going further.
1454  */
1455  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1456 
1457  /*
1458  * Fetch the left sibling, to check that it's not marked with
1459  * INCOMPLETE_SPLIT flag. That would mean that the page
1460  * to-be-deleted doesn't have a downlink, and the page
1461  * deletion algorithm isn't prepared to handle that.
1462  */
1463  if (!P_LEFTMOST(opaque))
1464  {
1465  BTPageOpaque lopaque;
1466  Page lpage;
1467 
1468  lbuf = _bt_getbuf(rel, leftsib, BT_READ);
1469  lpage = BufferGetPage(lbuf);
1470  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
1471 
1472  /*
1473  * If the left sibling is split again by another backend,
1474  * after we released the lock, we know that the first
1475  * split must have finished, because we don't allow an
1476  * incompletely-split page to be split again. So we don't
1477  * need to walk right here.
1478  */
1479  if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
1480  P_INCOMPLETE_SPLIT(lopaque))
1481  {
1482  ReleaseBuffer(buf);
1483  _bt_relbuf(rel, lbuf);
1484  return ndeleted;
1485  }
1486  _bt_relbuf(rel, lbuf);
1487  }
1488 
1489  /* we need an insertion scan key for the search, so build one */
1490  itup_key = _bt_mkscankey(rel, targetkey);
1491  /* find the leftmost leaf page with matching pivot/high key */
1492  itup_key->pivotsearch = true;
1493  stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
1494  /* don't need a lock or second pin on the page */
1495  _bt_relbuf(rel, lbuf);
1496 
1497  /*
1498  * Re-lock the leaf page, and start over, to re-check that the
1499  * page can still be deleted.
1500  */
1501  LockBuffer(buf, BT_WRITE);
1502  continue;
1503  }
1504 
1505  if (!_bt_mark_page_halfdead(rel, buf, stack))
1506  {
1507  _bt_relbuf(rel, buf);
1508  return ndeleted;
1509  }
1510  }
1511 
1512  /*
1513  * Then unlink it from its siblings. Each call to
1514  * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
1515  * making it shallower. Iterate until the leaf page is gone.
1516  */
1517  rightsib_empty = false;
1518  while (P_ISHALFDEAD(opaque))
1519  {
1520  /* will check for interrupts, once lock is released */
1521  if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
1522  {
1523  /* _bt_unlink_halfdead_page already released buffer */
1524  return ndeleted;
1525  }
1526  ndeleted++;
1527  }
1528 
1529  rightsib = opaque->btpo_next;
1530 
1531  _bt_relbuf(rel, buf);
1532 
1533  /*
1534  * Check here, as calling loops will have locks held, preventing
1535  * interrupts from being processed.
1536  */
1537  CHECK_FOR_INTERRUPTS();
1538 
1539  /*
1540  * The page has now been deleted. If its right sibling is completely
1541  * empty, it's possible that the reason we haven't deleted it earlier
1542  * is that it was the rightmost child of the parent. Now that we
1543  * removed the downlink for this page, the right sibling might now be
1544  * the only child of the parent, and could be removed. It would be
1545  * picked up by the next vacuum anyway, but might as well try to
1546  * remove it now, so loop back to process the right sibling.
1547  */
1548  if (!rightsib_empty)
1549  break;
1550 
1551  buf = _bt_getbuf(rel, rightsib, BT_WRITE);
1552  }
1553 
1554  return ndeleted;
1555 }
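Illustrative usage (editor's sketch): because of the memory-leak caveat above, btvacuumpage() runs this in a short-lived memory context that it resets for each call:

    /* run pagedel in a temp context to avoid memory leakage */
    MemoryContextReset(vstate->pagedelcontext);
    oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

    ndel = _bt_pagedel(rel, buf);	/* releases our pin and lock on buf */

    MemoryContextSwitchTo(oldcontext);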
1556 
1557 /*
1558  * First stage of page deletion. Remove the downlink to the top of the
1559  * branch being deleted, and mark the leaf page as half-dead.
1560  */
1561 static bool
1562 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1563 {
1564  BlockNumber leafblkno;
1565  BlockNumber leafrightsib;
1566  BlockNumber target;
1567  BlockNumber rightsib;
1568  ItemId itemid;
1569  Page page;
1570  BTPageOpaque opaque;
1571  Buffer topparent;
1572  OffsetNumber topoff;
1573  OffsetNumber nextoffset;
1574  IndexTuple itup;
1575  IndexTupleData trunctuple;
1576 
1577  page = BufferGetPage(leafbuf);
1578  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1579 
1580  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) &&
1581  !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) &&
1582  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1583 
1584  /*
1585  * Save info about the leaf page.
1586  */
1587  leafblkno = BufferGetBlockNumber(leafbuf);
1588  leafrightsib = opaque->btpo_next;
1589 
1590  /*
1591  * Before attempting to lock the parent page, check that the right sibling
1592  * is not in half-dead state. A half-dead right sibling would have no
1593  * downlink in the parent, which would be highly confusing later when we
1594  * delete the downlink that follows the current page's downlink. (I
1595  * believe the deletion would work correctly, but it would fail the
1596  * cross-check we make that the following downlink points to the right
1597  * sibling of the delete page.)
1598  */
1599  if (_bt_is_page_halfdead(rel, leafrightsib))
1600  {
1601  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
1602  leafblkno, leafrightsib);
1603  return false;
1604  }
1605 
1606  /*
1607  * We cannot delete a page that is the rightmost child of its immediate
1608  * parent, unless it is the only child --- in which case the parent has to
1609  * be deleted too, and the same condition applies recursively to it. We
1610  * have to check this condition all the way up before trying to delete,
1611  * and lock the final parent of the to-be-deleted subtree.
1612  *
1613  * However, we won't need to repeat the above _bt_is_page_halfdead() check
1614  * for parent/ancestor pages because of the rightmost restriction. The
1615  * leaf check will apply to a right "cousin" leaf page rather than a
1616  * simple right sibling leaf page in cases where we actually go on to
1617  * perform internal page deletion. The right cousin leaf page is
1618  * representative of the left edge of the subtree to the right of the
1619  * to-be-deleted subtree as a whole. (Besides, internal pages are never
1620  * marked half-dead, so it isn't even possible to directly assess if an
1621  * internal page is part of some other to-be-deleted subtree.)
1622  */
1623  rightsib = leafrightsib;
1624  target = leafblkno;
1625  if (!_bt_lock_branch_parent(rel, leafblkno, stack,
1626  &topparent, &topoff, &target, &rightsib))
1627  return false;
1628 
1629  /*
1630  * Check that the parent-page index items we're about to delete/overwrite
1631  * contain what we expect. This can fail if the index has become corrupt
1632  * for some reason. We want to throw any error before entering the
1633  * critical section --- otherwise it'd be a PANIC.
1634  *
1635  * The test on the target item is just an Assert because
1636  * _bt_lock_branch_parent should have guaranteed it has the expected
1637  * contents. The test on the next-child downlink is known to sometimes
1638  * fail in the field, though.
1639  */
1640  page = BufferGetPage(topparent);
1641  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1642 
1643 #ifdef USE_ASSERT_CHECKING
1644  itemid = PageGetItemId(page, topoff);
1645  itup = (IndexTuple) PageGetItem(page, itemid);
1646  Assert(BTreeInnerTupleGetDownLink(itup) == target);
1647 #endif
1648 
1649  nextoffset = OffsetNumberNext(topoff);
1650  itemid = PageGetItemId(page, nextoffset);
1651  itup = (IndexTuple) PageGetItem(page, itemid);
1652  if (BTreeInnerTupleGetDownLink(itup) != rightsib)
1653  elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1654  rightsib, target, BTreeInnerTupleGetDownLink(itup),
1655  BufferGetBlockNumber(topparent), RelationGetRelationName(rel));
1656 
1657  /*
1658  * Any insert which would have gone on the leaf block will now go to its
1659  * right sibling.
1660  */
1661  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
1662 
1663  /* No ereport(ERROR) until changes are logged */
1664  START_CRIT_SECTION();
1665 
1666  /*
1667  * Update parent. The normal case is a tad tricky because we want to
1668  * delete the target's downlink and the *following* key. Easiest way is
1669  * to copy the right sibling's downlink over the target downlink, and then
1670  * delete the following item.
1671  */
1672  page = BufferGetPage(topparent);
1673  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1674 
1675  itemid = PageGetItemId(page, topoff);
1676  itup = (IndexTuple) PageGetItem(page, itemid);
1677  BTreeInnerTupleSetDownLink(itup, rightsib);
1678 
1679  nextoffset = OffsetNumberNext(topoff);
1680  PageIndexTupleDelete(page, nextoffset);
1681 
1682  /*
1683  * Mark the leaf page as half-dead, and stamp it with a pointer to the
1684  * highest internal page in the branch we're deleting. We use the tid of
1685  * the high key to store it.
1686  */
1687  page = BufferGetPage(leafbuf);
1688  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1689  opaque->btpo_flags |= BTP_HALF_DEAD;
1690 
1691  PageIndexTupleDelete(page, P_HIKEY);
1692  Assert(PageGetMaxOffsetNumber(page) == 0);
1693  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
1694  trunctuple.t_info = sizeof(IndexTupleData);
1695  if (target != leafblkno)
1696  BTreeTupleSetTopParent(&trunctuple, target);
1697  else
1698  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
1699 
1700  if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
1701  false, false) == InvalidOffsetNumber)
1702  elog(ERROR, "could not add dummy high key to half-dead page");
1703 
1704  /* Must mark buffers dirty before XLogInsert */
1705  MarkBufferDirty(topparent);
1706  MarkBufferDirty(leafbuf);
1707 
1708  /* XLOG stuff */
1709  if (RelationNeedsWAL(rel))
1710  {
1711  xl_btree_mark_page_halfdead xlrec;
1712  XLogRecPtr recptr;
1713 
1714  xlrec.poffset = topoff;
1715  xlrec.leafblk = leafblkno;
1716  if (target != leafblkno)
1717  xlrec.topparent = target;
1718  else
1719  xlrec.topparent = InvalidBlockNumber;
1720 
1721  XLogBeginInsert();
1722  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
1723  XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);
1724 
1725  page = BufferGetPage(leafbuf);
1726  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1727  xlrec.leftblk = opaque->btpo_prev;
1728  xlrec.rightblk = opaque->btpo_next;
1729 
1729 
1730  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
1731 
1732  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
1733 
1734  page = BufferGetPage(topparent);
1735  PageSetLSN(page, recptr);
1736  page = BufferGetPage(leafbuf);
1737  PageSetLSN(page, recptr);
1738  }
1739 
1740  END_CRIT_SECTION();
1741 
1742  _bt_relbuf(rel, topparent);
1743  return true;
1744 }
1745 
1746 /*
1747  * Unlink a page in a branch of half-dead pages from its siblings.
1748  *
1749  * If the leaf page still has a downlink pointing to it, unlinks the highest
1750  * parent in the to-be-deleted branch instead of the leaf page. To get rid
1751  * of the whole branch, including the leaf page itself, iterate until the
1752  * leaf page is deleted.
1753  *
1754  * Returns 'false' if the page could not be unlinked (shouldn't happen).
1755  * If the (new) right sibling of the page is empty, *rightsib_empty is set
1756  * to true.
1757  *
1758  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
1759  * On success exit, we'll be holding pin and write lock. On failure exit,
1760  * we'll release both pin and lock before returning (we define it that way
1761  * to avoid having to reacquire a lock we already released).
1762  */
1763 static bool
1764 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
1765 {
1766  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
1767  BlockNumber leafleftsib;
1768  BlockNumber leafrightsib;
1769  BlockNumber target;
1770  BlockNumber leftsib;
1771  BlockNumber rightsib;
1772  Buffer lbuf = InvalidBuffer;
1773  Buffer buf;
1774  Buffer rbuf;
1775  Buffer metabuf = InvalidBuffer;
1776  Page metapg = NULL;
1777  BTMetaPageData *metad = NULL;
1778  ItemId itemid;
1779  Page page;
1780  BTPageOpaque opaque;
1781  bool rightsib_is_rightmost;
1782  int targetlevel;
1783  IndexTuple leafhikey;
1784  BlockNumber nextchild;
1785 
1786  page = BufferGetPage(leafbuf);
1787  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1788 
1789  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
1790 
1791  /*
1792  * Remember some information about the leaf page.
1793  */
1794  itemid = PageGetItemId(page, P_HIKEY);
1795  leafhikey = (IndexTuple) PageGetItem(page, itemid);
1796  leafleftsib = opaque->btpo_prev;
1797  leafrightsib = opaque->btpo_next;
1798 
1799  LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
1800 
1801  /*
1802  * Check here, as calling loops will have locks held, preventing
1803  * interrupts from being processed.
1804  */
1805  CHECK_FOR_INTERRUPTS();
1806 
1807  /*
1808  * If the leaf page still has a parent pointing to it (or a chain of
1809  * parents), we don't unlink the leaf page yet, but the topmost remaining
1810  * parent in the branch. Set 'target' and 'buf' to reference the page
1811  * actually being unlinked.
1812  */
1813  target = BTreeTupleGetTopParent(leafhikey);
1814 
1815  if (target != InvalidBlockNumber)
1816  {
1817  Assert(target != leafblkno);
1818 
1819  /* fetch the block number of the topmost parent's left sibling */
1820  buf = _bt_getbuf(rel, target, BT_READ);
1821  page = BufferGetPage(buf);
1822  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1823  leftsib = opaque->btpo_prev;
1824  targetlevel = opaque->btpo.level;
1825 
1826  /*
1827  * To avoid deadlocks, we'd better drop the target page lock before
1828  * going further.
1829  */
1830  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1831  }
1832  else
1833  {
1834  target = leafblkno;
1835 
1836  buf = leafbuf;
1837  leftsib = leafleftsib;
1838  targetlevel = 0;
1839  }
1840 
1841  /*
1842  * We have to lock the pages we need to modify in the standard order:
1843  * moving right, then up. Else we will deadlock against other writers.
1844  *
1845  * So, first lock the leaf page, if it's not the target. Then find and
1846  * write-lock the current left sibling of the target page. The sibling
1847  * that was current a moment ago could have split, so we may have to move
1848  * right. This search could fail if either the sibling or the target page
1849  * was deleted by someone else meanwhile; if so, give up. (Right now,
1850  * that should never happen, since page deletion is only done in VACUUM
1851  * and there shouldn't be multiple VACUUMs concurrently on the same
1852  * table.)
1853  */
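 /*
  * Concretely, the code below takes write locks in this order: leafbuf
  * (when it isn't itself the target), the target's left sibling
  * (stepping right past concurrently split or deleted pages), the
  * target, its right sibling, and finally the metapage when the fast
  * root may need updating.
  */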
1854  if (target != leafblkno)
1855  LockBuffer(leafbuf, BT_WRITE);
1856  if (leftsib != P_NONE)
1857  {
1858  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
1859  page = BufferGetPage(lbuf);
1860  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1861  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
1862  {
1863  /* step right one page */
1864  leftsib = opaque->btpo_next;
1865  _bt_relbuf(rel, lbuf);
1866 
1867  /*
1868  * It'd be good to check for interrupts here, but it's not easy to
1869  * do so because a lock is always held. This block isn't
1870  * frequently reached, so hopefully the consequences of not
1871  * checking interrupts aren't too bad.
1872  */
1873 
1874  if (leftsib == P_NONE)
1875  {
1876  elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
1877  target,
1878  RelationGetRelationName(rel));
1879  if (target != leafblkno)
1880  {
1881  /* we have only a pin on target, but pin+lock on leafbuf */
1882  ReleaseBuffer(buf);
1883  _bt_relbuf(rel, leafbuf);
1884  }
1885  else
1886  {
1887  /* we have only a pin on leafbuf */
1888  ReleaseBuffer(leafbuf);
1889  }
1890  return false;
1891  }
1892  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
1893  page = BufferGetPage(lbuf);
1894  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1895  }
1896  }
1897  else
1898  lbuf = InvalidBuffer;
1899 
1900  /*
1901  * Next write-lock the target page itself. It should be okay to take just
1902  * a write lock not a superexclusive lock, since no scans would stop on an
1903  * empty page.
1904  */
1905  LockBuffer(buf, BT_WRITE);
1906  page = BufferGetPage(buf);
1907  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1908 
1909  /*
1910  * Check page is still empty etc, else abandon deletion. This is just for
1911  * paranoia's sake; a half-dead page cannot resurrect because there can be
1912  * only one vacuum process running at a time.
1913  */
1914  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
1915  {
1916  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
1917  target, RelationGetRelationName(rel));
1918  }
1919  if (opaque->btpo_prev != leftsib)
1920  elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
1921  target, RelationGetRelationName(rel));
1922 
1923  if (target == leafblkno)
1924  {
1925  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1926  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
1927  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
1928  target, RelationGetRelationName(rel));
1929  nextchild = InvalidBlockNumber;
1930  }
1931  else
1932  {
1933  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
1934  P_ISLEAF(opaque))
1935  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
1936  target, RelationGetRelationName(rel));
1937 
1938  /* remember the next non-leaf child down in the branch. */
1939  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
1940  nextchild = BTreeInnerTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
1941  if (nextchild == leafblkno)
1942  nextchild = InvalidBlockNumber;
1943  }
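 /*
  * Example (hypothetical branch, for illustration): when unlinking P2 in
  * the branch leaf <- P1 <- P2, nextchild is set to P1; the leaf's high
  * key is re-stamped with P1 further below, so the next iteration
  * unlinks P1, and the final one unlinks the leaf itself (nextchild =
  * InvalidBlockNumber).
  */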
1944 
1945  /*
1946  * And next write-lock the (current) right sibling.
1947  */
1948  rightsib = opaque->btpo_next;
1949  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
1950  page = BufferGetPage(rbuf);
1951  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1952  if (opaque->btpo_prev != target)
1953  elog(ERROR, "right sibling's left-link doesn't match: "
1954  "block %u links to %u instead of expected %u in index \"%s\"",
1955  rightsib, opaque->btpo_prev, target,
1956  RelationGetRelationName(rel));
1957  rightsib_is_rightmost = P_RIGHTMOST(opaque);
1958  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1959 
1960  /*
1961  * If we are deleting the next-to-last page on the target's level, then
1962  * the rightsib is a candidate to become the new fast root. (In theory, it
1963  * might be possible to push the fast root even further down, but the odds
1964  * of doing so are slim, and the locking considerations daunting.)
1965  *
1966  * We don't support handling this in the case where the parent is becoming
1967  * half-dead, even though it theoretically could occur.
1968  *
1969  * We can safely acquire a lock on the metapage here --- see comments for
1970  * _bt_newroot().
1971  */
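 /*
  * For instance, if the target is at level 0 and its right sibling ends
  * up as the only page left on that level, btm_fastroot/btm_fastlevel
  * are pointed at the sibling below, letting future descents skip the
  * single-child levels above it.
  */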
1972  if (leftsib == P_NONE && rightsib_is_rightmost)
1973  {
1974  page = BufferGetPage(rbuf);
1975  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1976  if (P_RIGHTMOST(opaque))
1977  {
1978  /* rightsib will be the only one left on the level */
1979  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1980  metapg = BufferGetPage(metabuf);
1981  metad = BTPageGetMeta(metapg);
1982 
1983  /*
1984  * The expected case here is btm_fastlevel == targetlevel+1; if
1985  * the fastlevel is <= targetlevel, something is wrong, and we
1986  * choose to overwrite it to fix it.
1987  */
1988  if (metad->btm_fastlevel > targetlevel + 1)
1989  {
1990  /* no update wanted */
1991  _bt_relbuf(rel, metabuf);
1992  metabuf = InvalidBuffer;
1993  }
1994  }
1995  }
1996 
1997  /*
1998  * Here we begin doing the deletion.
1999  */
2000 
2001  /* No ereport(ERROR) until changes are logged */
2002  START_CRIT_SECTION();
2003 
2004  /*
2005  * Update siblings' side-links. Note the target page's side-links will
2006  * continue to point to the siblings. Asserts here are just rechecking
2007  * things we already verified above.
2008  */
2009  if (BufferIsValid(lbuf))
2010  {
2011  page = BufferGetPage(lbuf);
2012  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2013  Assert(opaque->btpo_next == target);
2014  opaque->btpo_next = rightsib;
2015  }
2016  page = BufferGetPage(rbuf);
2017  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2018  Assert(opaque->btpo_prev == target);
2019  opaque->btpo_prev = leftsib;
2020 
2021  /*
2022  * If we deleted a parent of the targeted leaf page, instead of the leaf
2023  * itself, update the leaf to point to the next remaining child in the
2024  * branch.
2025  */
2026  if (target != leafblkno)
2027  BTreeTupleSetTopParent(leafhikey, nextchild);
2028 
2029  /*
2030  * Mark the page itself deleted. It can be recycled when all current
2031  * transactions are gone. Storing GetTopTransactionId() would work, but
2032  * we're in VACUUM and would not otherwise have an XID. Having already
2033  * updated links to the target, ReadNewTransactionId() suffices as an
2034  * upper bound. Any scan having retained a now-stale link is advertising
2035  * in its PGXACT an xmin less than or equal to the value we read here. It
2036  * will continue to do so, holding back RecentGlobalXmin, for the duration
2037  * of that scan.
2038  */
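 /*
  * A later VACUUM may recycle the page once this stored XID precedes
  * RecentGlobalXmin; _bt_page_recyclable() performs that check before
  * the page is returned to the free space map.
  */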
2039  page = BufferGetPage(buf);
2040  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2041  opaque->btpo_flags &= ~BTP_HALF_DEAD;
2042  opaque->btpo_flags |= BTP_DELETED;
2043  opaque->btpo.xact = ReadNewTransactionId();
2044 
2045  /* And update the metapage, if needed */
2046  if (BufferIsValid(metabuf))
2047  {
2048  /* upgrade metapage if needed */
2049  if (metad->btm_version < BTREE_NOVAC_VERSION)
2050  _bt_upgrademetapage(metapg);
2051  metad->btm_fastroot = rightsib;
2052  metad->btm_fastlevel = targetlevel;
2053  MarkBufferDirty(metabuf);
2054  }
2055 
2056  /* Must mark buffers dirty before XLogInsert */
2057  MarkBufferDirty(rbuf);
2058  MarkBufferDirty(buf);
2059  if (BufferIsValid(lbuf))
2060  MarkBufferDirty(lbuf);
2061  if (target != leafblkno)
2062  MarkBufferDirty(leafbuf);
2063 
2064  /* XLOG stuff */
2065  if (RelationNeedsWAL(rel))
2066  {
2067  xl_btree_unlink_page xlrec;
2068  xl_btree_metadata xlmeta;
2069  uint8 xlinfo;
2070  XLogRecPtr recptr;
2071 
2072  XLogBeginInsert();
2073 
2074  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2075  if (BufferIsValid(lbuf))
2076  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2077  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2078  if (target != leafblkno)
2079  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2080 
2081  /* information on the unlinked block */
2082  xlrec.leftsib = leftsib;
2083  xlrec.rightsib = rightsib;
2084  xlrec.btpo_xact = opaque->btpo.xact;
2085 
2086  /* information needed to recreate the leaf block (if not the target) */
2087  xlrec.leafleftsib = leafleftsib;
2088  xlrec.leafrightsib = leafrightsib;
2089  xlrec.topparent = nextchild;
2090 
2091  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2092 
2093  if (BufferIsValid(metabuf))
2094  {
2095  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2096 
2097  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2098  xlmeta.version = metad->btm_version;
2099  xlmeta.root = metad->btm_root;
2100  xlmeta.level = metad->btm_level;
2101  xlmeta.fastroot = metad->btm_fastroot;
2102  xlmeta.fastlevel = metad->btm_fastlevel;
2103  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2104  xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2105 
2106  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2107  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2108  }
2109  else
2110  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2111 
2112  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2113 
2114  if (BufferIsValid(metabuf))
2115  {
2116  PageSetLSN(metapg, recptr);
2117  }
2118  page = BufferGetPage(rbuf);
2119  PageSetLSN(page, recptr);
2120  page = BufferGetPage(buf);
2121  PageSetLSN(page, recptr);
2122  if (BufferIsValid(lbuf))
2123  {
2124  page = BufferGetPage(lbuf);
2125  PageSetLSN(page, recptr);
2126  }
2127  if (target != leafblkno)
2128  {
2129  page = BufferGetPage(leafbuf);
2130  PageSetLSN(page, recptr);
2131  }
2132  }
2133 
2134  END_CRIT_SECTION();
2135 
2136  /* release metapage */
2137  if (BufferIsValid(metabuf))
2138  _bt_relbuf(rel, metabuf);
2139 
2140  /* release siblings */
2141  if (BufferIsValid(lbuf))
2142  _bt_relbuf(rel, lbuf);
2143  _bt_relbuf(rel, rbuf);
2144 
2145  /*
2146  * Release the target, if it was not the leaf block. The leaf is always
2147  * kept locked.
2148  */
2149  if (target != leafblkno)
2150  _bt_relbuf(rel, buf);
2151 
2152  return true;
2153 }