1 /*-------------------------------------------------------------------------
2  *
3  * nbtpage.c
4  * BTree-specific page management code for the Postgres btree access
5  * method.
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  *
11  * IDENTIFICATION
12  * src/backend/access/nbtree/nbtpage.c
13  *
14  * NOTES
15  * Postgres btree pages look like ordinary relation pages. The opaque
16  * data at high addresses includes pointers to left and right siblings
17  * and flag data describing page state. The first page in a btree, page
18  * zero, is special -- it stores meta-information describing the tree.
19  * Pages one and higher store the actual tree data.
20  *
21  *-------------------------------------------------------------------------
22  */
23 #include "postgres.h"
24 
25 #include "access/nbtree.h"
26 #include "access/nbtxlog.h"
27 #include "access/tableam.h"
28 #include "access/transam.h"
29 #include "access/xlog.h"
30 #include "access/xloginsert.h"
31 #include "miscadmin.h"
32 #include "storage/indexfsm.h"
33 #include "storage/lmgr.h"
34 #include "storage/predicate.h"
35 #include "utils/snapmgr.h"
36 
37 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
38 static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
39  TransactionId latestRemovedXid);
40 static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
41  OffsetNumber *deletable, int ndeletable);
42 static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
43  BTStack stack);
44 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
45  BlockNumber scanblkno,
46  bool *rightsib_empty,
47  TransactionId *oldestBtpoXact,
48  uint32 *ndeleted);
49 static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
50  BTStack stack,
51  Buffer *subtreeparent,
52  OffsetNumber *poffset,
53  BlockNumber *topparent,
54  BlockNumber *topparentrightsib);
55 
56 /*
57  * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
58  */
59 void
60 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
61  bool allequalimage)
62 {
63  BTMetaPageData *metad;
64  BTPageOpaque metaopaque;
65 
66  _bt_pageinit(page, BLCKSZ);
67 
68  metad = BTPageGetMeta(page);
69  metad->btm_magic = BTREE_MAGIC;
70  metad->btm_version = BTREE_VERSION;
71  metad->btm_root = rootbknum;
72  metad->btm_level = level;
73  metad->btm_fastroot = rootbknum;
 74  metad->btm_fastroot = rootbknum;
 74  metad->btm_fastlevel = level;
 75  metad->btm_oldest_btpo_xact = InvalidTransactionId;
 76  metad->btm_last_cleanup_num_heap_tuples = -1.0;
 77  metad->btm_allequalimage = allequalimage;
78 
79  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
80  metaopaque->btpo_flags = BTP_META;
81 
82  /*
83  * Set pd_lower just past the end of the metadata. This is essential,
84  * because without doing so, metadata will be lost if xlog.c compresses
85  * the page.
86  */
87  ((PageHeader) page)->pd_lower =
88  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
89 }
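
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): a minimal sketch of how a
 * caller might use _bt_initmetapage() when creating a brand-new, empty
 * btree. Buffer management and WAL-logging are elided; the function name
 * is hypothetical.
 * ---------------------------------------------------------------------
 */
static void
example_build_empty_metapage(bool allequalimage)
{
	Page		metapage = (Page) palloc(BLCKSZ);

	/* no root exists yet: both root and fast root are P_NONE, level 0 */
	_bt_initmetapage(metapage, P_NONE, 0, allequalimage);

	/* a real caller would now write out and WAL-log the page */
	pfree(metapage);
}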
90 
91 /*
92  * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
93  * 3, the last version that can be updated without broadly affecting
94  * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
95  *
96  * This routine does purely in-memory image upgrade. Caller is
97  * responsible for locking, WAL-logging etc.
98  */
 99 void
100 _bt_upgrademetapage(Page page)
101 {
102  BTMetaPageData *metad;
103  BTPageOpaque metaopaque;
104 
105  metad = BTPageGetMeta(page);
106  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
107 
108  /* It really must be a meta page of an upgradable version */
109  Assert(metaopaque->btpo_flags & BTP_META);
110  Assert(metad->btm_magic == BTREE_MAGIC);
111  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
112 
113  /* Set version number and fill extra fields added into version 3 */
114  metad->btm_version = BTREE_NOVAC_VERSION;
115  metad->btm_oldest_btpo_xact = InvalidTransactionId;
116  metad->btm_last_cleanup_num_heap_tuples = -1.0;
117  /* Only a REINDEX can set this field */
118  Assert(!metad->btm_allequalimage);
119  metad->btm_allequalimage = false;
120 
121  /* Adjust pd_lower (see _bt_initmetapage() for details) */
122  ((PageHeader) page)->pd_lower =
123  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
124 }
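
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): the calling pattern used
 * elsewhere in this file for on-the-fly metapage upgrades. The caller
 * must hold a write lock on metabuf and be inside a critical section;
 * the function name is hypothetical.
 * ---------------------------------------------------------------------
 */
static void
example_upgrade_metapage_if_needed(Buffer metabuf)
{
	Page		metapg = BufferGetPage(metabuf);
	BTMetaPageData *metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_NOVAC_VERSION)
		_bt_upgrademetapage(metapg);	/* purely in-memory image upgrade */

	/* caller then updates fields, marks the buffer dirty, and WAL-logs */
}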
125 
126 /*
127  * Get metadata from share-locked buffer containing metapage, while performing
128  * standard sanity checks.
129  *
130  * Callers that cache data returned here in local cache should note that an
131  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
132  * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
133  */
134 static BTMetaPageData *
135 _bt_getmeta(Relation rel, Buffer metabuf)
136 {
137  Page metapg;
138  BTPageOpaque metaopaque;
139  BTMetaPageData *metad;
140 
141  metapg = BufferGetPage(metabuf);
142  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
143  metad = BTPageGetMeta(metapg);
144 
145  /* sanity-check the metapage */
146  if (!P_ISMETA(metaopaque) ||
147  metad->btm_magic != BTREE_MAGIC)
148  ereport(ERROR,
149  (errcode(ERRCODE_INDEX_CORRUPTED),
150  errmsg("index \"%s\" is not a btree",
151  RelationGetRelationName(rel))));
152 
153  if (metad->btm_version < BTREE_MIN_VERSION ||
154  metad->btm_version > BTREE_VERSION)
155  ereport(ERROR,
156  (errcode(ERRCODE_INDEX_CORRUPTED),
157  errmsg("version mismatch in index \"%s\": file version %d, "
158  "current version %d, minimal supported version %d",
159  RelationGetRelationName(rel),
160  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
161 
162  return metad;
163 }
164 
165 /*
166  * _bt_update_meta_cleanup_info() -- Update cleanup-related information in
167  * the metapage.
168  *
169  * This routine checks whether the provided cleanup-related information
170  * matches what is stored in the metapage. On mismatch, the metapage is overwritten.
171  */
172 void
173 _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
174  float8 numHeapTuples)
175 {
176  Buffer metabuf;
177  Page metapg;
178  BTMetaPageData *metad;
179  bool needsRewrite = false;
180  XLogRecPtr recptr;
181 
182  /* read the metapage and check if it needs rewrite */
183  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
184  metapg = BufferGetPage(metabuf);
185  metad = BTPageGetMeta(metapg);
186 
187  /* outdated version of metapage always needs rewrite */
188  if (metad->btm_version < BTREE_NOVAC_VERSION)
189  needsRewrite = true;
190  else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
191  metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
192  needsRewrite = true;
193 
194  if (!needsRewrite)
195  {
196  _bt_relbuf(rel, metabuf);
197  return;
198  }
199 
200  /* trade in our read lock for a write lock */
201  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
202  LockBuffer(metabuf, BT_WRITE);
203 
204  START_CRIT_SECTION();
205 
206  /* upgrade meta-page if needed */
207  if (metad->btm_version < BTREE_NOVAC_VERSION)
208  _bt_upgrademetapage(metapg);
209 
210  /* update cleanup-related information */
211  metad->btm_oldest_btpo_xact = oldestBtpoXact;
212  metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
213  MarkBufferDirty(metabuf);
214 
215  /* write wal record if needed */
216  if (RelationNeedsWAL(rel))
217  {
218  xl_btree_metadata md;
219 
220  XLogBeginInsert();
221  XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
222 
223  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
224  md.version = metad->btm_version;
225  md.root = metad->btm_root;
226  md.level = metad->btm_level;
227  md.fastroot = metad->btm_fastroot;
228  md.fastlevel = metad->btm_fastlevel;
229  md.oldest_btpo_xact = oldestBtpoXact;
230  md.last_cleanup_num_heap_tuples = numHeapTuples;
231  md.allequalimage = metad->btm_allequalimage;
232 
233  XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
234 
235  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
236 
237  PageSetLSN(metapg, recptr);
238  }
239 
240  END_CRIT_SECTION();
241  _bt_relbuf(rel, metabuf);
242 }
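
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): the "trade read lock for
 * write lock" idiom used above, reduced to its core. The page can change
 * while it is briefly unlocked, so any condition tested under the read
 * lock must be rechecked under the write lock. Hypothetical sketch.
 * ---------------------------------------------------------------------
 */
static void
example_lock_trade(Buffer buf)
{
	/* caller holds a read lock (BT_READ) on buf */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	LockBuffer(buf, BT_WRITE);

	/*
	 * Recheck here whatever condition motivated the upgrade, as
	 * _bt_getroot() does with its metad->btm_root != P_NONE recheck.
	 */
}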
243 
244 /*
245  * _bt_getroot() -- Get the root page of the btree.
246  *
247  * Since the root page can move around the btree file, we have to read
248  * its location from the metadata page, and then read the root page
249  * itself. If no root page exists yet, we have to create one.
250  *
251  * The access type parameter (BT_READ or BT_WRITE) controls whether
252  * a new root page will be created or not. If access = BT_READ,
253  * and no root page exists, we just return InvalidBuffer. For
254  * BT_WRITE, we try to create the root page if it doesn't exist.
255  * NOTE that the returned root page will have only a read lock set
256  * on it even if access = BT_WRITE!
257  *
258  * The returned page is not necessarily the true root --- it could be
259  * a "fast root" (a page that is alone in its level due to deletions).
260  * Also, if the root page is split while we are "in flight" to it,
261  * what we will return is the old root, which is now just the leftmost
262  * page on a probably-not-very-wide level. For most purposes this is
263  * as good as or better than the true root, so we do not bother to
264  * insist on finding the true root. We do, however, guarantee to
265  * return a live (not deleted or half-dead) page.
266  *
267  * On successful return, the root page is pinned and read-locked.
268  * The metadata page is not locked or pinned on exit.
269  */
270 Buffer
271 _bt_getroot(Relation rel, int access)
272 {
273  Buffer metabuf;
274  Buffer rootbuf;
275  Page rootpage;
276  BTPageOpaque rootopaque;
277  BlockNumber rootblkno;
278  uint32 rootlevel;
279  BTMetaPageData *metad;
280 
281  /*
282  * Try to use previously-cached metapage data to find the root. This
283  * normally saves one buffer access per index search, which is a very
284  * helpful savings in bufmgr traffic and hence contention.
285  */
286  if (rel->rd_amcache != NULL)
287  {
288  metad = (BTMetaPageData *) rel->rd_amcache;
289  /* We shouldn't have cached it if any of these fail */
290  Assert(metad->btm_magic == BTREE_MAGIC);
291  Assert(metad->btm_version >= BTREE_MIN_VERSION);
292  Assert(metad->btm_version <= BTREE_VERSION);
293  Assert(!metad->btm_allequalimage ||
294  metad->btm_version > BTREE_NOVAC_VERSION);
295  Assert(metad->btm_root != P_NONE);
296 
297  rootblkno = metad->btm_fastroot;
298  Assert(rootblkno != P_NONE);
299  rootlevel = metad->btm_fastlevel;
300 
301  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
302  rootpage = BufferGetPage(rootbuf);
303  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
304 
305  /*
306  * Since the cache might be stale, we check the page more carefully
307  * here than normal. We *must* check that it's not deleted. If it's
308  * not alone on its level, then we reject too --- this may be overly
309  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
310  * because that's not set in a "fast root".
311  */
312  if (!P_IGNORE(rootopaque) &&
313  rootopaque->btpo.level == rootlevel &&
314  P_LEFTMOST(rootopaque) &&
315  P_RIGHTMOST(rootopaque))
316  {
317  /* OK, accept cached page as the root */
318  return rootbuf;
319  }
320  _bt_relbuf(rel, rootbuf);
321  /* Cache is stale, throw it away */
322  if (rel->rd_amcache)
323  pfree(rel->rd_amcache);
324  rel->rd_amcache = NULL;
325  }
326 
327  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
328  metad = _bt_getmeta(rel, metabuf);
329 
330  /* if no root page initialized yet, do it */
331  if (metad->btm_root == P_NONE)
332  {
333  Page metapg;
334 
335  /* If access = BT_READ, caller doesn't want us to create root yet */
336  if (access == BT_READ)
337  {
338  _bt_relbuf(rel, metabuf);
339  return InvalidBuffer;
340  }
341 
342  /* trade in our read lock for a write lock */
343  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
344  LockBuffer(metabuf, BT_WRITE);
345 
346  /*
347  * Race condition: if someone else initialized the metadata between
348  * the time we released the read lock and acquired the write lock, we
349  * must avoid doing it again.
350  */
351  if (metad->btm_root != P_NONE)
352  {
353  /*
354  * Metadata initialized by someone else. In order to guarantee no
355  * deadlocks, we have to release the metadata page and start all
356  * over again. (Is that really true? But it's hardly worth trying
357  * to optimize this case.)
358  */
359  _bt_relbuf(rel, metabuf);
360  return _bt_getroot(rel, access);
361  }
362 
363  /*
364  * Get, initialize, write, and leave a lock of the appropriate type on
365  * the new root page. Since this is the first page in the tree, it's
366  * a leaf as well as the root.
367  */
368  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
369  rootblkno = BufferGetBlockNumber(rootbuf);
370  rootpage = BufferGetPage(rootbuf);
371  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
372  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
373  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
374  rootopaque->btpo.level = 0;
375  rootopaque->btpo_cycleid = 0;
376  /* Get raw page pointer for metapage */
377  metapg = BufferGetPage(metabuf);
378 
379  /* NO ELOG(ERROR) till meta is updated */
380  START_CRIT_SECTION();
381 
382  /* upgrade metapage if needed */
383  if (metad->btm_version < BTREE_NOVAC_VERSION)
384  _bt_upgrademetapage(metapg);
385 
386  metad->btm_root = rootblkno;
387  metad->btm_level = 0;
388  metad->btm_fastroot = rootblkno;
389  metad->btm_fastlevel = 0;
390  metad->btm_oldest_btpo_xact = InvalidTransactionId;
391  metad->btm_last_cleanup_num_heap_tuples = -1.0;
392 
393  MarkBufferDirty(rootbuf);
394  MarkBufferDirty(metabuf);
395 
396  /* XLOG stuff */
397  if (RelationNeedsWAL(rel))
398  {
399  xl_btree_newroot xlrec;
400  XLogRecPtr recptr;
401  xl_btree_metadata md;
402 
403  XLogBeginInsert();
404  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
405  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
406 
407  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
408  md.version = metad->btm_version;
409  md.root = rootblkno;
410  md.level = 0;
411  md.fastroot = rootblkno;
412  md.fastlevel = 0;
413  md.oldest_btpo_xact = InvalidTransactionId;
414  md.last_cleanup_num_heap_tuples = -1.0;
415  md.allequalimage = metad->btm_allequalimage;
416 
417  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
418 
419  xlrec.rootblk = rootblkno;
420  xlrec.level = 0;
421 
422  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
423 
424  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
425 
426  PageSetLSN(rootpage, recptr);
427  PageSetLSN(metapg, recptr);
428  }
429 
430  END_CRIT_SECTION();
431 
432  /*
433  * swap root write lock for read lock. There is no danger of anyone
434  * else accessing the new root page while it's unlocked, since no one
435  * else knows where it is yet.
436  */
437  LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
438  LockBuffer(rootbuf, BT_READ);
439 
440  /* okay, metadata is correct, release lock on it without caching */
441  _bt_relbuf(rel, metabuf);
442  }
443  else
444  {
445  rootblkno = metad->btm_fastroot;
446  Assert(rootblkno != P_NONE);
447  rootlevel = metad->btm_fastlevel;
448 
449  /*
450  * Cache the metapage data for next time
451  */
452  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
453  sizeof(BTMetaPageData));
454  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
455 
456  /*
457  * We are done with the metapage; arrange to release it via first
458  * _bt_relandgetbuf call
459  */
460  rootbuf = metabuf;
461 
462  for (;;)
463  {
464  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
465  rootpage = BufferGetPage(rootbuf);
466  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
467 
468  if (!P_IGNORE(rootopaque))
469  break;
470 
471  /* it's dead, Jim. step right one page */
472  if (P_RIGHTMOST(rootopaque))
473  elog(ERROR, "no live root page found in index \"%s\"",
474  RelationGetRelationName(rel));
475  rootblkno = rootopaque->btpo_next;
476  }
477 
478  /* Note: can't check btpo.level on deleted pages */
479  if (rootopaque->btpo.level != rootlevel)
480  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
481  rootblkno, RelationGetRelationName(rel),
482  rootopaque->btpo.level, rootlevel);
483  }
484 
485  /*
486  * By here, we have a pin and read lock on the root page, and no lock set
487  * on the metadata page. Return the root page's buffer.
488  */
489  return rootbuf;
490 }
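
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): the contract described in
 * the header comment of _bt_getroot(), shown from a read-only caller's
 * point of view. Real descents go through _bt_search(); the function
 * name here is hypothetical.
 * ---------------------------------------------------------------------
 */
static void
example_visit_root(Relation rel)
{
	Buffer		rootbuf = _bt_getroot(rel, BT_READ);

	if (!BufferIsValid(rootbuf))
		return;					/* BT_READ and no root: index is empty */

	/* rootbuf is pinned and read-locked; the metapage is not held */
	_bt_relbuf(rel, rootbuf);
}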
491 
492 /*
493  * _bt_gettrueroot() -- Get the true root page of the btree.
494  *
495  * This is the same as the BT_READ case of _bt_getroot(), except
496  * we follow the true-root link not the fast-root link.
497  *
498  * By the time we acquire lock on the root page, it might have been split and
499  * not be the true root anymore. This is okay for the present uses of this
500  * routine; we only really need to be able to move up at least one tree level
501  * from whatever non-root page we were at. If we ever do need to lock the
502  * one true root page, we could loop here, re-reading the metapage on each
503  * failure. (Note that it wouldn't do to hold the lock on the metapage while
504  * moving to the root --- that'd deadlock against any concurrent root split.)
505  */
506 Buffer
507 _bt_gettrueroot(Relation rel)
508 {
509  Buffer metabuf;
510  Page metapg;
511  BTPageOpaque metaopaque;
512  Buffer rootbuf;
513  Page rootpage;
514  BTPageOpaque rootopaque;
515  BlockNumber rootblkno;
516  uint32 rootlevel;
517  BTMetaPageData *metad;
518 
519  /*
520  * We don't try to use cached metapage data here, since (a) this path is
521  * not performance-critical, and (b) if we are here it suggests our cache
522  * is out-of-date anyway. In light of point (b), it's probably safest to
523  * actively flush any cached metapage info.
524  */
525  if (rel->rd_amcache)
526  pfree(rel->rd_amcache);
527  rel->rd_amcache = NULL;
528 
529  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
530  metapg = BufferGetPage(metabuf);
531  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
532  metad = BTPageGetMeta(metapg);
533 
534  if (!P_ISMETA(metaopaque) ||
535  metad->btm_magic != BTREE_MAGIC)
536  ereport(ERROR,
537  (errcode(ERRCODE_INDEX_CORRUPTED),
538  errmsg("index \"%s\" is not a btree",
539  RelationGetRelationName(rel))));
540 
541  if (metad->btm_version < BTREE_MIN_VERSION ||
542  metad->btm_version > BTREE_VERSION)
543  ereport(ERROR,
544  (errcode(ERRCODE_INDEX_CORRUPTED),
545  errmsg("version mismatch in index \"%s\": file version %d, "
546  "current version %d, minimal supported version %d",
547  RelationGetRelationName(rel),
548  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
549 
550  /* if no root page initialized yet, fail */
551  if (metad->btm_root == P_NONE)
552  {
553  _bt_relbuf(rel, metabuf);
554  return InvalidBuffer;
555  }
556 
557  rootblkno = metad->btm_root;
558  rootlevel = metad->btm_level;
559 
560  /*
561  * We are done with the metapage; arrange to release it via first
562  * _bt_relandgetbuf call
563  */
564  rootbuf = metabuf;
565 
566  for (;;)
567  {
568  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
569  rootpage = BufferGetPage(rootbuf);
570  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
571 
572  if (!P_IGNORE(rootopaque))
573  break;
574 
575  /* it's dead, Jim. step right one page */
576  if (P_RIGHTMOST(rootopaque))
577  elog(ERROR, "no live root page found in index \"%s\"",
578  RelationGetRelationName(rel));
579  rootblkno = rootopaque->btpo_next;
580  }
581 
582  /* Note: can't check btpo.level on deleted pages */
583  if (rootopaque->btpo.level != rootlevel)
584  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
585  rootblkno, RelationGetRelationName(rel),
586  rootopaque->btpo.level, rootlevel);
587 
588  return rootbuf;
589 }
590 
591 /*
592  * _bt_getrootheight() -- Get the height of the btree search tree.
593  *
594  * We return the level (counting from zero) of the current fast root.
595  * This represents the number of tree levels we'd have to descend through
596  * to start any btree index search.
597  *
598  * This is used by the planner for cost-estimation purposes. Since it's
599  * only an estimate, slightly-stale data is fine, hence we don't worry
600  * about updating previously cached data.
601  */
602 int
603 _bt_getrootheight(Relation rel)
604 {
605  BTMetaPageData *metad;
606 
607  if (rel->rd_amcache == NULL)
608  {
609  Buffer metabuf;
610 
611  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
612  metad = _bt_getmeta(rel, metabuf);
613 
614  /*
615  * If there's no root page yet, _bt_getroot() doesn't expect a cache
616  * to be made, so just stop here and report the index height is zero.
617  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
618  */
619  if (metad->btm_root == P_NONE)
620  {
621  _bt_relbuf(rel, metabuf);
622  return 0;
623  }
624 
625  /*
626  * Cache the metapage data for next time
627  */
628  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
629  sizeof(BTMetaPageData));
630  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
631  _bt_relbuf(rel, metabuf);
632  }
633 
634  /* Get cached page */
635  metad = (BTMetaPageData *) rel->rd_amcache;
636  /* We shouldn't have cached it if any of these fail */
637  Assert(metad->btm_magic == BTREE_MAGIC);
638  Assert(metad->btm_version >= BTREE_MIN_VERSION);
639  Assert(metad->btm_version <= BTREE_VERSION);
640  Assert(!metad->btm_allequalimage ||
641  metad->btm_version > BTREE_NOVAC_VERSION);
642  Assert(metad->btm_fastroot != P_NONE);
643 
644  return metad->btm_fastlevel;
645 }
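
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): how a cost model might
 * consume _bt_getrootheight(). The real consumer is btcostestimate();
 * the arithmetic and function name below are made-up stand-ins.
 * ---------------------------------------------------------------------
 */
static double
example_descent_page_visits(Relation rel)
{
	/* one page visit per level between the (fast) root and the leaf level */
	return 1.0 + (double) _bt_getrootheight(rel);
}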
646 
647 /*
648  * _bt_metaversion() -- Get version/status info from metapage.
649  *
650  * Sets caller's *heapkeyspace and *allequalimage arguments using data
651  * from the B-Tree metapage (could be locally-cached version). This
652  * information needs to be stashed in insertion scankey, so we provide a
653  * single function that fetches both at once.
654  *
655  * This is used to determine the rules that must be used to descend a
656  * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
657  * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
658  * performance when inserting a new BTScanInsert-wise duplicate tuple
659  * among many leaf pages already full of such duplicates.
660  *
661  * Also sets allequalimage field, which indicates whether or not it is
662  * safe to apply deduplication. We rely on the assumption that
663  * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
664  * pg_upgrade'd from Postgres 12.
665  */
666 void
667 _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
668 {
669  BTMetaPageData *metad;
670 
671  if (rel->rd_amcache == NULL)
672  {
673  Buffer metabuf;
674 
675  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
676  metad = _bt_getmeta(rel, metabuf);
677 
678  /*
679  * If there's no root page yet, _bt_getroot() doesn't expect a cache
680  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
681  * be changed to allow this case.)
682  */
683  if (metad->btm_root == P_NONE)
684  {
685  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
686  *allequalimage = metad->btm_allequalimage;
687 
688  _bt_relbuf(rel, metabuf);
689  return;
690  }
691 
692  /*
693  * Cache the metapage data for next time
694  *
695  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
696  * can change the nbtree version for an index without invalidating any
697  * local cache. This is okay because it can only happen when moving
698  * from version 2 to version 3, both of which are !heapkeyspace
699  * versions.
700  */
701  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
702  sizeof(BTMetaPageData));
703  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
704  _bt_relbuf(rel, metabuf);
705  }
706 
707  /* Get cached page */
708  metad = (BTMetaPageData *) rel->rd_amcache;
709  /* We shouldn't have cached it if any of these fail */
710  Assert(metad->btm_magic == BTREE_MAGIC);
711  Assert(metad->btm_version >= BTREE_MIN_VERSION);
712  Assert(metad->btm_version <= BTREE_VERSION);
713  Assert(!metad->btm_allequalimage ||
714  metad->btm_version > BTREE_NOVAC_VERSION);
715  Assert(metad->btm_fastroot != P_NONE);
716 
717  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
718  *allequalimage = metad->btm_allequalimage;
719 }
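
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): fetching both flags at
 * once, as done when building an insertion scankey (see _bt_mkscankey()
 * in nbtutils.c). The function name is hypothetical.
 * ---------------------------------------------------------------------
 */
static void
example_fetch_metaversion(Relation rel)
{
	bool		heapkeyspace;
	bool		allequalimage;

	_bt_metaversion(rel, &heapkeyspace, &allequalimage);

	/* heapkeyspace: v4+ rules; heap TID acts as a tiebreaker attribute */
	/* allequalimage: deduplication is safe to apply to this index */
}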
720 
721 /*
722  * _bt_checkpage() -- Verify that a freshly-read page looks sane.
723  */
724 void
725 _bt_checkpage(Relation rel, Buffer buf)
726 {
727  Page page = BufferGetPage(buf);
728 
729  /*
730  * ReadBuffer verifies that every newly-read page passes
731  * PageHeaderIsValid, which means it either contains a reasonably sane
732  * page header or is all-zero. We have to defend against the all-zero
733  * case, however.
734  */
735  if (PageIsNew(page))
736  ereport(ERROR,
737  (errcode(ERRCODE_INDEX_CORRUPTED),
738  errmsg("index \"%s\" contains unexpected zero page at block %u",
739  RelationGetRelationName(rel),
740  BufferGetBlockNumber(buf)),
741  errhint("Please REINDEX it.")));
742 
743  /*
744  * Additionally check that the special area looks sane.
745  */
746  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
747  ereport(ERROR,
748  (errcode(ERRCODE_INDEX_CORRUPTED),
749  errmsg("index \"%s\" contains corrupted page at block %u",
750  RelationGetRelationName(rel),
751  BufferGetBlockNumber(buf)),
752  errhint("Please REINDEX it.")));
753 }
754 
755 /*
756  * Log the reuse of a page from the FSM.
757  */
758 static void
759 _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
760 {
761  xl_btree_reuse_page xlrec_reuse;
762 
763  /*
764  * Note that we don't register the buffer with the record, because this
765  * operation doesn't modify the page. This record only exists to provide a
766  * conflict point for Hot Standby.
767  */
768 
769  /* XLOG stuff */
770  xlrec_reuse.node = rel->rd_node;
771  xlrec_reuse.block = blkno;
772  xlrec_reuse.latestRemovedXid = latestRemovedXid;
773 
774  XLogBeginInsert();
775  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
776 
777  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
778 }
779 
780 /*
781  * _bt_getbuf() -- Get a buffer by block number for read or write.
782  *
783  * blkno == P_NEW means to get an unallocated index page. The page
784  * will be initialized before returning it.
785  *
786  * When this routine returns, the appropriate lock is set on the
787  * requested buffer and its reference count has been incremented
788  * (ie, the buffer is "locked and pinned"). Also, we apply
789  * _bt_checkpage to sanity-check the page (except in P_NEW case).
790  */
791 Buffer
792 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
793 {
794  Buffer buf;
795 
796  if (blkno != P_NEW)
797  {
798  /* Read an existing block of the relation */
799  buf = ReadBuffer(rel, blkno);
800  LockBuffer(buf, access);
801  _bt_checkpage(rel, buf);
802  }
803  else
804  {
805  bool needLock;
806  Page page;
807 
808  Assert(access == BT_WRITE);
809 
810  /*
811  * First see if the FSM knows of any free pages.
812  *
813  * We can't trust the FSM's report unreservedly; we have to check that
814  * the page is still free. (For example, an already-free page could
815  * have been re-used between the time the last VACUUM scanned it and
816  * the time the VACUUM made its FSM updates.)
817  *
818  * In fact, it's worse than that: we can't even assume that it's safe
819  * to take a lock on the reported page. If somebody else has a lock
820  * on it, or even worse our own caller does, we could deadlock. (The
821  * own-caller scenario is actually not improbable. Consider an index
822  * on a serial or timestamp column. Nearly all splits will be at the
823  * rightmost page, so it's entirely likely that _bt_split will call us
824  * while holding a lock on the page most recently acquired from FSM. A
825  * VACUUM running concurrently with the previous split could well have
826  * placed that page back in FSM.)
827  *
828  * To get around that, we ask for only a conditional lock on the
829  * reported page. If we fail, then someone else is using the page,
830  * and we may reasonably assume it's not free. (If we happen to be
831  * wrong, the worst consequence is the page will be lost to use till
832  * the next VACUUM, which is no big problem.)
833  */
834  for (;;)
835  {
836  blkno = GetFreeIndexPage(rel);
837  if (blkno == InvalidBlockNumber)
838  break;
839  buf = ReadBuffer(rel, blkno);
840  if (ConditionalLockBuffer(buf))
841  {
842  page = BufferGetPage(buf);
843  if (_bt_page_recyclable(page))
844  {
845  /*
846  * If we are generating WAL for Hot Standby then create a
847  * WAL record that will allow us to conflict with queries
848  * running on standby, in case they have snapshots older
849  * than btpo.xact. This can only apply if the page does
850  * have a valid btpo.xact value, ie not if it's new. (We
851  * must check that because an all-zero page has no special
852  * space.)
853  */
854  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
855  !PageIsNew(page))
856  {
857  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
858 
859  _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
860  }
861 
862  /* Okay to use page. Re-initialize and return it */
863  _bt_pageinit(page, BufferGetPageSize(buf));
864  return buf;
865  }
866  elog(DEBUG2, "FSM returned nonrecyclable page");
867  _bt_relbuf(rel, buf);
868  }
869  else
870  {
871  elog(DEBUG2, "FSM returned nonlockable page");
872  /* couldn't get lock, so just drop pin */
873  ReleaseBuffer(buf);
874  }
875  }
876 
877  /*
878  * Extend the relation by one page.
879  *
880  * We have to use a lock to ensure no one else is extending the rel at
881  * the same time, else we will both try to initialize the same new
882  * page. We can skip locking for new or temp relations, however,
883  * since no one else could be accessing them.
884  */
885  needLock = !RELATION_IS_LOCAL(rel);
886 
887  if (needLock)
888  LockRelationForExtension(rel, ExclusiveLock);
889 
890  buf = ReadBuffer(rel, P_NEW);
891 
892  /* Acquire buffer lock on new page */
893  LockBuffer(buf, BT_WRITE);
894 
895  /*
896  * Release the file-extension lock; it's now OK for someone else to
897  * extend the relation some more. Note that we cannot release this
898  * lock before we have buffer lock on the new page, or we risk a race
899  * condition against btvacuumscan --- see comments therein.
900  */
901  if (needLock)
902  UnlockRelationForExtension(rel, ExclusiveLock);
903 
904  /* Initialize the new page before returning it */
905  page = BufferGetPage(buf);
906  Assert(PageIsNew(page));
907  _bt_pageinit(page, BufferGetPageSize(buf));
908  }
909 
910  /* ref count and lock type are correct */
911  return buf;
912 }
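
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): the conditional-lock idiom
 * from the P_NEW path above, reduced to its core. If the lock cannot be
 * taken immediately, we assume the FSM page is in use and look elsewhere.
 * Hypothetical sketch.
 * ---------------------------------------------------------------------
 */
static Buffer
example_try_recycle_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	if (ConditionalLockBuffer(buf))
	{
		Page		page = BufferGetPage(buf);

		if (_bt_page_recyclable(page))
		{
			_bt_pageinit(page, BufferGetPageSize(buf));
			return buf;			/* pinned and write-locked, ready for use */
		}
		_bt_relbuf(rel, buf);	/* FSM was stale: page is in use again */
	}
	else
		ReleaseBuffer(buf);		/* lock unavailable: assume page is busy */

	return InvalidBuffer;
}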
913 
914 /*
915  * _bt_relandgetbuf() -- release a locked buffer and get another one.
916  *
917  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
918  * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
919  * then it reduces to just _bt_getbuf; allowing this case simplifies some
920  * callers.
921  *
922  * The original motivation for using this was to avoid two entries to the
923  * bufmgr when one would do. However, now it's mainly just a notational
924  * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
925  * is when the target page is the same one already in the buffer.
926  */
927 Buffer
928 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
929 {
930  Buffer buf;
931 
932  Assert(blkno != P_NEW);
933  if (BufferIsValid(obuf))
934  LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
935  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
936  LockBuffer(buf, access);
937  _bt_checkpage(rel, buf);
938  return buf;
939 }
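
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): the step-right loop idiom
 * that _bt_relandgetbuf() enables, as used by _bt_getroot() and
 * _bt_gettrueroot() above. Hypothetical sketch.
 * ---------------------------------------------------------------------
 */
static Buffer
example_step_right_to_live_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = InvalidBuffer;

	for (;;)
	{
		BTPageOpaque opaque;

		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
		opaque = (BTPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));
		if (!P_IGNORE(opaque))
			return buf;			/* live page, pinned and read-locked */
		if (P_RIGHTMOST(opaque))
			elog(ERROR, "no live page found");
		blkno = opaque->btpo_next;	/* dead or half-dead: step right */
	}
}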
940 
941 /*
942  * _bt_relbuf() -- release a locked buffer.
943  *
944  * Lock and pin (refcount) are both dropped.
945  */
946 void
947 _bt_relbuf(Relation rel, Buffer buf)
948 {
949  UnlockReleaseBuffer(buf);
950 }
951 
952 /*
953  * _bt_pageinit() -- Initialize a new page.
954  *
955  * On return, the page header is initialized; data space is empty;
956  * special space is zeroed out.
957  */
958 void
959 _bt_pageinit(Page page, Size size)
960 {
961  PageInit(page, size, sizeof(BTPageOpaqueData));
962 }
963 
964 /*
965  * _bt_page_recyclable() -- Is an existing page recyclable?
966  *
967  * This exists to make sure _bt_getbuf and btvacuumscan have the same
968  * policy about whether a page is safe to re-use. But note that _bt_getbuf
969  * knows enough to distinguish the PageIsNew condition from the other one.
970  * At some point it might be appropriate to redesign this to have a three-way
971  * result value.
972  */
973 bool
974 _bt_page_recyclable(Page page)
975 {
976  BTPageOpaque opaque;
977 
978  /*
979  * It's possible to find an all-zeroes page in an index --- for example, a
980  * backend might successfully extend the relation one page and then crash
981  * before it is able to make a WAL entry for adding the page. If we find a
982  * zeroed page then reclaim it.
983  */
984  if (PageIsNew(page))
985  return true;
986 
987  /*
988  * Otherwise, recycle if deleted and too old to have any processes
989  * interested in it.
990  */
991  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
992  if (P_ISDELETED(opaque) &&
993  TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
994  return true;
995  return false;
996 }
997 
998 /*
999  * Delete item(s) from a btree leaf page during VACUUM.
1000  *
1001  * This routine assumes that the caller has a super-exclusive write lock on
1002  * the buffer. Also, the given deletable and updatable arrays *must* be
1003  * sorted in ascending order.
1004  *
1005  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1006  * in an existing posting list item are to be removed by VACUUM. This works
1007  * by updating/overwriting an existing item with caller's new version of the
1008  * item (a version that lacks the TIDs that are to be deleted).
1009  *
1010  * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
1011  * generate their own latestRemovedXid by accessing the heap directly, whereas
1012  * VACUUMs rely on the initial heap scan taking care of it indirectly. Also,
1013  * only VACUUM can perform granular deletes of individual TIDs in posting list
1014  * tuples.
1015  */
1016 void
1017 _bt_delitems_vacuum(Relation rel, Buffer buf,
1018  OffsetNumber *deletable, int ndeletable,
1019  BTVacuumPosting *updatable, int nupdatable)
1020 {
1021  Page page = BufferGetPage(buf);
1022  BTPageOpaque opaque;
1023  Size itemsz;
1024  char *updatedbuf = NULL;
1025  Size updatedbuflen = 0;
1026  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1027 
1028  /* Shouldn't be called unless there's something to do */
1029  Assert(ndeletable > 0 || nupdatable > 0);
1030 
1031  for (int i = 0; i < nupdatable; i++)
1032  {
1033  /* Replace work area IndexTuple with updated version */
1034  _bt_update_posting(updatable[i]);
1035 
1036  /* Maintain array of updatable page offsets for WAL record */
1037  updatedoffsets[i] = updatable[i]->updatedoffset;
1038  }
1039 
1040  /* XLOG stuff -- allocate and fill buffer before critical section */
1041  if (nupdatable > 0 && RelationNeedsWAL(rel))
1042  {
1043  Size offset = 0;
1044 
1045  for (int i = 0; i < nupdatable; i++)
1046  {
1047  BTVacuumPosting vacposting = updatable[i];
1048 
1049  itemsz = SizeOfBtreeUpdate +
1050  vacposting->ndeletedtids * sizeof(uint16);
1051  updatedbuflen += itemsz;
1052  }
1053 
1054  updatedbuf = palloc(updatedbuflen);
1055  for (int i = 0; i < nupdatable; i++)
1056  {
1057  BTVacuumPosting vacposting = updatable[i];
1058  xl_btree_update update;
1059 
1060  update.ndeletedtids = vacposting->ndeletedtids;
1061  memcpy(updatedbuf + offset, &update.ndeletedtids,
1062  sizeof(uint16));
1063  offset += SizeOfBtreeUpdate;
1064 
1065  itemsz = update.ndeletedtids * sizeof(uint16);
1066  memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1067  offset += itemsz;
1068  }
1069  }
1070 
1071  /* No ereport(ERROR) until changes are logged */
1072  START_CRIT_SECTION();
1073 
1074  /*
1075  * Handle posting tuple updates.
1076  *
1077  * Deliberately do this before handling simple deletes. If we did it the
1078  * other way around (i.e. WAL record order -- simple deletes before
1079  * updates) then we'd have to make compensating changes to the 'updatable'
1080  * array of offset numbers.
1081  *
1082  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1083  * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
1084  * level flag, unsetting individual LP_DEAD bits should still be avoided.
1085  */
1086  for (int i = 0; i < nupdatable; i++)
1087  {
1088  OffsetNumber updatedoffset = updatedoffsets[i];
1089  IndexTuple itup;
1090 
1091  itup = updatable[i]->itup;
1092  itemsz = MAXALIGN(IndexTupleSize(itup));
1093  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1094  itemsz))
1095  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1096  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1097  }
1098 
1099  /* Now handle simple deletes of entire tuples */
1100  if (ndeletable > 0)
1101  PageIndexMultiDelete(page, deletable, ndeletable);
1102 
1103  /*
1104  * We can clear the vacuum cycle ID since this page has certainly been
1105  * processed by the current vacuum scan.
1106  */
1107  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1108  opaque->btpo_cycleid = 0;
1109 
1110  /*
1111  * Mark the page as not containing any LP_DEAD items. This is not
1112  * certainly true (there might be some that have recently been marked, but
1113  * weren't targeted by VACUUM's heap scan), but it will be true often
1114  * enough. VACUUM does not delete items purely because they have their
1115  * LP_DEAD bit set, since doing so would necessitate explicitly logging a
1116  * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
1117  *
1118  * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
1119  * limited, since we never falsely unset an LP_DEAD bit. Workloads that
1120  * are particularly dependent on LP_DEAD bits being set quickly will
1121  * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
1122  * again anyway. Furthermore, attempting a deduplication pass will remove
1123  * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
1124  * is set or not.
1125  */
1126  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1127 
1128  MarkBufferDirty(buf);
1129 
1130  /* XLOG stuff */
1131  if (RelationNeedsWAL(rel))
1132  {
1133  XLogRecPtr recptr;
1134  xl_btree_vacuum xlrec_vacuum;
1135 
1136  xlrec_vacuum.ndeleted = ndeletable;
1137  xlrec_vacuum.nupdated = nupdatable;
1138 
1139  XLogBeginInsert();
1140  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1141  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1142 
1143  if (ndeletable > 0)
1144  XLogRegisterBufData(0, (char *) deletable,
1145  ndeletable * sizeof(OffsetNumber));
1146 
1147  if (nupdatable > 0)
1148  {
1149  XLogRegisterBufData(0, (char *) updatedoffsets,
1150  nupdatable * sizeof(OffsetNumber));
1151  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1152  }
1153 
1154  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1155 
1156  PageSetLSN(page, recptr);
1157  }
1158 
1159  END_CRIT_SECTION();
1160 
1161  /* can't leak memory here */
1162  if (updatedbuf != NULL)
1163  pfree(updatedbuf);
1164  /* free tuples generated by calling _bt_update_posting() */
1165  for (int i = 0; i < nupdatable; i++)
1166  pfree(updatable[i]->itup);
1167 }
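
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): a minimal VACUUM-side
 * caller of _bt_delitems_vacuum() for whole-tuple deletion only. The
 * offsets are collected in ascending order, as required, and the caller
 * must hold a super-exclusive lock on the buffer. example_tid_is_dead()
 * is a hypothetical stand-in for VACUUM's reaped-TID check.
 * ---------------------------------------------------------------------
 */
static bool example_tid_is_dead(IndexTuple itup);	/* hypothetical */

static void
example_vacuum_leaf_page(Relation rel, Buffer buf)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int			ndeletable = 0;
	OffsetNumber offnum;

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= PageGetMaxOffsetNumber(page);
		 offnum = OffsetNumberNext(offnum))
	{
		IndexTuple	itup = (IndexTuple)
			PageGetItem(page, PageGetItemId(page, offnum));

		if (example_tid_is_dead(itup))
			deletable[ndeletable++] = offnum;	/* ascending by construction */
	}

	if (ndeletable > 0)
		_bt_delitems_vacuum(rel, buf, deletable, ndeletable, NULL, 0);
}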
1168 
1169 /*
1170  * Delete item(s) from a btree leaf page during single-page cleanup.
1171  *
1172  * This routine assumes that the caller has pinned and write locked the
1173  * buffer. Also, the given deletable array *must* be sorted in ascending
1174  * order.
1175  *
1176  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1177  * the page, but it needs to generate its own latestRemovedXid by accessing
1178  * the heap. This is used by the REDO routine to generate recovery conflicts.
1179  * Also, it doesn't handle posting list tuples unless the entire tuple can be
1180  * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
1181  */
1182 void
1183 _bt_delitems_delete(Relation rel, Buffer buf,
1184  OffsetNumber *deletable, int ndeletable,
1185  Relation heapRel)
1186 {
1187  Page page = BufferGetPage(buf);
1188  BTPageOpaque opaque;
1189  TransactionId latestRemovedXid = InvalidTransactionId;
1190 
1191  /* Shouldn't be called unless there's something to do */
1192  Assert(ndeletable > 0);
1193 
1193 
1194  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
1195  latestRemovedXid =
1196  _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
1197 
1198  /* No ereport(ERROR) until changes are logged */
1199  START_CRIT_SECTION();
1200 
1201  /* Fix the page */
1202  PageIndexMultiDelete(page, deletable, ndeletable);
1203 
1204  /*
1205  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
1206  * because this is not called by VACUUM. Just clear the BTP_HAS_GARBAGE
1207  * page flag, since we deleted all items with their LP_DEAD bit set.
1208  */
1209  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1210  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1211 
1212  MarkBufferDirty(buf);
1213 
1214  /* XLOG stuff */
1215  if (RelationNeedsWAL(rel))
1216  {
1217  XLogRecPtr recptr;
1218  xl_btree_delete xlrec_delete;
1219 
1220  xlrec_delete.latestRemovedXid = latestRemovedXid;
1221  xlrec_delete.ndeleted = ndeletable;
1222 
1223  XLogBeginInsert();
1224  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1225  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1226 
1227  /*
1228  * The deletable array is not in the buffer, but pretend that it is.
1229  * When XLogInsert stores the whole buffer, the array need not be
1230  * stored too.
1231  */
1232  XLogRegisterBufData(0, (char *) deletable,
1233  ndeletable * sizeof(OffsetNumber));
1234 
1235  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1236 
1237  PageSetLSN(page, recptr);
1238  }
1239 
1240  END_CRIT_SECTION();
1241 }
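
/* ---------------------------------------------------------------------
 * Illustrative aside (not part of nbtpage.c): the single-page cleanup
 * caller pattern (the real caller is _bt_vacuum_one_page() in
 * nbtinsert.c). Items whose LP_DEAD bit is set are collected in
 * ascending offset order and deleted in one call. Hypothetical sketch.
 * ---------------------------------------------------------------------
 */
static void
example_delete_lp_dead_items(Relation rel, Buffer buf, Relation heapRel)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int			ndeletable = 0;
	OffsetNumber offnum;

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= PageGetMaxOffsetNumber(page);
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemid))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
		_bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
}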
1242 
1243 /*
1244  * Get the latestRemovedXid from the table entries pointed to by the non-pivot
1245  * tuples being deleted.
1246  *
1247  * This is a specialized version of index_compute_xid_horizon_for_tuples().
1248  * It's needed because btree tuples don't always store table TID using the
1249  * standard index tuple header field.
1250  */
1251 static TransactionId
1252 _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
1253  OffsetNumber *deletable, int ndeletable)
1254 {
1255  TransactionId latestRemovedXid = InvalidTransactionId;
1256  int spacenhtids;
1257  int nhtids;
1258  ItemPointer htids;
1259 
1260  /* Array will grow iff there are posting list tuples to consider */
1261  spacenhtids = ndeletable;
1262  nhtids = 0;
1263  htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
1264  for (int i = 0; i < ndeletable; i++)
1265  {
1266  ItemId itemid;
1267  IndexTuple itup;
1268 
1269  itemid = PageGetItemId(page, deletable[i]);
1270  itup = (IndexTuple) PageGetItem(page, itemid);
1271 
1272  Assert(ItemIdIsDead(itemid));
1273  Assert(!BTreeTupleIsPivot(itup));
1274 
1275  if (!BTreeTupleIsPosting(itup))
1276  {
1277  if (nhtids + 1 > spacenhtids)
1278  {
1279  spacenhtids *= 2;
1280  htids = (ItemPointer)
1281  repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
1282  }
1283 
1284  Assert(ItemPointerIsValid(&itup->t_tid));
1285  ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
1286  nhtids++;
1287  }
1288  else
1289  {
1290  int nposting = BTreeTupleGetNPosting(itup);
1291 
1292  if (nhtids + nposting > spacenhtids)
1293  {
1294  spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
1295  htids = (ItemPointer)
1296  repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
1297  }
1298 
1299  for (int j = 0; j < nposting; j++)
1300  {
1301  ItemPointer htid = BTreeTupleGetPostingN(itup, j);
1302 
1303  Assert(ItemPointerIsValid(htid));
1304  ItemPointerCopy(htid, &htids[nhtids]);
1305  nhtids++;
1306  }
1307  }
1308  }
1309 
1310  Assert(nhtids >= ndeletable);
1311 
1312  latestRemovedXid =
1313  table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);
1314 
1315  pfree(htids);
1316 
1317  return latestRemovedXid;
1318 }
1319 
1320 /*
1321  * Check that leftsib page (the btpo_prev of target page) is not marked with
1322  * INCOMPLETE_SPLIT flag. Used during page deletion.
1323  *
1324  * Returning true indicates that page flag is set in leftsib (which is
1325  * definitely still the left sibling of target). When that happens, the
1326  * target doesn't have a downlink in parent, and the page deletion algorithm
1327  * isn't prepared to handle that. Deletion of the target page (or the whole
1328  * subtree that contains the target page) cannot take place.
1329  *
1330  * Caller should not have a lock on the target page itself, since pages on the
1331  * same level must always be locked left to right to avoid deadlocks.
1332  */
1333 static bool
1334 _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1335 {
1336  Buffer buf;
1337  Page page;
1338  BTPageOpaque opaque;
1339  bool result;
1340 
1341  /* Easy case: No left sibling */
1342  if (leftsib == P_NONE)
1343  return false;
1344 
1345  buf = _bt_getbuf(rel, leftsib, BT_READ);
1346  page = BufferGetPage(buf);
1347  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1348 
1349  /*
1350  * If the left sibling was concurrently split, so that its next-pointer
1351  * doesn't point to the current page anymore, the split that created
1352  * target must be completed. Caller can reasonably expect that there will
1353  * be a downlink to the target page that it can relocate using its stack.
1354  * (We don't allow splitting an incompletely split page again until the
1355  * previous split has been completed.)
1356  */
1357  result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1358  _bt_relbuf(rel, buf);
1359 
1360  return result;
1361 }
1362 
1363 /*
1364  * Check that leafrightsib page (the btpo_next of target leaf page) is not
1365  * marked with ISHALFDEAD flag. Used during page deletion.
1366  *
1367  * Returning true indicates that page flag is set in leafrightsib, so page
1368  * deletion cannot go ahead. Our caller is not prepared to deal with the case
1369  * where the parent page does not have a pivot tuple whose downlink points to
1370  * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1371  * seem worth going to the trouble of teaching our caller to deal with it.
1372  * The situation will be resolved after VACUUM finishes the deletion of the
1373  * half-dead page (when a future VACUUM operation reaches the target page
1374  * again).
1375  *
1376  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1377  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1378  * okay because of the restriction on deleting pages that are the rightmost
1379  * page of their parent (i.e. that such deletions can only take place when the
1380  * entire subtree must be deleted). The leaf level check made here will apply
1381  * to a right "cousin" leaf page rather than a simple right sibling leaf page
1382  * in cases where caller actually goes on to attempt deleting pages that are
1383  * above the leaf page. The right cousin leaf page is representative of the
1384  * left edge of the subtree to the right of the to-be-deleted subtree as a
1385  * whole, which is exactly the condition that our caller cares about.
1386  * (Besides, internal pages are never marked half-dead, so it isn't even
1387  * possible to _directly_ assess if an internal page is part of some other
1388  * to-be-deleted subtree.)
1389  */
1390 static bool
1391 _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1392 {
1393  Buffer buf;
1394  Page page;
1395  BTPageOpaque opaque;
1396  bool result;
1397 
1398  Assert(leafrightsib != P_NONE);
1399 
1400  buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1401  page = BufferGetPage(buf);
1402  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1403 
1404  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1405  result = P_ISHALFDEAD(opaque);
1406  _bt_relbuf(rel, buf);
1407 
1408  return result;
1409 }
1410 
1411 /*
1412  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1413  *
1414  * This action unlinks the leaf page from the b-tree structure, removing all
1415  * pointers leading to it --- but not touching its own left and right links.
1416  * The page cannot be physically reclaimed right away, since other processes
1417  * may currently be trying to follow links leading to the page; they have to
1418  * be allowed to use its right-link to recover. See nbtree/README.
1419  *
1420  * On entry, the target buffer must be pinned and locked (either read or write
1421  * lock is OK). The page must be an empty leaf page, which may be half-dead
1422  * already (a half-dead page should only be passed to us when an earlier
1423  * VACUUM operation was interrupted, though). Note in particular that caller
1424  * should never pass a buffer containing an existing deleted page here. The
1425  * lock and pin on caller's buffer will be dropped before we return.
1426  *
1427  * Returns the number of pages successfully deleted (zero if page cannot
1428  * be deleted now; could be more than one if parent or right sibling pages
1429  * were deleted too). Note that this does not include pages that we delete
1430  * that the btvacuumscan scan has yet to reach; they'll get counted later
1431  * instead.
1432  *
1433  * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1434  * responsible for maintaining *oldestBtpoXact in the case of pages that were
1435  * deleted by a previous VACUUM.
1436  *
1437  * NOTE: this leaks memory. Rather than trying to clean up everything
1438  * carefully, it's better to run it in a temp context that can be reset
1439  * frequently.
1440  */
1441 uint32
1442 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1443 {
1444  uint32 ndeleted = 0;
1445  BlockNumber rightsib;
1446  bool rightsib_empty;
1447  Page page;
1448  BTPageOpaque opaque;
1449 
1450  /*
1451  * Save original leafbuf block number from caller. Only deleted blocks
1452  * that are <= scanblkno get counted in ndeleted return value.
1453  */
1454  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1455 
1456  /*
1457  * "stack" is a search stack leading (approximately) to the target page.
1458  * It is initially NULL, but when iterating, we keep it to avoid
1459  * duplicated search effort.
1460  *
1461  * Also, when "stack" is not NULL, we have already checked that the
1462  * current page is not the right half of an incomplete split, i.e. the
1463  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1464  * when the current target page is to the right of caller's initial page
1465  * (the scanblkno page).
1466  */
1467  BTStack stack = NULL;
1468 
1469  for (;;)
1470  {
1471  page = BufferGetPage(leafbuf);
1472  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1473 
1474  /*
1475  * Internal pages are never deleted directly, only as part of deleting
1476  * the whole subtree all the way down to leaf level.
1477  *
1478  * Also check for deleted pages here. Caller never passes us a fully
1479  * deleted page. Only VACUUM can delete pages, so there can't have
1480  * been a concurrent deletion. Assume that we reached any deleted
1481  * page encountered here by following a sibling link, and that the
1482  * index is corrupt.
1483  */
1484  Assert(!P_ISDELETED(opaque));
1485  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1486  {
1487  /*
1488  * Pre-9.4 page deletion only marked internal pages as half-dead,
1489  * but now we only use that flag on leaf pages. The old algorithm
1490  * was never supposed to leave half-dead pages in the tree, it was
1491  * just a transient state, but it was nevertheless possible in
1492  * error scenarios. We don't know how to deal with them here. They
1493  * are harmless as far as searches are considered, but inserts
1494  * into the deleted keyspace could add out-of-order downlinks in
1495  * the upper levels. Log a notice, hopefully the admin will notice
1496  * and reindex.
1497  */
1498  if (P_ISHALFDEAD(opaque))
1499  ereport(LOG,
1500  (errcode(ERRCODE_INDEX_CORRUPTED),
1501  errmsg("index \"%s\" contains a half-dead internal page",
1502  RelationGetRelationName(rel)),
1503  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1504 
1505  if (P_ISDELETED(opaque))
1506  ereport(LOG,
1507  (errcode(ERRCODE_INDEX_CORRUPTED),
1508  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1509  BufferGetBlockNumber(leafbuf),
1510  scanblkno,
1511  RelationGetRelationName(rel))));
1512 
1513  _bt_relbuf(rel, leafbuf);
1514  return ndeleted;
1515  }
1516 
1517  /*
1518  * We can never delete rightmost pages nor root pages. While at it,
1519  * check that page is empty, since it's possible that the leafbuf page
1520  * was empty a moment ago, but has since had some inserts.
1521  *
1522  * To keep the algorithm simple, we also never delete an incompletely
1523  * split page (they should be rare enough that this doesn't make any
1524  * meaningful difference to disk usage):
1525  *
1526  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1527  * left half of an incomplete split, but ensuring that it's not the
1528  * right half is more complicated. For that, we have to check that
1529  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1530  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1531  * release the lock on scanblkno/leafbuf, check the left sibling, and
1532  * construct a search stack to scanblkno. On subsequent iterations,
1533  * we know we stepped right from a page that passed these tests, so
1534  * it's OK.
1535  */
1536  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1537  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1538  P_INCOMPLETE_SPLIT(opaque))
1539  {
1540  /* Should never fail to delete a half-dead page */
1541  Assert(!P_ISHALFDEAD(opaque));
1542 
1543  _bt_relbuf(rel, leafbuf);
1544  return ndeleted;
1545  }
1546 
1547  /*
1548  * First, remove downlink pointing to the page (or a parent of the
1549  * page, if we are going to delete a taller subtree), and mark the
1550  * leafbuf page half-dead
1551  */
1552  if (!P_ISHALFDEAD(opaque))
1553  {
1554  /*
1555  * We need an approximate pointer to the page's parent page. We
1556  * use a variant of the standard search mechanism to search for
1557  * the page's high key; this will give us a link to either the
1558  * current parent or someplace to its left (if there are multiple
1559  * equal high keys, which is possible with !heapkeyspace indexes).
1560  *
1561  * Also check if this is the right-half of an incomplete split
1562  * (see comment above).
1563  */
1564  if (!stack)
1565  {
1566  BTScanInsert itup_key;
1567  ItemId itemid;
1568  IndexTuple targetkey;
1569  BlockNumber leftsib,
1570  leafblkno;
1571  Buffer sleafbuf;
1572 
1573  itemid = PageGetItemId(page, P_HIKEY);
1574  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1575 
1576  leftsib = opaque->btpo_prev;
1577  leafblkno = BufferGetBlockNumber(leafbuf);
1578 
1579  /*
1580  * To avoid deadlocks, we'd better drop the leaf page lock
1581  * before going further.
1582  */
1583  LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
1584 
1585  /*
1586  * Check that the left sibling of leafbuf (if any) is not
1587  * marked with INCOMPLETE_SPLIT flag before proceeding
1588  */
1589  Assert(leafblkno == scanblkno);
1590  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1591  {
1592  ReleaseBuffer(leafbuf);
1593  return ndeleted;
1594  }
1595 
1596  /* we need an insertion scan key for the search, so build one */
1597  itup_key = _bt_mkscankey(rel, targetkey);
1598  /* find the leftmost leaf page with matching pivot/high key */
1599  itup_key->pivotsearch = true;
1600  stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1601  /* won't need a second lock or pin on leafbuf */
1602  _bt_relbuf(rel, sleafbuf);
1603 
1604  /*
1605  * Re-lock the leaf page, and start over to use our stack
1606  * within _bt_mark_page_halfdead. We must do it that way
1607  * because it's possible that leafbuf can no longer be
1608  * deleted. We need to recheck.
1609  *
1610  * Note: We can't simply hold on to the sleafbuf lock instead,
1611  * because it's barely possible that sleafbuf is not the same
1612  * page as leafbuf. This happens when leafbuf split after our
1613  * original lock was dropped, but before _bt_search finished
1614  * its descent. We rely on the assumption that we'll find
1615  * leafbuf isn't safe to delete anymore in this scenario.
1616  * (Page deletion can cope with the stack being to the left of
1617  * leafbuf, but not to the right of leafbuf.)
1618  */
1619  LockBuffer(leafbuf, BT_WRITE);
1620  continue;
1621  }
1622 
1623  /*
1624  * See if it's safe to delete the leaf page, and determine how
1625  * many parent/internal pages above the leaf level will be
1626  * deleted. If it's safe then _bt_mark_page_halfdead will also
1627  * perform the first phase of deletion, which includes marking the
1628  * leafbuf page half-dead.
1629  */
1630  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
1631  if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
1632  {
1633  _bt_relbuf(rel, leafbuf);
1634  return ndeleted;
1635  }
1636  }
1637 
1638  /*
1639  * Then unlink it from its siblings. Each call to
1640  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
1641  * making it shallower. Iterate until the leafbuf page is deleted.
1642  *
1643  * _bt_unlink_halfdead_page should never fail, since we established
1644  * that deletion is generally safe in _bt_mark_page_halfdead.
1645  */
1646  rightsib_empty = false;
1647  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
1648  while (P_ISHALFDEAD(opaque))
1649  {
1650  /* Check for interrupts in _bt_unlink_halfdead_page */
1651  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
1652  &rightsib_empty, oldestBtpoXact,
1653  &ndeleted))
1654  {
1655  /* _bt_unlink_halfdead_page failed, released buffer */
1656  return ndeleted;
1657  }
1658  }
1659 
1660  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
1661  Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
1662  *oldestBtpoXact));
1663 
1664  rightsib = opaque->btpo_next;
1665 
1666  _bt_relbuf(rel, leafbuf);
1667 
1668  /*
1669  * Check here, as calling loops will have locks held, preventing
1670  * interrupts from being processed.
1671  */
1672  CHECK_FOR_INTERRUPTS();
1673 
1674  /*
1675  * The page has now been deleted. If its right sibling is completely
1676  * empty, it's possible that the reason we haven't deleted it earlier
1677  * is that it was the rightmost child of the parent. Now that we
1678  * removed the downlink for this page, the right sibling might now be
1679  * the only child of the parent, and could be removed. It would be
1680  * picked up by the next vacuum anyway, but might as well try to
1681  * remove it now, so loop back to process the right sibling.
1682  *
1683  * Note: This relies on the assumption that _bt_getstackbuf() will be
1684  * able to reuse our original descent stack with a different child
1685  * block (provided that the child block is to the right of the
1686  * original leaf page reached by _bt_search()). It will even update
1687  * the descent stack each time we loop around, avoiding repeated work.
1688  */
1689  if (!rightsib_empty)
1690  break;
1691 
1692  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
1693  }
1694 
1695  return ndeleted;
1696 }
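/*
 * Editor's sketch (not part of nbtpage.c; the caller name is
 * hypothetical): the calling convention _bt_pagedel() expects.  The
 * caller must hold a pin and write lock on leafbuf; _bt_pagedel()
 * releases both, returns the number of pages deleted (possibly zero),
 * and folds the btpo.xact of each deleted page into *oldestBtpoXact.
 */
static uint32
sketch_try_delete_page(Relation rel, BlockNumber blkno,
                       TransactionId *oldestBtpoXact)
{
    Buffer      leafbuf = _bt_getbuf(rel, blkno, BT_WRITE);

    /* lock and pin on leafbuf are released inside _bt_pagedel() */
    return _bt_pagedel(rel, leafbuf, oldestBtpoXact);
}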
1697 
1698 /*
1699  * First stage of page deletion.
1700  *
1701  * Establish the height of the to-be-deleted subtree with leafbuf at its
1702  * lowest level, remove the downlink to the subtree, and mark leafbuf
1703  * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
1704  * but may include additional internal pages (at most one per level of the
1705  * tree below the root).
1706  *
1707  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
1708  * the rightmost child of its parent (and parent has more than one downlink).
1709  * Returns 'true' when the first stage of page deletion completed
1710  * successfully.
1711  */
1712 static bool
1713 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1714 {
1715  BlockNumber leafblkno;
1716  BlockNumber leafrightsib;
1717  BlockNumber topparent;
1718  BlockNumber topparentrightsib;
1719  ItemId itemid;
1720  Page page;
1721  BTPageOpaque opaque;
1722  Buffer subtreeparent;
1723  OffsetNumber poffset;
1724  OffsetNumber nextoffset;
1725  IndexTuple itup;
1726  IndexTupleData trunctuple;
1727 
1728  page = BufferGetPage(leafbuf);
1729  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1730 
1731  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
1732  P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
1733  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1734 
1735  /*
1736  * Save info about the leaf page.
1737  */
1738  leafblkno = BufferGetBlockNumber(leafbuf);
1739  leafrightsib = opaque->btpo_next;
1740 
1741  /*
1742  * Before attempting to lock the parent page, check that the right sibling
1743  * is not in half-dead state. A half-dead right sibling would have no
1744  * downlink in the parent, which would be highly confusing later when we
1745  * delete the downlink. It would fail the "right sibling of target page
1746  * is also the next child in parent page" cross-check below.
1747  */
1748  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
1749  {
1750  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
1751  leafblkno, leafrightsib);
1752  return false;
1753  }
1754 
1755  /*
1756  * We cannot delete a page that is the rightmost child of its immediate
1757  * parent, unless it is the only child --- in which case the parent has to
1758  * be deleted too, and the same condition applies recursively to it. We
1759  * have to check this condition all the way up before trying to delete,
1760  * and lock the parent of the root of the to-be-deleted subtree (the
1761  * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
1762  * for us. We remove the downlink to the "top parent" page (subtree root
1763  * page) from the subtree parent page below.
1764  *
1765  * Initialize topparent to be leafbuf page now. The final to-be-deleted
1766  * subtree is often a degenerate one page subtree consisting only of the
1767  * leafbuf page. When that happens, the leafbuf page is the final subtree
1768  * root page/top parent page.
1769  */
1770  topparent = leafblkno;
1771  topparentrightsib = leafrightsib;
1772  if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
1773  &subtreeparent, &poffset,
1774  &topparent, &topparentrightsib))
1775  return false;
1776 
1777  /*
1778  * Check that the parent-page index items we're about to delete/overwrite
1779  * in subtree parent page contain what we expect. This can fail if the
1780  * index has become corrupt for some reason. We want to throw any error
1781  * before entering the critical section --- otherwise it'd be a PANIC.
1782  */
1783  page = BufferGetPage(subtreeparent);
1784  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1785 
1786 #ifdef USE_ASSERT_CHECKING
1787 
1788  /*
1789  * This is just an assertion because _bt_lock_subtree_parent should have
1790  * guaranteed that the tuple has the expected contents
1791  */
1792  itemid = PageGetItemId(page, poffset);
1793  itup = (IndexTuple) PageGetItem(page, itemid);
1794  Assert(BTreeTupleGetDownLink(itup) == topparent);
1795 #endif
1796 
1797  nextoffset = OffsetNumberNext(poffset);
1798  itemid = PageGetItemId(page, nextoffset);
1799  itup = (IndexTuple) PageGetItem(page, itemid);
1800  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
1801  ereport(ERROR,
1802  (errcode(ERRCODE_INDEX_CORRUPTED),
1803  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1804  topparentrightsib, topparent,
1805  BTreeTupleGetDownLink(itup),
1806  BufferGetBlockNumber(subtreeparent),
1807  RelationGetRelationName(rel))));
1808 
1809  /*
1810  * Any insert which would have gone on the leaf block will now go to its
1811  * right sibling. In other words, the key space moves right.
1812  */
1813  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
1814 
1815  /* No ereport(ERROR) until changes are logged */
1816  START_CRIT_SECTION();
1817 
1818  /*
1819  * Update parent of subtree. We want to delete the downlink to the top
1820  * parent page/root of the subtree, and the *following* key. Easiest way
1821  * is to copy the right sibling's downlink over the downlink that points
1822  * to top parent page, and then delete the right sibling's original pivot
1823  * tuple.
1824  *
1825  * Lanin and Shasha make the key space move left when deleting a page,
1826  * whereas the key space moves right here. That's why we cannot simply
1827  * delete the pivot tuple with the downlink to the top parent page. See
1828  * nbtree/README.
1829  */
1830  page = BufferGetPage(subtreeparent);
1831  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1832 
1833  itemid = PageGetItemId(page, poffset);
1834  itup = (IndexTuple) PageGetItem(page, itemid);
1835  BTreeTupleSetDownLink(itup, topparentrightsib);
1836 
1837  nextoffset = OffsetNumberNext(poffset);
1838  PageIndexTupleDelete(page, nextoffset);
1839 
1840  /*
1841  * Mark the leaf page as half-dead, and stamp it with a link to the top
1842  * parent page. When the leaf page is also the top parent page, the link
1843  * is set to InvalidBlockNumber.
1844  */
1845  page = BufferGetPage(leafbuf);
1846  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1847  opaque->btpo_flags |= BTP_HALF_DEAD;
1848 
1849  Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
1850  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
1851  trunctuple.t_info = sizeof(IndexTupleData);
1852  if (topparent != leafblkno)
1853  BTreeTupleSetTopParent(&trunctuple, topparent);
1854  else
1855  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
1856 
1857  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
1858  IndexTupleSize(&trunctuple)))
1859  elog(ERROR, "could not overwrite high key in half-dead page");
1860 
1861  /* Must mark buffers dirty before XLogInsert */
1862  MarkBufferDirty(subtreeparent);
1863  MarkBufferDirty(leafbuf);
1864 
1865  /* XLOG stuff */
1866  if (RelationNeedsWAL(rel))
1867  {
1868  xl_btree_mark_page_halfdead xlrec;
1869  XLogRecPtr recptr;
1870 
1871  xlrec.poffset = poffset;
1872  xlrec.leafblk = leafblkno;
1873  if (topparent != leafblkno)
1874  xlrec.topparent = topparent;
1875  else
1876  xlrec.topparent = InvalidBlockNumber;
1877 
1878  XLogBeginInsert();
1879  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
1880  XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
1881 
1882  page = BufferGetPage(leafbuf);
1883  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1884  xlrec.leftblk = opaque->btpo_prev;
1885  xlrec.rightblk = opaque->btpo_next;
1886 
1887  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
1888 
1889  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
1890 
1891  page = BufferGetPage(subtreeparent);
1892  PageSetLSN(page, recptr);
1893  page = BufferGetPage(leafbuf);
1894  PageSetLSN(page, recptr);
1895  }
1896 
1897  END_CRIT_SECTION();
1898 
1899  _bt_relbuf(rel, subtreeparent);
1900  return true;
1901 }
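/*
 * Editor's sketch (not part of nbtpage.c): the half-dead leaf's high key
 * was just overwritten with a minimal tuple whose only payload is the top
 * parent link.  Reading that link back looks like this -- essentially
 * what _bt_unlink_halfdead_page() does at the start of the second stage:
 */
static BlockNumber
sketch_read_topparent(Page leafpage)
{
    ItemId      itemid = PageGetItemId(leafpage, P_HIKEY);
    IndexTuple  leafhikey = (IndexTuple) PageGetItem(leafpage, itemid);

    /* InvalidBlockNumber means the leaf page is itself the top parent */
    return BTreeTupleGetTopParent(leafhikey);
}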
1902 
1903 /*
1904  * Second stage of page deletion.
1905  *
1906  * Unlinks a single page (in the subtree undergoing deletion) from its
1907  * siblings. Also marks the page deleted.
1908  *
1909  * To get rid of the whole subtree, including the leaf page itself, call here
1910  * until the leaf page is deleted. The original "top parent" established in
1911  * the first stage of deletion is deleted in the first call here, while the
1912  * leaf page is deleted in the last call here. Note that the leaf page itself
1913  * is often the initial top parent page.
1914  *
1915  * Returns 'false' if the page could not be unlinked (shouldn't happen). If
1916  * the right sibling of the current target page is empty, *rightsib_empty is
1917  * set to true, allowing caller to delete the target's right sibling page in
1918  * passing. Note that *rightsib_empty is only actually used by caller when
1919  * target page is leafbuf, following last call here for leafbuf/the subtree
1920  * containing leafbuf. (We always set *rightsib_empty for caller, just to be
1921  * consistent.)
1922  *
1923  * We maintain *oldestBtpoXact for pages that are deleted by the current
1924  * VACUUM operation here. This must be handled here because we conservatively
1925  * assume that there needs to be a new call to ReadNewTransactionId() each
1926  * time a page gets deleted. See comments about the underlying assumption
1927  * below.
1928  *
1929  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
1930  * On success exit, we'll be holding pin and write lock. On failure exit,
1931  * we'll release both pin and lock before returning (we define it that way
1932  * to avoid having to reacquire a lock we already released).
1933  */
1934 static bool
1935 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
1936  bool *rightsib_empty, TransactionId *oldestBtpoXact,
1937  uint32 *ndeleted)
1938 {
1939  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
1940  BlockNumber leafleftsib;
1941  BlockNumber leafrightsib;
1942  BlockNumber target;
1943  BlockNumber leftsib;
1944  BlockNumber rightsib;
1945  Buffer lbuf = InvalidBuffer;
1946  Buffer buf;
1947  Buffer rbuf;
1948  Buffer metabuf = InvalidBuffer;
1949  Page metapg = NULL;
1950  BTMetaPageData *metad = NULL;
1951  ItemId itemid;
1952  Page page;
1953  BTPageOpaque opaque;
1954  bool rightsib_is_rightmost;
1955  int targetlevel;
1956  IndexTuple leafhikey;
1957  BlockNumber nextchild;
1958 
1959  page = BufferGetPage(leafbuf);
1960  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1961 
1962  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
1963 
1964  /*
1965  * Remember some information about the leaf page.
1966  */
1967  itemid = PageGetItemId(page, P_HIKEY);
1968  leafhikey = (IndexTuple) PageGetItem(page, itemid);
1969  target = BTreeTupleGetTopParent(leafhikey);
1970  leafleftsib = opaque->btpo_prev;
1971  leafrightsib = opaque->btpo_next;
1972 
1973  LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
1974 
1975  /*
1976  * Check here, as calling loops will have locks held, preventing
1977  * interrupts from being processed.
1978  */
1979  CHECK_FOR_INTERRUPTS();
1980 
1981  /* Unlink the current top parent of the subtree */
1982  if (!BlockNumberIsValid(target))
1983  {
1984  /* Target is leaf page (or leaf page is top parent, if you prefer) */
1985  target = leafblkno;
1986 
1987  buf = leafbuf;
1988  leftsib = leafleftsib;
1989  targetlevel = 0;
1990  }
1991  else
1992  {
1993  /* Target is the internal page taken from leaf's top parent link */
1994  Assert(target != leafblkno);
1995 
1996  /* Fetch the block number of the target's left sibling */
1997  buf = _bt_getbuf(rel, target, BT_READ);
1998  page = BufferGetPage(buf);
1999  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2000  leftsib = opaque->btpo_prev;
2001  targetlevel = opaque->btpo.level;
2002  Assert(targetlevel > 0);
2003 
2004  /*
2005  * To avoid deadlocks, we'd better drop the target page lock before
2006  * going further.
2007  */
2008  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2009  }
2010 
2011  /*
2012  * We have to lock the pages we need to modify in the standard order:
2013  * moving right, then up. Else we will deadlock against other writers.
2014  *
2015  * So, first lock the leaf page, if it's not the target. Then find and
2016  * write-lock the current left sibling of the target page. The sibling
2017  * that was current a moment ago could have split, so we may have to move
2018  * right. This search could fail if either the sibling or the target page
2019  * was deleted by someone else meanwhile; if so, give up. (Right now,
2020  * that should never happen, since page deletion is only done in VACUUM
2021  * and there shouldn't be multiple VACUUMs concurrently on the same
2022  * table.)
2023  */
2024  if (target != leafblkno)
2025  LockBuffer(leafbuf, BT_WRITE);
2026  if (leftsib != P_NONE)
2027  {
2028  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2029  page = BufferGetPage(lbuf);
2030  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2031  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2032  {
2033  /* step right one page */
2034  leftsib = opaque->btpo_next;
2035  _bt_relbuf(rel, lbuf);
2036 
2037  /*
2038  * It'd be good to check for interrupts here, but it's not easy to
2039  * do so because a lock is always held. This block isn't
2040  * frequently reached, so hopefully the consequences of not
2041  * checking interrupts aren't too bad.
2042  */
2043 
2044  if (leftsib == P_NONE)
2045  {
2046  elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
2047  target,
2048  RelationGetRelationName(rel));
2049  if (target != leafblkno)
2050  {
2051  /* we have only a pin on target, but pin+lock on leafbuf */
2052  ReleaseBuffer(buf);
2053  _bt_relbuf(rel, leafbuf);
2054  }
2055  else
2056  {
2057  /* we have only a pin on leafbuf */
2058  ReleaseBuffer(leafbuf);
2059  }
2060  return false;
2061  }
2062  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2063  page = BufferGetPage(lbuf);
2064  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2065  }
2066  }
2067  else
2068  lbuf = InvalidBuffer;
2069 
2070  /*
2071  * Next write-lock the target page itself. It's okay to take a write lock
2072  * rather than a superexclusive lock, since no scan will stop on an empty
2073  * page.
2074  */
2075  LockBuffer(buf, BT_WRITE);
2076  page = BufferGetPage(buf);
2077  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2078 
2079  /*
2080  * Check page is still empty etc, else abandon deletion. This is just for
2081  * paranoia's sake; a half-dead page cannot resurrect because there can be
2082  * only one vacuum process running at a time.
2083  */
2084  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2085  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2086  target, RelationGetRelationName(rel));
2087 
2088  if (opaque->btpo_prev != leftsib)
2089  ereport(ERROR,
2090  (errcode(ERRCODE_INDEX_CORRUPTED),
2091  errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
2092  target, RelationGetRelationName(rel))));
2093 
2094  if (target == leafblkno)
2095  {
2096  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2097  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2098  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2099  target, RelationGetRelationName(rel));
2100  nextchild = InvalidBlockNumber;
2101  }
2102  else
2103  {
2104  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2105  P_ISLEAF(opaque))
2106  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2107  target, RelationGetRelationName(rel));
2108 
2109  /* Remember the next non-leaf child down in the subtree */
2110  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2111  nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
2112  if (nextchild == leafblkno)
2113  nextchild = InvalidBlockNumber;
2114  }
2115 
2116  /*
2117  * And next write-lock the (current) right sibling.
2118  */
2119  rightsib = opaque->btpo_next;
2120  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2121  page = BufferGetPage(rbuf);
2122  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2123  if (opaque->btpo_prev != target)
2124  ereport(ERROR,
2125  (errcode(ERRCODE_INDEX_CORRUPTED),
2126  errmsg_internal("right sibling's left-link doesn't match: "
2127  "block %u links to %u instead of expected %u in index \"%s\"",
2128  rightsib, opaque->btpo_prev, target,
2129  RelationGetRelationName(rel))));
2130  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2131  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2132 
2133  /*
2134  * If we are deleting the next-to-last page on the target's level, then
2135  * the rightsib is a candidate to become the new fast root. (In theory, it
2136  * might be possible to push the fast root even further down, but the odds
2137  * of doing so are slim, and the locking considerations daunting.)
2138  *
2139  * We can safely acquire a lock on the metapage here --- see comments for
2140  * _bt_newroot().
2141  */
2142  if (leftsib == P_NONE && rightsib_is_rightmost)
2143  {
2144  page = BufferGetPage(rbuf);
2145  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2146  if (P_RIGHTMOST(opaque))
2147  {
2148  /* rightsib will be the only one left on the level */
2149  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2150  metapg = BufferGetPage(metabuf);
2151  metad = BTPageGetMeta(metapg);
2152 
2153  /*
2154  * The expected case here is btm_fastlevel == targetlevel+1; if
2155  * the fastlevel is <= targetlevel, something is wrong, and we
2156  * choose to overwrite it to fix it.
2157  */
2158  if (metad->btm_fastlevel > targetlevel + 1)
2159  {
2160  /* no update wanted */
2161  _bt_relbuf(rel, metabuf);
2162  metabuf = InvalidBuffer;
2163  }
2164  }
2165  }
2166 
2167  /*
2168  * Here we begin doing the deletion.
2169  */
2170 
2171  /* No ereport(ERROR) until changes are logged */
2172  START_CRIT_SECTION();
2173 
2174  /*
2175  * Update siblings' side-links. Note the target page's side-links will
2176  * continue to point to the siblings. Asserts here are just rechecking
2177  * things we already verified above.
2178  */
2179  if (BufferIsValid(lbuf))
2180  {
2181  page = BufferGetPage(lbuf);
2182  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2183  Assert(opaque->btpo_next == target);
2184  opaque->btpo_next = rightsib;
2185  }
2186  page = BufferGetPage(rbuf);
2187  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2188  Assert(opaque->btpo_prev == target);
2189  opaque->btpo_prev = leftsib;
2190 
2191  /*
2192  * If we deleted a parent of the targeted leaf page, instead of the leaf
2193  * itself, update the leaf to point to the next remaining child in the
2194  * subtree.
2195  *
2196  * Note: We rely on the fact that a buffer pin on the leaf page has been
2197  * held since leafhikey was initialized. This is safe, though only
2198  * because the page was already half-dead at that point. The leaf page
2199  * cannot have been modified by any other backend during the period when
2200  * no lock was held.
2201  */
2202  if (target != leafblkno)
2203  BTreeTupleSetTopParent(leafhikey, nextchild);
2204 
2205  /*
2206  * Mark the page itself deleted. It can be recycled when all current
2207  * transactions are gone. Storing GetTopTransactionId() would work, but
2208  * we're in VACUUM and would not otherwise have an XID. Having already
2209  * updated links to the target, ReadNewTransactionId() suffices as an
2210  * upper bound. Any scan having retained a now-stale link is advertising
2211  * in its PGXACT an xmin less than or equal to the value we read here. It
2212  * will continue to do so, holding back RecentGlobalXmin, for the duration
2213  * of that scan.
2214  */
2215  page = BufferGetPage(buf);
2216  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2217  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2218  opaque->btpo_flags &= ~BTP_HALF_DEAD;
2219  opaque->btpo_flags |= BTP_DELETED;
2220  opaque->btpo.xact = ReadNewTransactionId();
2221 
2222  /* And update the metapage, if needed */
2223  if (BufferIsValid(metabuf))
2224  {
2225  /* upgrade metapage if needed */
2226  if (metad->btm_version < BTREE_NOVAC_VERSION)
2227  _bt_upgrademetapage(metapg);
2228  metad->btm_fastroot = rightsib;
2229  metad->btm_fastlevel = targetlevel;
2230  MarkBufferDirty(metabuf);
2231  }
2232 
2233  /* Must mark buffers dirty before XLogInsert */
2234  MarkBufferDirty(rbuf);
2235  MarkBufferDirty(buf);
2236  if (BufferIsValid(lbuf))
2237  MarkBufferDirty(lbuf);
2238  if (target != leafblkno)
2239  MarkBufferDirty(leafbuf);
2240 
2241  /* XLOG stuff */
2242  if (RelationNeedsWAL(rel))
2243  {
2244  xl_btree_unlink_page xlrec;
2245  xl_btree_metadata xlmeta;
2246  uint8 xlinfo;
2247  XLogRecPtr recptr;
2248 
2249  XLogBeginInsert();
2250 
2251  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2252  if (BufferIsValid(lbuf))
2253  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2254  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2255  if (target != leafblkno)
2256  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2257 
2258  /* information on the unlinked block */
2259  xlrec.leftsib = leftsib;
2260  xlrec.rightsib = rightsib;
2261  xlrec.btpo_xact = opaque->btpo.xact;
2262 
2263  /* information needed to recreate the leaf block (if not the target) */
2264  xlrec.leafleftsib = leafleftsib;
2265  xlrec.leafrightsib = leafrightsib;
2266  xlrec.topparent = nextchild;
2267 
2268  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2269 
2270  if (BufferIsValid(metabuf))
2271  {
2272  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2273 
2274  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2275  xlmeta.version = metad->btm_version;
2276  xlmeta.root = metad->btm_root;
2277  xlmeta.level = metad->btm_level;
2278  xlmeta.fastroot = metad->btm_fastroot;
2279  xlmeta.fastlevel = metad->btm_fastlevel;
2280  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2281  xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2282  xlmeta.allequalimage = metad->btm_allequalimage;
2283 
2284  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2285  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2286  }
2287  else
2288  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2289 
2290  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2291 
2292  if (BufferIsValid(metabuf))
2293  {
2294  PageSetLSN(metapg, recptr);
2295  }
2296  page = BufferGetPage(rbuf);
2297  PageSetLSN(page, recptr);
2298  page = BufferGetPage(buf);
2299  PageSetLSN(page, recptr);
2300  if (BufferIsValid(lbuf))
2301  {
2302  page = BufferGetPage(lbuf);
2303  PageSetLSN(page, recptr);
2304  }
2305  if (target != leafblkno)
2306  {
2307  page = BufferGetPage(leafbuf);
2308  PageSetLSN(page, recptr);
2309  }
2310  }
2311 
2312  END_CRIT_SECTION();
2313 
2314  /* release metapage */
2315  if (BufferIsValid(metabuf))
2316  _bt_relbuf(rel, metabuf);
2317 
2318  /* release siblings */
2319  if (BufferIsValid(lbuf))
2320  _bt_relbuf(rel, lbuf);
2321  _bt_relbuf(rel, rbuf);
2322 
2323  if (!TransactionIdIsValid(*oldestBtpoXact) ||
2324  TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
2325  *oldestBtpoXact = opaque->btpo.xact;
2326 
2327  /*
2328  * If btvacuumscan won't revisit this page in a future btvacuumpage call
2329  * and count it as deleted at that point, we count it as deleted by the
2330  * current btvacuumpage call
2331  */
2332  if (target <= scanblkno)
2333  (*ndeleted)++;
2334 
2335  /* If the target is not leafbuf, we're done with it now -- release it */
2336  if (target != leafblkno)
2337  _bt_relbuf(rel, buf);
2338 
2339  return true;
2340 }
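/*
 * Editor's sketch (not part of nbtpage.c): the btpo.xact stamped on the
 * deleted page above is only an upper bound; the page cannot actually be
 * recycled until that XID falls behind RecentGlobalXmin, i.e. until no
 * scan could still follow a stale sibling link to it.  This mirrors the
 * test in _bt_page_recyclable(), defined earlier in this file:
 */
static bool
sketch_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    if (PageIsNew(page))
        return true;            /* all-zeroes pages are always reusable */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    return P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin);
}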
2341 
2342 /*
2343  * Establish how tall the to-be-deleted subtree will be during the first stage
2344  * of page deletion.
2345  *
2346  * Caller's child argument is the block number of the page caller wants to
2347  * delete (this is leafbuf's block number, except when we're called
2348  * recursively). stack is a search stack leading to it. Note that we will
2349  * update the stack entry(s) to reflect current downlink positions --- this is
2350  * similar to the corresponding point in page split handling.
2351  *
2352  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2353  * false. Returns true on success, in which case caller can use certain
2354  * details established here to perform the first stage of deletion. This
2355  * function is the last point at which page deletion may be deemed unsafe
2356  * (barring index corruption, or unexpected concurrent page deletions).
2357  *
2358  * We write lock the parent of the root of the to-be-deleted subtree for
2359  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2360  * caller). Caller will have to remove a downlink from *subtreeparent. We
2361  * also set a *subtreeparent offset number in *poffset, to indicate the
2362  * location of the pivot tuple that contains the relevant downlink.
2363  *
2364  * The root of the to-be-deleted subtree is called the "top parent". Note
2365  * that the leafbuf page is often the final "top parent" page (you can think
2366  * of the leafbuf page as a degenerate single page subtree when that happens).
2367  * Caller should initialize *topparent to the target leafbuf page block number
2368  * (while *topparentrightsib should be set to leafbuf's right sibling block
2369  * number). We will update *topparent (and *topparentrightsib) for caller
2370  * here, though only when it turns out that caller will delete at least one
2371  * internal page (i.e. only when caller needs to store a valid link to the top
2372  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2373  */
2374 static bool
2375 _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
2376  Buffer *subtreeparent, OffsetNumber *poffset,
2377  BlockNumber *topparent, BlockNumber *topparentrightsib)
2378 {
2379  BlockNumber parent,
2380  leftsibparent;
2381  OffsetNumber parentoffset,
2382  maxoff;
2383  Buffer pbuf;
2384  Page page;
2385  BTPageOpaque opaque;
2386 
2387  /*
2388  * Locate the pivot tuple whose downlink points to "child". Write lock
2389  * the parent page itself.
2390  */
2391  pbuf = _bt_getstackbuf(rel, stack, child);
2392  if (pbuf == InvalidBuffer)
2393  ereport(ERROR,
2394  (errcode(ERRCODE_INDEX_CORRUPTED),
2395  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2396  RelationGetRelationName(rel), child)));
2397  parent = stack->bts_blkno;
2398  parentoffset = stack->bts_offset;
2399 
2400  page = BufferGetPage(pbuf);
2401  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2402  maxoff = PageGetMaxOffsetNumber(page);
2403  leftsibparent = opaque->btpo_prev;
2404 
2405  /*
2406  * _bt_getstackbuf() completes page splits on returned parent buffer when
2407  * required.
2408  *
2409  * In general it's a bad idea for VACUUM to use up more disk space, which
2410  * is why page deletion does not finish incomplete page splits most of the
2411  * time. We allow this limited exception because the risk is much lower,
2412  * and the potential downside of not proceeding is much higher: A single
2413  * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2414  * prevent us from deleting hundreds of empty leaf pages from one level
2415  * down.
2416  */
2417  Assert(!P_INCOMPLETE_SPLIT(opaque));
2418 
2419  if (parentoffset < maxoff)
2420  {
2421  /*
2422  * Child is not the rightmost child in parent, so it's safe to delete
2423  * the subtree whose root/topparent is child page
2424  */
2425  *subtreeparent = pbuf;
2426  *poffset = parentoffset;
2427  return true;
2428  }
2429 
2430  /*
2431  * Child is the rightmost child of parent.
2432  *
2433  * Since it's the rightmost child of parent, deleting the child (or
2434  * deleting the subtree whose root/topparent is the child page) is only
2435  * safe when it's also possible to delete the parent.
2436  */
2437  Assert(parentoffset == maxoff);
2438  if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2439  {
2440  /*
2441  * Child isn't parent's only child, or parent is rightmost on its
2442  * entire level. Definitely cannot delete any pages.
2443  */
2444  _bt_relbuf(rel, pbuf);
2445  return false;
2446  }
2447 
2448  /*
2449  * Now make sure that the parent deletion is itself safe by examining the
2450  * child's grandparent page. Recurse, passing the parent page as the
2451  * child page (child's grandparent is the parent on the next level up). If
2452  * parent deletion is unsafe, then child deletion must also be unsafe (in
2453  * which case caller cannot delete any pages at all).
2454  */
2455  *topparent = parent;
2456  *topparentrightsib = opaque->btpo_next;
2457 
2458  /*
2459  * Release lock on parent before recursing.
2460  *
2461  * It's OK to release page locks on parent before recursive call locks
2462  * grandparent. An internal page can only acquire an entry if the child
2463  * is split, but that cannot happen as long as we still hold a lock on the
2464  * leafbuf page.
2465  */
2466  _bt_relbuf(rel, pbuf);
2467 
2468  /*
2469  * Before recursing, check that the left sibling of parent (if any) is not
2470  * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2471  * parent lock).
2472  *
2473  * Note: We deliberately avoid completing incomplete splits here.
2474  */
2475  if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2476  return false;
2477 
2478  /* Recurse to examine child page's grandparent page */
2479  return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2480  subtreeparent, poffset,
2481  topparent, topparentrightsib);
2482 }
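/*
 * Editor's sketch (not part of nbtpage.c; the helper name is
 * hypothetical): the per-level safety test applied above, in isolation.
 * Returns false when deletion is definitely impossible at this level.
 * A child whose downlink is not the last entry on the parent page is
 * deletable at once; a rightmost child can only be deletable when it is
 * the parent's sole child and the parent is not rightmost on its level,
 * in which case the recursion must still show that the parent itself is
 * deletable.
 */
static bool
sketch_parent_level_allows_deletion(Page parentpage, OffsetNumber parentoffset)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(parentpage);

    if (parentoffset < PageGetMaxOffsetNumber(parentpage))
        return true;            /* not the rightmost child: safe */

    /* rightmost child: only child required, and grandparent check pending */
    return parentoffset == P_FIRSTDATAKEY(opaque) && !P_RIGHTMOST(opaque);
}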