/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *    BTree-specific page management code for the Postgres btree access
 *    method.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtpage.c
 *
 * NOTES
 *    Postgres btree pages look like ordinary relation pages.  The opaque
 *    data at high addresses includes pointers to left and right siblings
 *    and flag data describing page state.  The first page in a btree, page
 *    zero, is special -- it stores meta-information describing the tree.
 *    Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/memdebug.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               TransactionId latestRemovedXid);
static void _bt_delitems_delete(Relation rel, Buffer buf,
                                TransactionId latestRemovedXid,
                                OffsetNumber *deletable, int ndeletable,
                                BTVacuumPosting *updatable, int nupdatable,
                                Relation heapRel);
static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
                                 OffsetNumber *updatedoffsets,
                                 Size *updatedbuflen, bool needswal);
static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
                                   BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     BlockNumber scanblkno,
                                     bool *rightsib_empty,
                                     TransactionId *oldestBtpoXact,
                                     uint32 *ndeleted);
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
                                    BTStack stack,
                                    Buffer *subtreeparent,
                                    OffsetNumber *poffset,
                                    BlockNumber *topparent,
                                    BlockNumber *topparentrightsib);

/*
 * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                 bool allequalimage)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    metad->btm_allequalimage = allequalimage;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
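
/*
 * Illustrative sketch (not part of nbtpage.c): a caller building a new
 * empty index can prepare the metapage image in local memory before
 * writing it out as block 0; "metapage" here is a hypothetical local
 * variable:
 *
 *      Page metapage = (Page) palloc(BLCKSZ);
 *
 *      _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
 *      ... write and WAL-log metapage as block BTREE_METAPAGE ...
 */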

/*
 * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
 *      3, the last version that can be updated without broadly affecting
 *      on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 *
 * This routine does purely in-memory image upgrade.  Caller is
 * responsible for locking, WAL-logging etc.
 */
void
_bt_upgrademetapage(Page page)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

    metad = BTPageGetMeta(page);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* It must really be a meta page of upgradable version */
    Assert(metaopaque->btpo_flags & BTP_META);
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version < BTREE_NOVAC_VERSION);

    /* Set version number and fill extra fields added into version 3 */
    metad->btm_version = BTREE_NOVAC_VERSION;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    /* Only a REINDEX can set this field */
    Assert(!metad->btm_allequalimage);
    metad->btm_allequalimage = false;

    /* Adjust pd_lower (see _bt_initmetapage() for details) */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 * Get metadata from share-locked buffer containing metapage, while performing
 * standard sanity checks.
 *
 * Callers that cache data returned here in local cache should note that an
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION,
                        BTREE_MIN_VERSION)));

    return metad;
}
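
/*
 * Illustrative caller sketch (not from nbtpage.c): the usual pattern pins
 * and share-locks the metapage, reads what it needs, then releases:
 *
 *      metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 *      metad = _bt_getmeta(rel, metabuf);
 *      ... use *metad while the buffer lock is still held ...
 *      _bt_relbuf(rel, metabuf);
 *
 * The returned pointer addresses the shared buffer itself, so fields must
 * be copied out before releasing the buffer if they are needed later.
 */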

/*
 * _bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *      the metapage.
 *
 * This routine checks whether the provided cleanup-related information
 * matches what is stored in the metapage.  On mismatch, the metapage is
 * overwritten.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
                             float8 numHeapTuples)
{
    Buffer metabuf;
    Page metapg;
    BTMetaPageData *metad;
    bool needsRewrite = false;
    XLogRecPtr recptr;

    /* read the metapage and check if it needs rewrite */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /* outdated version of metapage always needs rewrite */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        needsRewrite = true;
    else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
             metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
        needsRewrite = true;

    if (!needsRewrite)
    {
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    _bt_unlockbuf(rel, metabuf);
    _bt_lockbuf(rel, metabuf, BT_WRITE);

    START_CRIT_SECTION();

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_oldest_btpo_xact = oldestBtpoXact;
    metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.oldest_btpo_xact = oldestBtpoXact;
        md.last_cleanup_num_heap_tuples = numHeapTuples;
        md.allequalimage = metad->btm_allequalimage;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();

    _bt_relbuf(rel, metabuf);
}

/*
 * _bt_getroot() -- Get the root page of the btree.
 *
 * Since the root page can move around the btree file, we have to read
 * its location from the metadata page, and then read the root page
 * itself.  If no root page exists yet, we have to create one.
 *
 * The access type parameter (BT_READ or BT_WRITE) controls whether
 * a new root page will be created or not.  If access = BT_READ,
 * and no root page exists, we just return InvalidBuffer.  For
 * BT_WRITE, we try to create the root page if it doesn't exist.
 * NOTE that the returned root page will have only a read lock set
 * on it even if access = BT_WRITE!
 *
 * The returned page is not necessarily the true root --- it could be
 * a "fast root" (a page that is alone in its level due to deletions).
 * Also, if the root page is split while we are "in flight" to it,
 * what we will return is the old root, which is now just the leftmost
 * page on a probably-not-very-wide level.  For most purposes this is
 * as good as or better than the true root, so we do not bother to
 * insist on finding the true root.  We do, however, guarantee to
 * return a live (not deleted or half-dead) page.
 *
 * On successful return, the root page is pinned and read-locked.
 * The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer metabuf;
    Buffer rootbuf;
    Page rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32 rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(!metad->btm_allequalimage ||
               metad->btm_version > BTREE_NOVAC_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check
         * P_ISROOT, because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        _bt_unlockbuf(rel, metabuf);
        _bt_lockbuf(rel, metabuf, BT_WRITE);

        /*
         * Race condition: if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock,
         * we must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee
             * no deadlocks, we have to release the metadata page and start
             * all over again.  (Is that really true? But it's hardly worth
             * trying to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type
         * on the new root page.  Since this is the first page in the tree,
         * it's a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_oldest_btpo_xact = InvalidTransactionId;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.oldest_btpo_xact = InvalidTransactionId;
            md.last_cleanup_num_heap_tuples = -1.0;
            md.allequalimage = metad->btm_allequalimage;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no
         * one else knows where it is yet.
         */
        _bt_unlockbuf(rel, rootbuf);
        _bt_lockbuf(rel, rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock
     * set on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
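
/*
 * Illustrative caller sketch (not part of nbtpage.c): a read-only descent
 * must allow for an empty index, where no root page exists yet:
 *
 *      rootbuf = _bt_getroot(rel, BT_READ);
 *      if (!BufferIsValid(rootbuf))
 *          return;
 *      ... descend from rootbuf; finish with _bt_relbuf(rel, rootbuf) ...
 */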

/*
 * _bt_gettrueroot() -- Get the true root page of the btree.
 *
 * This is the same as the BT_READ case of _bt_getroot(), except
 * we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer metabuf;
    Page metapg;
    BTPageOpaque metaopaque;
    Buffer rootbuf;
    Page rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32 rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our
     * cache is out-of-date anyway.  In light of point (b), it's probably
     * safest to actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION,
                        BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 * _bt_getrootheight() -- Get the height of the btree search tree.
 *
 * We return the level (counting from zero) of the current fast root.
 * This represents the number of tree levels we'd have to descend through
 * to start any btree index search.
 *
 * This is used by the planner for cost-estimation purposes.  Since it's
 * only an estimate, slightly-stale data is fine, hence we don't worry
 * about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is
         * zero.  (XXX perhaps _bt_getroot() should be changed to allow this
         * case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}

/*
 * _bt_metaversion() -- Get version/status info from metapage.
 *
 * Sets caller's *heapkeyspace and *allequalimage arguments using data
 * from the B-Tree metapage (could be locally-cached version).  This
 * information needs to be stashed in insertion scankey, so we provide a
 * single function that fetches both at once.
 *
 * This is used to determine the rules that must be used to descend a
 * btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 * performance when inserting a new BTScanInsert-wise duplicate tuple
 * among many leaf pages already full of such duplicates.
 *
 * Also sets allequalimage field, which indicates whether or not it is
 * safe to apply deduplication.  We rely on the assumption that
 * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 * pg_upgrade'd from Postgres 12.
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating
         * any local cache.  This is okay because it can only happen when
         * moving from version 2 to version 3, both of which are
         * !heapkeyspace versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}
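
/*
 * Illustrative caller sketch (not from nbtpage.c): insertion scankey setup
 * can stash both flags at once, along the lines of:
 *
 *      _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
 *
 * where "key" is a BTScanInsert being initialized.
 */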

/*
 * _bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page.  This record only exists to
     * provide a conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}

/*
 * _bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *      blkno == P_NEW means to get an unallocated index page.  The page
 *      will be initialized before returning it.
 *
 *      The general rule in nbtree is that it's never okay to access a
 *      page without holding both a buffer pin and a buffer lock on
 *      the page's buffer.
 *
 *      When this routine returns, the appropriate lock is set on the
 *      requested buffer and its reference count has been incremented
 *      (ie, the buffer is "locked and pinned").  Also, we apply
 *      _bt_checkpage to sanity-check the page (except in P_NEW case),
 *      and perform Valgrind client requests that help Valgrind detect
 *      unsafe page accesses.
 *
 *      Note: raw LockBuffer() calls are disallowed in nbtree; all
 *      buffer lock requests need to go through wrapper functions such
 *      as _bt_lockbuf().
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        _bt_lockbuf(rel, buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool needLock;
        Page page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check
         * that the page is still free.  (For example, an already-free page
         * could have been re-used between the time the last VACUUM scanned
         * it and the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's
         * safe to take a lock on the reported page.  If somebody else has a
         * lock on it, or even worse our own caller does, we could deadlock.
         * (The own-caller scenario is actually not improbable.  Consider an
         * index on a serial or timestamp column.  Nearly all splits will be
         * at the rightmost page, so it's entirely likely that _bt_split
         * will call us while holding a lock on the page most recently
         * acquired from FSM.  A VACUUM running concurrently with the
         * previous split could well have placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (_bt_conditionallockbuf(rel, buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create
                     * a WAL record that will allow us to conflict with
                     * queries running on standby, in case they have
                     * snapshots older than btpo.xact.  This can only apply
                     * if the page does have a valid btpo.xact value, ie not
                     * if it's new.  (We must check that because an all-zero
                     * page has no special space.)
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
                        !PageIsNew(page))
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel
         * at the same time, else we will both try to initialize the same
         * new page.  We can skip locking for new or temp relations,
         * however, since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        _bt_lockbuf(rel, buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a
         * race condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}
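
/*
 * Illustrative usage sketches (not from nbtpage.c).  Reading an existing
 * page and allocating a brand new one look like:
 *
 *      buf = _bt_getbuf(rel, blkno, BT_READ);
 *      ... examine the locked, pinned page ...
 *      _bt_relbuf(rel, buf);
 *
 *      buf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 *      ... set up opaque fields, WAL-log, then release ...
 */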

/*
 * _bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        _bt_unlockbuf(rel, obuf);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    _bt_lockbuf(rel, buf, access);

    _bt_checkpage(rel, buf);
    return buf;
}
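
/*
 * Illustrative sketch (not from nbtpage.c): the step-right loops above use
 * this routine so that only one page is locked at a time while moving
 * right:
 *
 *      buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);
 */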

/*
 * _bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    _bt_unlockbuf(rel, buf);
    ReleaseBuffer(buf);
}

/*
 * _bt_lockbuf() -- lock a pinned buffer.
 *
 * Lock is acquired without acquiring another pin.  This is like a raw
 * LockBuffer() call, but performs extra steps needed by Valgrind.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
void
_bt_lockbuf(Relation rel, Buffer buf, int access)
{
    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, access);

    /*
     * It doesn't matter that _bt_unlockbuf() won't get called in the event
     * of an nbtree error (e.g. a unique violation error).  That won't cause
     * Valgrind false positives.
     *
     * The nbtree client requests are superimposed on top of the bufmgr.c
     * buffer pin client requests.  In the event of an nbtree error the
     * buffer will certainly get marked as defined when the backend once
     * again acquires its first pin on the buffer.  (Of course, if the
     * backend never touches the buffer again then it doesn't matter that it
     * remains non-accessible to Valgrind.)
     *
     * Note: When an IndexTuple C pointer gets computed using an ItemId read
     * from a page while a lock was held, the C pointer becomes unsafe to
     * dereference forever as soon as the lock is released.  Valgrind can
     * only detect cases where the pointer gets dereferenced with no
     * _current_ lock/pin held, though.
     */
    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
}

/*
 * _bt_unlockbuf() -- unlock a pinned buffer.
 */
void
_bt_unlockbuf(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
}

/*
 * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
 * buffer.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
bool
_bt_conditionallockbuf(Relation rel, Buffer buf)
{
    /* ConditionalLockBuffer() asserts that pin is held by this backend */
    if (!ConditionalLockBuffer(buf))
        return false;

    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

    return true;
}

/*
 * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
 * lock.
 */
void
_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    LockBufferForCleanup(buf);
}

/*
 * _bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 * _bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.  But note that _bt_getbuf
 * knows enough to distinguish the PageIsNew condition from the other one.
 * At some point it might be appropriate to redesign this to have a three-way
 * result value.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example,
     * a backend might successfully extend the relation one page and then
     * crash before it is able to make a WAL entry for adding the page.  If
     * we find a zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact))
        return true;
    return false;
}
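
/*
 * Illustrative sketch (not from nbtpage.c): VACUUM applies the same test
 * before reporting a page to the FSM, roughly:
 *
 *      if (_bt_page_recyclable(page))
 *          RecordFreeIndexPage(rel, blkno);
 */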

/*
 * Delete item(s) from a btree leaf page during VACUUM.
 *
 * This routine assumes that the caller has a super-exclusive write lock on
 * the buffer.  Also, the given deletable and updatable arrays *must* be
 * sorted in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed.  This works by
 * updating/overwriting an existing item with caller's new version of the
 * item (a version that lacks the TIDs that are to be deleted).
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
 * generate their own latestRemovedXid by accessing the table directly,
 * whereas VACUUMs rely on the initial VACUUM table scan performing
 * WAL-logging that takes care of the issue for the table's indexes
 * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
 * deletes don't do.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page page = BufferGetPage(buf);
    BTPageOpaque opaque;
    bool needswal = RelationNeedsWAL(rel);
    char *updatedbuf = NULL;
    Size updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    /* Generate new version of posting lists without deleted TIDs */
    if (nupdatable > 0)
        updatedbuf = _bt_delitems_update(updatable, nupdatable,
                                         updatedoffsets, &updatedbuflen,
                                         needswal);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Handle posting tuple updates.
     *
     * Deliberately do this before handling simple deletes.  If we did it
     * the other way around (i.e. WAL record order -- simple deletes before
     * updates) then we'd have to make compensating changes to the
     * 'updatable' array of offset numbers.
     *
     * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
     * happens to already be set.  It's important that we not interfere with
     * _bt_delitems_delete().
     */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple itup;
        Size itemsz;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    /* Now handle simple deletes of entire tuples */
    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Clear the BTP_HAS_GARBAGE page flag.
     *
     * This flag indicates the presence of LP_DEAD items on the page (though
     * not reliably).  Note that we only rely on it with pg_upgrade'd
     * !heapkeyspace indexes.  That's why clearing it here won't usually
     * interfere with _bt_delitems_delete().
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (needswal)
    {
        XLogRecPtr recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.ndeleted = ndeletable;
        xlrec_vacuum.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples allocated within _bt_delitems_update() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}
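
/*
 * Illustrative caller sketch (not from nbtpage.c): VACUUM's per-page
 * processing accumulates deletable offsets and updatable posting lists in
 * ascending order, then applies them in a single call:
 *
 *      if (ndeletable > 0 || nupdatable > 0)
 *          _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
 *                              updatable, nupdatable);
 */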

/*
 * Delete item(s) from a btree leaf page during single-page cleanup.
 *
 * This routine assumes that the caller has pinned and write locked the
 * buffer.  Also, the given deletable and updatable arrays *must* be sorted
 * in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed.  This works by
 * updating/overwriting an existing item with caller's new version of the
 * item (a version that lacks the TIDs that are to be deleted).
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but it needs its own latestRemovedXid from caller (caller gets
 * this from tableam).  This is used by the REDO routine to generate
 * recovery conflicts.  The other difference is that only
 * _bt_delitems_vacuum will clear page's VACUUM cycle ID.
 */
static void
_bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable,
                    Relation heapRel)
{
    Page page = BufferGetPage(buf);
    BTPageOpaque opaque;
    bool needswal = RelationNeedsWAL(rel);
    char *updatedbuf = NULL;
    Size updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    /* Generate new versions of posting lists without deleted TIDs */
    if (nupdatable > 0)
        updatedbuf = _bt_delitems_update(updatable, nupdatable,
                                         updatedoffsets, &updatedbuflen,
                                         needswal);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Handle updates and deletes just like _bt_delitems_vacuum */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple itup;
        Size itemsz;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID
     * at this point.  The VACUUM command alone controls vacuum cycle IDs.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Clear the BTP_HAS_GARBAGE page flag.
     *
     * This flag indicates the presence of LP_DEAD items on the page (though
     * not reliably).  Note that we only rely on it with pg_upgrade'd
     * !heapkeyspace indexes.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (needswal)
    {
        XLogRecPtr recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.ndeleted = ndeletable;
        xlrec_delete.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples allocated within _bt_delitems_update() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}

/*
 * Set up state needed to delete TIDs from posting list tuples via "updating"
 * the tuple.  Performs steps common to both _bt_delitems_vacuum and
 * _bt_delitems_delete.  These steps must take place before each function's
 * critical section begins.
 *
 * updatable and nupdatable are inputs, though note that we will use
 * _bt_update_posting() to replace the original itup with a pointer to a
 * final version in palloc()'d memory.  Caller should free the tuples when
 * it's done.
 *
 * The first nupdatable entries from updatedoffsets are set to the page
 * offset number for posting list tuples that caller updates.  This is
 * mostly useful because caller may need to WAL-log the page offsets (though
 * we always do this for caller out of convenience).
 *
 * Returns buffer consisting of an array of xl_btree_update structs that
 * describe the steps we perform here for caller (though only when needswal
 * is true).  Also sets *updatedbuflen to the final size of the buffer.
 * This buffer is used by caller when WAL logging is required.
 */
static char *
_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
                    OffsetNumber *updatedoffsets, Size *updatedbuflen,
                    bool needswal)
{
    char *updatedbuf = NULL;
    Size buflen = 0;

    /* Shouldn't be called unless there's something to do */
    Assert(nupdatable > 0);

    for (int i = 0; i < nupdatable; i++)
    {
        BTVacuumPosting vacposting = updatable[i];
        Size itemsz;

        /* Replace work area IndexTuple with updated version */
        _bt_update_posting(vacposting);

        /* Keep track of size of xl_btree_update for updatedbuf in passing */
        itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
        buflen += itemsz;

        /* Build updatedoffsets buffer in passing */
        updatedoffsets[i] = vacposting->updatedoffset;
    }

    /* XLOG stuff */
    if (needswal)
    {
        Size offset = 0;

        /* Allocate, set final size for caller */
        updatedbuf = palloc(buflen);
        *updatedbuflen = buflen;
        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];
            Size itemsz;
            xl_btree_update update;

            update.ndeletedtids = vacposting->ndeletedtids;
            memcpy(updatedbuf + offset, &update.ndeletedtids,
                   sizeof(xl_btree_update));
            offset += SizeOfBtreeUpdate;

            itemsz = update.ndeletedtids * sizeof(uint16);
            memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
            offset += itemsz;
        }
    }

    return updatedbuf;
}
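
/*
 * Layout of the buffer returned when needswal is true: one xl_btree_update
 * header followed by its array of uint16 deletetids entries, repeated once
 * per updatable posting list tuple:
 *
 *      [xl_btree_update][deletetids...][xl_btree_update][deletetids...]...
 *
 * The REDO routine walks this buffer together with the updatedoffsets
 * array to redo each posting list update.
 */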

/*
 * Comparator used by _bt_delitems_delete_check() to restore deltids array
 * back to its original leaf-page-wise sort order
 */
static int
_bt_delitems_cmp(const void *a, const void *b)
{
    TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
    TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;

    if (indexdelete1->id > indexdelete2->id)
        return 1;
    if (indexdelete1->id < indexdelete2->id)
        return -1;

    Assert(false);

    return 0;
}

/*
 * Try to delete item(s) from a btree leaf page during single-page cleanup.
 *
 * nbtree interface to table_index_delete_tuples().  Deletes a subset of
 * index tuples from caller's deltids array: those whose TIDs are found safe
 * to delete by the tableam (or already marked LP_DEAD in index, and so
 * already known to be deletable by our simple index deletion caller).  We
 * physically delete index tuples from buf leaf page last of all (for index
 * tuples where that is known to be safe following our
 * table_index_delete_tuples() call).
 *
 * Simple index deletion caller only includes TIDs from index tuples marked
 * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
 * included without increasing the total number of distinct table blocks for
 * the deletion operation as a whole.  This approach often allows us to
 * delete some extra index tuples that were practically free for tableam to
 * check in passing (when they actually turn out to be safe to delete).  It
 * probably only makes sense for the tableam to go ahead with these extra
 * checks when it is block-orientated (otherwise the checks probably won't
 * be practically free, which we rely on).  The tableam interface requires
 * the tableam side to handle the problem, though, so this is okay (we as an
 * index AM are free to make the simplifying assumption that all tableams
 * must be block-based).
 *
 * Bottom-up index deletion caller provides all the TIDs from the leaf page,
 * without expecting that tableam will check most of them.  The tableam has
 * considerable discretion around which entries/blocks it checks.  Our role
 * in costing the bottom-up deletion operation is strictly advisory.
 *
 * Note: Caller must have added deltids entries (i.e. entries that go in
 * delstate's main array) in leaf-page-wise order: page offset number order,
 * TID order among entries taken from the same posting list tuple (tiebreak
 * on TID).  This order is convenient to work with here.
 *
 * Note: We also rely on the id field of each deltids element "capturing"
 * this original leaf-page-wise order.  That is, we expect to be able to get
 * back to the original leaf-page-wise order just by sorting deltids on the
 * id field (tableam will sort deltids for its own reasons, so we'll need to
 * put it back in leaf-page-wise order afterwards).
 */
void
_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
                          TM_IndexDeleteOp *delstate)
{
    Page page = BufferGetPage(buf);
    TransactionId latestRemovedXid;
    OffsetNumber postingidxoffnum = InvalidOffsetNumber;
    int ndeletable = 0,
        nupdatable = 0;
    OffsetNumber deletable[MaxIndexTuplesPerPage];
    BTVacuumPosting updatable[MaxIndexTuplesPerPage];

    /* Use tableam interface to determine which tuples to delete first */
    latestRemovedXid = table_index_delete_tuples(heapRel, delstate);

    /* Should not WAL-log latestRemovedXid unless it's required */
    if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
        latestRemovedXid = InvalidTransactionId;

    /*
     * Construct a leaf-page-wise description of what _bt_delitems_delete()
     * needs to do to physically delete index tuples from the page.
     *
     * Must sort deltids array to restore leaf-page-wise order (original
     * order before call to tableam).  This is the order that the loop
     * expects.
     *
     * Note that deltids array might be a lot smaller now.  It might even
     * have no entries at all (with bottom-up deletion caller), in which
     * case there is nothing left to do.
     */
    qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
          _bt_delitems_cmp);
    if (delstate->ndeltids == 0)
    {
        Assert(delstate->bottomup);
        return;
    }

    /* We definitely have to delete at least one index tuple (or one TID) */
    for (int i = 0; i < delstate->ndeltids; i++)
    {
        TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
        OffsetNumber idxoffnum = dstatus->idxoffnum;
        ItemId itemid = PageGetItemId(page, idxoffnum);
        IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
        int nestedi,
            nitem;
        BTVacuumPosting vacposting;

        Assert(OffsetNumberIsValid(idxoffnum));

        if (idxoffnum == postingidxoffnum)
        {
            /*
             * This deltid entry is a TID from a posting list tuple that has
             * already been completely processed
             */
            Assert(BTreeTupleIsPosting(itup));
            Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
                                      &delstate->deltids[i].tid) < 0);
            Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
                                      &delstate->deltids[i].tid) >= 0);
            continue;
        }

        if (!BTreeTupleIsPosting(itup))
        {
            /* Plain non-pivot tuple */
            Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
            if (dstatus->knowndeletable)
                deletable[ndeletable++] = idxoffnum;
            continue;
        }

        /*
         * itup is a posting list tuple whose lowest deltids entry (which
         * may or may not be for the first TID from itup) is considered here
         * now.  We should process all of the deltids entries for the
         * posting list together now, though (not just the lowest).
         * Remember to skip over later itup-related entries during later
         * iterations of outermost loop.
         */
        postingidxoffnum = idxoffnum;   /* Remember work in outermost loop */
        nestedi = i;            /* Initialize for first itup deltids entry */
        vacposting = NULL;      /* Describes final action for itup */
        nitem = BTreeTupleGetNPosting(itup);
        for (int p = 0; p < nitem; p++)
        {
            ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
            int ptidcmp = -1;

            /*
             * This nested loop reuses work across ptid TIDs taken from
             * itup.  We take advantage of the fact that both itup's TIDs
             * and deltids entries (within a single itup/posting list
             * grouping) must both be in ascending TID order.
             */
            for (; nestedi < delstate->ndeltids; nestedi++)
            {
                TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
                TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);

                /* Stop once we get past all itup related deltids entries */
                Assert(tdstatus->idxoffnum >= idxoffnum);
                if (tdstatus->idxoffnum != idxoffnum)
                    break;

                /* Skip past non-deletable itup related entries up front */
                if (!tdstatus->knowndeletable)
                    continue;

                /* Entry is first partial ptid match (or an exact match)? */
                ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
                if (ptidcmp >= 0)
                {
                    /* Greater than or equal (partial or exact) match... */
                    break;
                }
            }

            /* ...exact ptid match to a deletable deltids entry? */
            if (ptidcmp != 0)
                continue;

            /* Exact match for deletable deltids entry -- ptid gets deleted */
            if (vacposting == NULL)
            {
                vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                                    nitem * sizeof(uint16));
                vacposting->itup = itup;
                vacposting->updatedoffset = idxoffnum;
                vacposting->ndeletedtids = 0;
            }
            vacposting->deletetids[vacposting->ndeletedtids++] = p;
        }

        /* Final decision on itup, a posting list tuple */

        if (vacposting == NULL)
        {
            /* No TIDs to delete from itup -- do nothing */
        }
        else if (vacposting->ndeletedtids == nitem)
        {
            /* Straight delete of itup (to delete all TIDs) */
            deletable[ndeletable++] = idxoffnum;
            /* Turns out we won't need granular information */
            pfree(vacposting);
        }
        else
        {
            /* Delete some (but not all) TIDs from itup */
            Assert(vacposting->ndeletedtids > 0 &&
                   vacposting->ndeletedtids < nitem);
            updatable[nupdatable++] = vacposting;
        }
    }

    /* Physically delete tuples (or TIDs) using deletable (or updatable) */
    _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
                        updatable, nupdatable, heapRel);

    /* be tidy */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]);
}

/*
 * Check that leftsib page (the btpo_prev of target page) is not marked with
 * INCOMPLETE_SPLIT flag.  Used during page deletion.
 *
 * Returning true indicates that page flag is set in leftsib (which is
 * definitely still the left sibling of target).  When that happens, the
 * target doesn't have a downlink in parent, and the page deletion algorithm
 * isn't prepared to handle that.  Deletion of the target page (or the whole
 * subtree that contains the target page) cannot take place.
 *
 * Caller should not have a lock on the target page itself, since pages on
 * the same level must always be locked left to right to avoid deadlocks.
 */
static bool
_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
{
    Buffer buf;
    Page page;
    BTPageOpaque opaque;
    bool result;

    /* Easy case: No left sibling */
    if (leftsib == P_NONE)
        return false;

    buf = _bt_getbuf(rel, leftsib, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * If the left sibling was concurrently split, so that its next-pointer
     * doesn't point to the current page anymore, the split that created
     * target must be completed.  Caller can reasonably expect that there
     * will be a downlink to the target page that it can relocate using its
     * stack.  (We don't allow splitting an incompletely split page again
     * until the previous split has been completed.)
     */
    result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
    _bt_relbuf(rel, buf);

    return result;
}
1702 
1703 /*
1704  * Check that leafrightsib page (the btpo_next of target leaf page) is not
1705  * marked with ISHALFDEAD flag. Used during page deletion.
1706  *
1707  * Returning true indicates that page flag is set in leafrightsib, so page
1708  * deletion cannot go ahead. Our caller is not prepared to deal with the case
1709  * where the parent page does not have a pivot tuple whose downlink points to
1710  * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1711  * seem worth going to the trouble of teaching our caller to deal with it.
1712  * The situation will be resolved after VACUUM finishes the deletion of the
1713  * half-dead page (when a future VACUUM operation reaches the target page
1714  * again).
1715  *
1716  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1717  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1718  * okay because of the restriction on deleting pages that are the rightmost
1719  * page of their parent (i.e. that such deletions can only take place when the
1720  * entire subtree must be deleted). The leaf level check made here will apply
1721  * to a right "cousin" leaf page rather than a simple right sibling leaf page
1722  * in cases where caller actually goes on to attempt deleting pages that are
1723  * above the leaf page. The right cousin leaf page is representative of the
1724  * left edge of the subtree to the right of the to-be-deleted subtree as a
1725  * whole, which is exactly the condition that our caller cares about.
1726  * (Besides, internal pages are never marked half-dead, so it isn't even
1727  * possible to _directly_ assess if an internal page is part of some other
1728  * to-be-deleted subtree.)
1729  */
1730 static bool
1731 _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1732 {
1733  Buffer buf;
1734  Page page;
1735  BTPageOpaque opaque;
1736  bool result;
1737 
1738  Assert(leafrightsib != P_NONE);
1739 
1740  buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1741  page = BufferGetPage(buf);
1742  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1743 
1744  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1745  result = P_ISHALFDEAD(opaque);
1746  _bt_relbuf(rel, buf);
1747 
1748  return result;
1749 }
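 /*
  * Illustrative sketch (not in the original source) of the right "cousin"
  * case described above, for a multi-level to-be-deleted subtree:
  *
  *             subtree parent
  *            /              \
  *       topparent         next pivot's subtree
  *           |                    |
  *        leafbuf  . . . .  right cousin leaf (the page checked here)
  *
  * The right cousin is the leftmost leaf of the subtree immediately to
  * the right of the to-be-deleted subtree.
  */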
1750 
1751 /*
1752  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1753  *
1754  * This action unlinks the leaf page from the b-tree structure, removing all
1755  * pointers leading to it --- but not touching its own left and right links.
1756  * The page cannot be physically reclaimed right away, since other processes
1757  * may currently be trying to follow links leading to the page; they have to
1758  * be allowed to use its right-link to recover. See nbtree/README.
1759  *
1760  * On entry, the target buffer must be pinned and locked (either read or write
1761  * lock is OK). The page must be an empty leaf page, which may be half-dead
1762  * already (a half-dead page should only be passed to us when an earlier
1763  * VACUUM operation was interrupted, though). Note in particular that caller
1764  * should never pass a buffer containing an existing deleted page here. The
1765  * lock and pin on caller's buffer will be dropped before we return.
1766  *
1767  * Returns the number of pages successfully deleted (zero if page cannot
1768  * be deleted now; could be more than one if parent or right sibling pages
1769  * were deleted too). Note that this does not include pages that we delete
1770  * that the btvacuumscan scan has yet to reach; they'll get counted later
1771  * instead.
1772  *
1773  * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1774  * responsible for maintaining *oldestBtpoXact in the case of pages that were
1775  * deleted by a previous VACUUM.
1776  *
1777  * NOTE: this leaks memory. Rather than trying to clean up everything
1778  * carefully, it's better to run it in a temp context that can be reset
1779  * frequently.
1780  */
1781 uint32
1782 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1783 {
1784  uint32 ndeleted = 0;
1785  BlockNumber rightsib;
1786  bool rightsib_empty;
1787  Page page;
1788  BTPageOpaque opaque;
1789 
1790  /*
1791  * Save original leafbuf block number from caller. Only deleted blocks
1792  * that are <= scanblkno get counted in ndeleted return value.
1793  */
1794  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1795 
1796  /*
1797  * "stack" is a search stack leading (approximately) to the target page.
1798  * It is initially NULL, but when iterating, we keep it to avoid
1799  * duplicated search effort.
1800  *
1801  * Also, when "stack" is not NULL, we have already checked that the
1802  * current page is not the right half of an incomplete split, i.e. the
1803  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1804  * when the current target page is to the right of caller's initial page
1805  * (the scanblkno page).
1806  */
1807  BTStack stack = NULL;
1808 
1809  for (;;)
1810  {
1811  page = BufferGetPage(leafbuf);
1812  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1813 
1814  /*
1815  * Internal pages are never deleted directly, only as part of deleting
1816  * the whole subtree all the way down to leaf level.
1817  *
1818  * Also check for deleted pages here. Caller never passes us a fully
1819  * deleted page. Only VACUUM can delete pages, so there can't have
1820  * been a concurrent deletion. Assume that we reached any deleted
1821  * page encountered here by following a sibling link, and that the
1822  * index is corrupt.
1823  */
1824  Assert(!P_ISDELETED(opaque));
1825  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1826  {
1827  /*
1828  * Pre-9.4 page deletion only marked internal pages as half-dead,
1829  * but now we only use that flag on leaf pages. The old algorithm
1830  * was never supposed to leave half-dead pages in the tree; it was
1831  * just a transient state, but it was nevertheless possible in
1832  * error scenarios. We don't know how to deal with them here. They
1833  * are harmless as far as searches are concerned, but inserts
1834  * into the deleted keyspace could add out-of-order downlinks in
1835  * the upper levels. Log a notice, hopefully the admin will notice
1836  * and reindex.
1837  */
1838  if (P_ISHALFDEAD(opaque))
1839  ereport(LOG,
1840  (errcode(ERRCODE_INDEX_CORRUPTED),
1841  errmsg("index \"%s\" contains a half-dead internal page",
1842  RelationGetRelationName(rel)),
1843  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1844 
1845  if (P_ISDELETED(opaque))
1846  ereport(LOG,
1847  (errcode(ERRCODE_INDEX_CORRUPTED),
1848  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1849  BufferGetBlockNumber(leafbuf),
1850  scanblkno,
1851  RelationGetRelationName(rel))));
1852 
1853  _bt_relbuf(rel, leafbuf);
1854  return ndeleted;
1855  }
1856 
1857  /*
1858  * We can never delete rightmost pages nor root pages. While at it,
1859  * check that page is empty, since it's possible that the leafbuf page
1860  * was empty a moment ago, but has since had some inserts.
1861  *
1862  * To keep the algorithm simple, we also never delete an incompletely
1863  * split page (they should be rare enough that this doesn't make any
1864  * meaningful difference to disk usage):
1865  *
1866  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1867  * left half of an incomplete split, but ensuring that it's not the
1868  * right half is more complicated. For that, we have to check that
1869  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1870  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1871  * release the lock on scanblkno/leafbuf, check the left sibling, and
1872  * construct a search stack to scanblkno. On subsequent iterations,
1873  * we know we stepped right from a page that passed these tests, so
1874  * it's OK.
1875  */
1876  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1877  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1878  P_INCOMPLETE_SPLIT(opaque))
1879  {
1880  /* Should never fail to delete a half-dead page */
1881  Assert(!P_ISHALFDEAD(opaque));
1882 
1883  _bt_relbuf(rel, leafbuf);
1884  return ndeleted;
1885  }
1886 
1887  /*
1888  * First, remove downlink pointing to the page (or a parent of the
1889  * page, if we are going to delete a taller subtree), and mark the
1890  * leafbuf page half-dead
1891  */
1892  if (!P_ISHALFDEAD(opaque))
1893  {
1894  /*
1895  * We need an approximate pointer to the page's parent page. We
1896  * use a variant of the standard search mechanism to search for
1897  * the page's high key; this will give us a link to either the
1898  * current parent or someplace to its left (if there are multiple
1899  * equal high keys, which is possible with !heapkeyspace indexes).
1900  *
1901  * Also check if this is the right-half of an incomplete split
1902  * (see comment above).
1903  */
1904  if (!stack)
1905  {
1906  BTScanInsert itup_key;
1907  ItemId itemid;
1908  IndexTuple targetkey;
1909  BlockNumber leftsib,
1910  leafblkno;
1911  Buffer sleafbuf;
1912 
1913  itemid = PageGetItemId(page, P_HIKEY);
1914  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1915 
1916  leftsib = opaque->btpo_prev;
1917  leafblkno = BufferGetBlockNumber(leafbuf);
1918 
1919  /*
1920  * To avoid deadlocks, we'd better drop the leaf page lock
1921  * before going further.
1922  */
1923  _bt_unlockbuf(rel, leafbuf);
1924 
1925  /*
1926  * Check that the left sibling of leafbuf (if any) is not
1927  * marked with INCOMPLETE_SPLIT flag before proceeding
1928  */
1929  Assert(leafblkno == scanblkno);
1930  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1931  {
1932  ReleaseBuffer(leafbuf);
1933  Assert(ndeleted == 0);
1934  return ndeleted;
1935  }
1936 
1937  /* we need an insertion scan key for the search, so build one */
1938  itup_key = _bt_mkscankey(rel, targetkey);
1939  /* find the leftmost leaf page with matching pivot/high key */
1940  itup_key->pivotsearch = true;
1941  stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1942  /* won't need a second lock or pin on leafbuf */
1943  _bt_relbuf(rel, sleafbuf);
1944 
1945  /*
1946  * Re-lock the leaf page, and start over to use our stack
1947  * within _bt_mark_page_halfdead. We must do it that way
1948  * because it's possible that leafbuf can no longer be
1949  * deleted. We need to recheck.
1950  *
1951  * Note: We can't simply hold on to the sleafbuf lock instead,
1952  * because it's barely possible that sleafbuf is not the same
1953  * page as leafbuf. This happens when leafbuf split after our
1954  * original lock was dropped, but before _bt_search finished
1955  * its descent. We rely on the assumption that we'll find
1956  * leafbuf isn't safe to delete anymore in this scenario.
1957  * (Page deletion can cope with the stack being to the left of
1958  * leafbuf, but not to the right of leafbuf.)
1959  */
1960  _bt_lockbuf(rel, leafbuf, BT_WRITE);
1961  continue;
1962  }
1963 
1964  /*
1965  * See if it's safe to delete the leaf page, and determine how
1966  * many parent/internal pages above the leaf level will be
1967  * deleted. If it's safe then _bt_mark_page_halfdead will also
1968  * perform the first phase of deletion, which includes marking the
1969  * leafbuf page half-dead.
1970  */
1971  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
1972  if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
1973  {
1974  _bt_relbuf(rel, leafbuf);
1975  return ndeleted;
1976  }
1977  }
1978 
1979  /*
1980  * Then unlink it from its siblings. Each call to
1981  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
1982  * making it shallower. Iterate until the leafbuf page is deleted.
1983  *
1984  * _bt_unlink_halfdead_page should never fail, since we established
1985  * that deletion is generally safe in _bt_mark_page_halfdead.
1986  */
1987  rightsib_empty = false;
1988  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
1989  while (P_ISHALFDEAD(opaque))
1990  {
1991  /* Check for interrupts in _bt_unlink_halfdead_page */
1992  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
1993  &rightsib_empty, oldestBtpoXact,
1994  &ndeleted))
1995  {
1996  /* _bt_unlink_halfdead_page failed, released buffer */
1997  return ndeleted;
1998  }
1999  }
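 /*
  * Example of the iteration above: for a height-2 subtree (an internal
  * top parent page plus leafbuf), the first _bt_unlink_halfdead_page
  * call unlinks and deletes the internal page, while the second call
  * deletes leafbuf itself, clearing its half-dead status and so ending
  * the loop.
  */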
2000 
2001  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2002  Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
2003  *oldestBtpoXact));
2004 
2005  rightsib = opaque->btpo_next;
2006 
2007  _bt_relbuf(rel, leafbuf);
2008 
2009  /*
2010  * Check here, as calling loops will have locks held, preventing
2011  * interrupts from being processed.
2012  */
2013  CHECK_FOR_INTERRUPTS();
2014 
2015  /*
2016  * The page has now been deleted. If its right sibling is completely
2017  * empty, it's possible that the reason we haven't deleted it earlier
2018  * is that it was the rightmost child of the parent. Now that we
2019  * removed the downlink for this page, the right sibling might now be
2020  * the only child of the parent, and could be removed. It would be
2021  * picked up by the next vacuum anyway, but might as well try to
2022  * remove it now, so loop back to process the right sibling.
2023  *
2024  * Note: This relies on the assumption that _bt_getstackbuf() will be
2025  * able to reuse our original descent stack with a different child
2026  * block (provided that the child block is to the right of the
2027  * original leaf page reached by _bt_search()). It will even update
2028  * the descent stack each time we loop around, avoiding repeated work.
2029  */
2030  if (!rightsib_empty)
2031  break;
2032 
2033  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2034  }
2035 
2036  return ndeleted;
2037 }
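 /*
  * Minimal caller sketch (illustrative only; btvacuumpage is the actual
  * caller, and the variable names here are hypothetical).  Remember that
  * _bt_pagedel always drops the caller's lock and pin:
  *
  *     TransactionId oldestBtpoXact = InvalidTransactionId;
  *     Buffer buf = _bt_getbuf(rel, scanblkno, BT_READ);
  *
  *     pages_deleted += _bt_pagedel(rel, buf, &oldestBtpoXact);
  */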
2038 
2039 /*
2040  * First stage of page deletion.
2041  *
2042  * Establish the height of the to-be-deleted subtree with leafbuf at its
2043  * lowest level, remove the downlink to the subtree, and mark leafbuf
2044  * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
2045  * but may include additional internal pages (at most one per level of the
2046  * tree below the root).
2047  *
2048  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2049  * the rightmost child of its parent (and parent has more than one downlink).
2050  * Returns 'true' when the first stage of page deletion completed
2051  * successfully.
2052  */
2053 static bool
2054 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
2055 {
2056  BlockNumber leafblkno;
2057  BlockNumber leafrightsib;
2058  BlockNumber topparent;
2059  BlockNumber topparentrightsib;
2060  ItemId itemid;
2061  Page page;
2062  BTPageOpaque opaque;
2063  Buffer subtreeparent;
2064  OffsetNumber poffset;
2065  OffsetNumber nextoffset;
2066  IndexTuple itup;
2067  IndexTupleData trunctuple;
2068 
2069  page = BufferGetPage(leafbuf);
2070  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2071 
2072  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2073  P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2074  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2075 
2076  /*
2077  * Save info about the leaf page.
2078  */
2079  leafblkno = BufferGetBlockNumber(leafbuf);
2080  leafrightsib = opaque->btpo_next;
2081 
2082  /*
2083  * Before attempting to lock the parent page, check that the right sibling
2084  * is not in half-dead state. A half-dead right sibling would have no
2085  * downlink in the parent, which would be highly confusing later when we
2086  * delete the downlink. It would fail the "right sibling of target page
2087  * is also the next child in parent page" cross-check below.
2088  */
2089  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2090  {
2091  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2092  leafblkno, leafrightsib);
2093  return false;
2094  }
2095 
2096  /*
2097  * We cannot delete a page that is the rightmost child of its immediate
2098  * parent, unless it is the only child --- in which case the parent has to
2099  * be deleted too, and the same condition applies recursively to it. We
2100  * have to check this condition all the way up before trying to delete,
2101  * and lock the parent of the root of the to-be-deleted subtree (the
2102  * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2103  * for us. We remove the downlink to the "top parent" page (subtree root
2104  * page) from the subtree parent page below.
2105  *
2106  * Initialize topparent to be leafbuf page now. The final to-be-deleted
2107  * subtree is often a degenerate one page subtree consisting only of the
2108  * leafbuf page. When that happens, the leafbuf page is the final subtree
2109  * root page/top parent page.
2110  */
2111  topparent = leafblkno;
2112  topparentrightsib = leafrightsib;
2113  if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
2114  &subtreeparent, &poffset,
2115  &topparent, &topparentrightsib))
2116  return false;
2117 
2118  /*
2119  * Check that the parent-page index items we're about to delete/overwrite
2120  * in subtree parent page contain what we expect. This can fail if the
2121  * index has become corrupt for some reason. We want to throw any error
2122  * before entering the critical section --- otherwise it'd be a PANIC.
2123  */
2124  page = BufferGetPage(subtreeparent);
2125  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2126 
2127 #ifdef USE_ASSERT_CHECKING
2128 
2129  /*
2130  * This is just an assertion because _bt_lock_subtree_parent should have
2131  * guaranteed that the tuple has the expected contents
2132  */
2133  itemid = PageGetItemId(page, poffset);
2134  itup = (IndexTuple) PageGetItem(page, itemid);
2135  Assert(BTreeTupleGetDownLink(itup) == topparent);
2136 #endif
2137 
2138  nextoffset = OffsetNumberNext(poffset);
2139  itemid = PageGetItemId(page, nextoffset);
2140  itup = (IndexTuple) PageGetItem(page, itemid);
2141  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2142  ereport(ERROR,
2143  (errcode(ERRCODE_INDEX_CORRUPTED),
2144  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2145  topparentrightsib, topparent,
2146  BTreeTupleGetDownLink(itup),
2147  BufferGetBlockNumber(subtreeparent),
2148  RelationGetRelationName(rel))));
2149 
2150  /*
2151  * Any insert which would have gone on the leaf block will now go to its
2152  * right sibling. In other words, the key space moves right.
2153  */
2154  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2155 
2156  /* No ereport(ERROR) until changes are logged */
2157  START_CRIT_SECTION();
2158 
2159  /*
2160  * Update parent of subtree. We want to delete the downlink to the top
2161  * parent page/root of the subtree, and the *following* key. Easiest way
2162  * is to copy the right sibling's downlink over the downlink that points
2163  * to top parent page, and then delete the right sibling's original pivot
2164  * tuple.
2165  *
2166  * Lanin and Shasha make the key space move left when deleting a page,
2167  * whereas the key space moves right here. That's why we cannot simply
2168  * delete the pivot tuple with the downlink to the top parent page. See
2169  * nbtree/README.
2170  */
2171  page = BufferGetPage(subtreeparent);
2172  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2173 
2174  itemid = PageGetItemId(page, poffset);
2175  itup = (IndexTuple) PageGetItem(page, itemid);
2176  BTreeTupleSetDownLink(itup, topparentrightsib);
2177 
2178  nextoffset = OffsetNumberNext(poffset);
2179  PageIndexTupleDelete(page, nextoffset);
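 /*
  * Illustrative before/after for the two steps above, writing pivots as
  * (key -> downlink): the parent entries
  * (..., k1 -> topparent, k2 -> topparentrightsib, ...) become
  * (..., k1 -> topparentrightsib, ...).  The keyspace once covered by
  * the to-be-deleted subtree is thereby handed to its right sibling.
  */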
2180 
2181  /*
2182  * Mark the leaf page as half-dead, and stamp it with a link to the top
2183  * parent page. When the leaf page is also the top parent page, the link
2184  * is set to InvalidBlockNumber.
2185  */
2186  page = BufferGetPage(leafbuf);
2187  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2188  opaque->btpo_flags |= BTP_HALF_DEAD;
2189 
2190  Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
2191  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2192  trunctuple.t_info = sizeof(IndexTupleData);
2193  if (topparent != leafblkno)
2194  BTreeTupleSetTopParent(&trunctuple, topparent);
2195  else
2196  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2197 
2198  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2199  IndexTupleSize(&trunctuple)))
2200  elog(ERROR, "could not overwrite high key in half-dead page");
2201 
2202  /* Must mark buffers dirty before XLogInsert */
2203  MarkBufferDirty(subtreeparent);
2204  MarkBufferDirty(leafbuf);
2205 
2206  /* XLOG stuff */
2207  if (RelationNeedsWAL(rel))
2208  {
2209  xl_btree_mark_page_halfdead xlrec;
2210  XLogRecPtr recptr;
2211 
2212  xlrec.poffset = poffset;
2213  xlrec.leafblk = leafblkno;
2214  if (topparent != leafblkno)
2215  xlrec.topparent = topparent;
2216  else
2217  xlrec.topparent = InvalidBlockNumber;
2218 
2219  XLogBeginInsert();
2220  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2221  XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2222 
2223  page = BufferGetPage(leafbuf);
2224  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2225  xlrec.leftblk = opaque->btpo_prev;
2226  xlrec.rightblk = opaque->btpo_next;
2227 
2228  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
2229 
2230  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2231 
2232  page = BufferGetPage(subtreeparent);
2233  PageSetLSN(page, recptr);
2234  page = BufferGetPage(leafbuf);
2235  PageSetLSN(page, recptr);
2236  }
2237 
2238  END_CRIT_SECTION();
2239 
2240  _bt_relbuf(rel, subtreeparent);
2241  return true;
2242 }
2243 
2244 /*
2245  * Second stage of page deletion.
2246  *
2247  * Unlinks a single page (in the subtree undergoing deletion) from its
2248  * siblings. Also marks the page deleted.
2249  *
2250  * To get rid of the whole subtree, including the leaf page itself, call here
2251  * until the leaf page is deleted. The original "top parent" established in
2252  * the first stage of deletion is deleted in the first call here, while the
2253  * leaf page is deleted in the last call here. Note that the leaf page itself
2254  * is often the initial top parent page.
2255  *
2256  * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2257  * the right sibling of the current target page is empty, *rightsib_empty is
2258  * set to true, allowing caller to delete the target's right sibling page in
2259  * passing. Note that *rightsib_empty is only actually used by caller when
2260  * target page is leafbuf, following last call here for leafbuf/the subtree
2261  * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2262  * consistent.)
2263  *
2264  * We maintain *oldestBtpoXact for pages that are deleted by the current
2265  * VACUUM operation here. This must be handled here because we conservatively
2266  * assume that there needs to be a new call to ReadNewTransactionId() each
2267  * time a page gets deleted. See comments about the underlying assumption
2268  * below.
2269  *
2270  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2271  * On success exit, we'll be holding pin and write lock. On failure exit,
2272  * we'll release both pin and lock before returning (we define it that way
2273  * to avoid having to reacquire a lock we already released).
2274  */
2275 static bool
2276 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2277  bool *rightsib_empty, TransactionId *oldestBtpoXact,
2278  uint32 *ndeleted)
2279 {
2280  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2281  BlockNumber leafleftsib;
2282  BlockNumber leafrightsib;
2283  BlockNumber target;
2284  BlockNumber leftsib;
2285  BlockNumber rightsib;
2286  Buffer lbuf = InvalidBuffer;
2287  Buffer buf;
2288  Buffer rbuf;
2289  Buffer metabuf = InvalidBuffer;
2290  Page metapg = NULL;
2291  BTMetaPageData *metad = NULL;
2292  ItemId itemid;
2293  Page page;
2294  PageHeader header;
2295  BTPageOpaque opaque;
2296  bool rightsib_is_rightmost;
2297  int targetlevel;
2298  IndexTuple leafhikey;
2299  BlockNumber nextchild;
2300 
2301  page = BufferGetPage(leafbuf);
2302  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2303 
2304  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2305 
2306  /*
2307  * Remember some information about the leaf page.
2308  */
2309  itemid = PageGetItemId(page, P_HIKEY);
2310  leafhikey = (IndexTuple) PageGetItem(page, itemid);
2311  target = BTreeTupleGetTopParent(leafhikey);
2312  leafleftsib = opaque->btpo_prev;
2313  leafrightsib = opaque->btpo_next;
2314 
2315  _bt_unlockbuf(rel, leafbuf);
2316 
2317  /*
2318  * Check here, as calling loops will have locks held, preventing
2319  * interrupts from being processed.
2320  */
2321  CHECK_FOR_INTERRUPTS();
2322 
2323  /* Unlink the current top parent of the subtree */
2324  if (!BlockNumberIsValid(target))
2325  {
2326  /* Target is leaf page (or leaf page is top parent, if you prefer) */
2327  target = leafblkno;
2328 
2329  buf = leafbuf;
2330  leftsib = leafleftsib;
2331  targetlevel = 0;
2332  }
2333  else
2334  {
2335  /* Target is the internal page taken from leaf's top parent link */
2336  Assert(target != leafblkno);
2337 
2338  /* Fetch the block number of the target's left sibling */
2339  buf = _bt_getbuf(rel, target, BT_READ);
2340  page = BufferGetPage(buf);
2341  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2342  leftsib = opaque->btpo_prev;
2343  targetlevel = opaque->btpo.level;
2344  Assert(targetlevel > 0);
2345 
2346  /*
2347  * To avoid deadlocks, we'd better drop the target page lock before
2348  * going further.
2349  */
2350  _bt_unlockbuf(rel, buf);
2351  }
2352 
2353  /*
2354  * We have to lock the pages we need to modify in the standard order:
2355  * moving right, then up. Else we will deadlock against other writers.
2356  *
2357  * So, first lock the leaf page, if it's not the target. Then find and
2358  * write-lock the current left sibling of the target page. The sibling
2359  * that was current a moment ago could have split, so we may have to move
2360  * right. This search could fail if either the sibling or the target page
2361  * was deleted by someone else meanwhile; if so, give up. (Right now,
2362  * that should never happen, since page deletion is only done in VACUUM
2363  * and there shouldn't be multiple VACUUMs concurrently on the same
2364  * table.)
2365  */
2366  if (target != leafblkno)
2367  _bt_lockbuf(rel, leafbuf, BT_WRITE);
2368  if (leftsib != P_NONE)
2369  {
2370  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2371  page = BufferGetPage(lbuf);
2372  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2373  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2374  {
2375  /* step right one page */
2376  leftsib = opaque->btpo_next;
2377  _bt_relbuf(rel, lbuf);
2378 
2379  /*
2380  * It'd be good to check for interrupts here, but it's not easy to
2381  * do so because a lock is always held. This block isn't
2382  * frequently reached, so hopefully the consequences of not
2383  * checking interrupts aren't too bad.
2384  */
2385 
2386  if (leftsib == P_NONE)
2387  {
2388  ereport(LOG,
2389  (errmsg("no left sibling (concurrent deletion?) of block %u in \"%s\"",
2390  target,
2391  RelationGetRelationName(rel))));
2392  if (target != leafblkno)
2393  {
2394  /* we have only a pin on target, but pin+lock on leafbuf */
2395  ReleaseBuffer(buf);
2396  _bt_relbuf(rel, leafbuf);
2397  }
2398  else
2399  {
2400  /* we have only a pin on leafbuf */
2401  ReleaseBuffer(leafbuf);
2402  }
2403  return false;
2404  }
2405  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2406  page = BufferGetPage(lbuf);
2407  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2408  }
2409  }
2410  else
2411  lbuf = InvalidBuffer;
2412 
2413  /*
2414  * Next write-lock the target page itself. It's okay to take a write lock
2415  * rather than a superexclusive lock, since no scan will stop on an empty
2416  * page.
2417  */
2418  _bt_lockbuf(rel, buf, BT_WRITE);
2419  page = BufferGetPage(buf);
2420  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2421 
2422  /*
2423  * Check page is still empty etc, else abandon deletion. This is just for
2424  * paranoia's sake; a half-dead page cannot resurrect because there can be
2425  * only one vacuum process running at a time.
2426  */
2427  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2428  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2429  target, RelationGetRelationName(rel));
2430 
2431  if (opaque->btpo_prev != leftsib)
2432  ereport(ERROR,
2433  (errcode(ERRCODE_INDEX_CORRUPTED),
2434  errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
2435  target, RelationGetRelationName(rel))));
2436 
2437  if (target == leafblkno)
2438  {
2439  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2440  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2441  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2442  target, RelationGetRelationName(rel));
2443  nextchild = InvalidBlockNumber;
2444  }
2445  else
2446  {
2447  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2448  P_ISLEAF(opaque))
2449  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2450  target, RelationGetRelationName(rel));
2451 
2452  /* Remember the next non-leaf child down in the subtree */
2453  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2454  nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
2455  if (nextchild == leafblkno)
2456  nextchild = InvalidBlockNumber;
2457  }
2458 
2459  /*
2460  * And next write-lock the (current) right sibling.
2461  */
2462  rightsib = opaque->btpo_next;
2463  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2464  page = BufferGetPage(rbuf);
2465  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2466  if (opaque->btpo_prev != target)
2467  ereport(ERROR,
2468  (errcode(ERRCODE_INDEX_CORRUPTED),
2469  errmsg_internal("right sibling's left-link doesn't match: "
2470  "block %u links to %u instead of expected %u in index \"%s\"",
2471  rightsib, opaque->btpo_prev, target,
2472  RelationGetRelationName(rel))));
2473  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2474  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2475 
2476  /*
2477  * If we are deleting the next-to-last page on the target's level, then
2478  * the rightsib is a candidate to become the new fast root. (In theory, it
2479  * might be possible to push the fast root even further down, but the odds
2480  * of doing so are slim, and the locking considerations daunting.)
2481  *
2482  * We can safely acquire a lock on the metapage here --- see comments for
2483  * _bt_newroot().
2484  */
2485  if (leftsib == P_NONE && rightsib_is_rightmost)
2486  {
2487  page = BufferGetPage(rbuf);
2488  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2489  if (P_RIGHTMOST(opaque))
2490  {
2491  /* rightsib will be the only one left on the level */
2492  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2493  metapg = BufferGetPage(metabuf);
2494  metad = BTPageGetMeta(metapg);
2495 
2496  /*
2497  * The expected case here is btm_fastlevel == targetlevel+1; if
2498  * the fastlevel is <= targetlevel, something is wrong, and we
2499  * choose to overwrite it to fix it.
2500  */
2501  if (metad->btm_fastlevel > targetlevel + 1)
2502  {
2503  /* no update wanted */
2504  _bt_relbuf(rel, metabuf);
2505  metabuf = InvalidBuffer;
2506  }
2507  }
2508  }
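 /*
  * Example: when the target is a leaf page (targetlevel 0) and
  * btm_fastlevel is currently 1, rightsib is about to become the only
  * page on its level, so metabuf is kept by the code above; the metapage
  * update further below then makes rightsib the fast root, with
  * btm_fastlevel 0.
  */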
2509 
2510  /*
2511  * Here we begin doing the deletion.
2512  */
2513 
2514  /* No ereport(ERROR) until changes are logged */
2515  START_CRIT_SECTION();
2516 
2517  /*
2518  * Update siblings' side-links. Note the target page's side-links will
2519  * continue to point to the siblings. Asserts here are just rechecking
2520  * things we already verified above.
2521  */
2522  if (BufferIsValid(lbuf))
2523  {
2524  page = BufferGetPage(lbuf);
2525  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2526  Assert(opaque->btpo_next == target);
2527  opaque->btpo_next = rightsib;
2528  }
2529  page = BufferGetPage(rbuf);
2530  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2531  Assert(opaque->btpo_prev == target);
2532  opaque->btpo_prev = leftsib;
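 /*
  * The side-link updates above, illustrated: L <-> T <-> R becomes
  * L <-> R, while the target T still points to both L and R.  A
  * concurrent scan that already landed on T can therefore still recover
  * by following T's right link.
  */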
2533 
2534  /*
2535  * If we deleted a parent of the targeted leaf page, instead of the leaf
2536  * itself, update the leaf to point to the next remaining child in the
2537  * subtree.
2538  *
2539  * Note: We rely on the fact that a buffer pin on the leaf page has been
2540  * held since leafhikey was initialized. This is safe, though only
2541  * because the page was already half-dead at that point. The leaf page
2542  * cannot have been modified by any other backend during the period when
2543  * no lock was held.
2544  */
2545  if (target != leafblkno)
2546  BTreeTupleSetTopParent(leafhikey, nextchild);
2547 
2548  /*
2549  * Mark the page itself deleted. It can be recycled when all current
2550  * transactions are gone. Storing GetTopTransactionId() would work, but
2551  * we're in VACUUM and would not otherwise have an XID. Having already
2552  * updated links to the target, ReadNewTransactionId() suffices as an
2553  * upper bound. Any scan having retained a now-stale link is advertising
2554  * in its PGPROC an xmin less than or equal to the value we read here. It
2555  * will continue to do so, holding back the xmin horizon, for the duration
2556  * of that scan.
2557  */
2558  page = BufferGetPage(buf);
2559  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2560  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2561  opaque->btpo_flags &= ~BTP_HALF_DEAD;
2562  opaque->btpo_flags |= BTP_DELETED;
2563  opaque->btpo.xact = ReadNewTransactionId();
2564 
2565  /*
2566  * Remove the remaining tuples on the page. This keeps things simple for
2567  * WAL consistency checking.
2568  */
2569  header = (PageHeader) page;
2570  header->pd_lower = SizeOfPageHeaderData;
2571  header->pd_upper = header->pd_special;
2572 
2573  /* And update the metapage, if needed */
2574  if (BufferIsValid(metabuf))
2575  {
2576  /* upgrade metapage if needed */
2577  if (metad->btm_version < BTREE_NOVAC_VERSION)
2578  _bt_upgrademetapage(metapg);
2579  metad->btm_fastroot = rightsib;
2580  metad->btm_fastlevel = targetlevel;
2581  MarkBufferDirty(metabuf);
2582  }
2583 
2584  /* Must mark buffers dirty before XLogInsert */
2585  MarkBufferDirty(rbuf);
2586  MarkBufferDirty(buf);
2587  if (BufferIsValid(lbuf))
2588  MarkBufferDirty(lbuf);
2589  if (target != leafblkno)
2590  MarkBufferDirty(leafbuf);
2591 
2592  /* XLOG stuff */
2593  if (RelationNeedsWAL(rel))
2594  {
2595  xl_btree_unlink_page xlrec;
2596  xl_btree_metadata xlmeta;
2597  uint8 xlinfo;
2598  XLogRecPtr recptr;
2599 
2600  XLogBeginInsert();
2601 
2602  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2603  if (BufferIsValid(lbuf))
2604  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2605  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2606  if (target != leafblkno)
2607  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2608 
2609  /* information on the unlinked block */
2610  xlrec.leftsib = leftsib;
2611  xlrec.rightsib = rightsib;
2612  xlrec.btpo_xact = opaque->btpo.xact;
2613 
2614  /* information needed to recreate the leaf block (if not the target) */
2615  xlrec.leafleftsib = leafleftsib;
2616  xlrec.leafrightsib = leafrightsib;
2617  xlrec.topparent = nextchild;
2618 
2619  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2620 
2621  if (BufferIsValid(metabuf))
2622  {
2623  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2624 
2625  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2626  xlmeta.version = metad->btm_version;
2627  xlmeta.root = metad->btm_root;
2628  xlmeta.level = metad->btm_level;
2629  xlmeta.fastroot = metad->btm_fastroot;
2630  xlmeta.fastlevel = metad->btm_fastlevel;
2631  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2632  xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2633  xlmeta.allequalimage = metad->btm_allequalimage;
2634 
2635  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2636  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2637  }
2638  else
2639  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2640 
2641  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2642 
2643  if (BufferIsValid(metabuf))
2644  {
2645  PageSetLSN(metapg, recptr);
2646  }
2647  page = BufferGetPage(rbuf);
2648  PageSetLSN(page, recptr);
2649  page = BufferGetPage(buf);
2650  PageSetLSN(page, recptr);
2651  if (BufferIsValid(lbuf))
2652  {
2653  page = BufferGetPage(lbuf);
2654  PageSetLSN(page, recptr);
2655  }
2656  if (target != leafblkno)
2657  {
2658  page = BufferGetPage(leafbuf);
2659  PageSetLSN(page, recptr);
2660  }
2661  }
2662 
2663  END_CRIT_SECTION();
2664 
2665  /* release metapage */
2666  if (BufferIsValid(metabuf))
2667  _bt_relbuf(rel, metabuf);
2668 
2669  /* release siblings */
2670  if (BufferIsValid(lbuf))
2671  _bt_relbuf(rel, lbuf);
2672  _bt_relbuf(rel, rbuf);
2673 
2674  if (!TransactionIdIsValid(*oldestBtpoXact) ||
2675  TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
2676  *oldestBtpoXact = opaque->btpo.xact;
2677 
2678  /*
2679  * If btvacuumscan won't revisit this page in a future btvacuumpage call
2680  * (and so won't count it as deleted then), we count it as deleted by
2681  * the current btvacuumpage call
2682  */
2683  if (target <= scanblkno)
2684  (*ndeleted)++;
2685 
2686  /* If the target is not leafbuf, we're done with it now -- release it */
2687  if (target != leafblkno)
2688  _bt_relbuf(rel, buf);
2689 
2690  return true;
2691 }
2692 
2693 /*
2694  * Establish how tall the to-be-deleted subtree will be during the first stage
2695  * of page deletion.
2696  *
2697  * Caller's child argument is the block number of the page caller wants to
2698  * delete (this is leafbuf's block number, except when we're called
2699  * recursively). stack is a search stack leading to it. Note that we will
2700  * update the stack entry(s) to reflect current downlink positions --- this is
2701  * similar to the corresponding point in page split handling.
2702  *
2703  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2704  * false. Returns true on success, in which case caller can use certain
2705  * details established here to perform the first stage of deletion. This
2706  * function is the last point at which page deletion may be deemed unsafe
2707  * (barring index corruption, or unexpected concurrent page deletions).
2708  *
2709  * We write lock the parent of the root of the to-be-deleted subtree for
2710  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2711  * caller). Caller will have to remove a downlink from *subtreeparent. We
2712  * also set a *subtreeparent offset number in *poffset, to indicate the
2713  * location of the pivot tuple that contains the relevant downlink.
2714  *
2715  * The root of the to-be-deleted subtree is called the "top parent". Note
2716  * that the leafbuf page is often the final "top parent" page (you can think
2717  * of the leafbuf page as a degenerate single page subtree when that happens).
2718  * Caller should initialize *topparent to the target leafbuf page block number
2719  * (while *topparentrightsib should be set to leafbuf's right sibling block
2720  * number). We will update *topparent (and *topparentrightsib) for caller
2721  * here, though only when it turns out that caller will delete at least one
2722  * internal page (i.e. only when caller needs to store a valid link to the top
2723  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2724  */
2725 static bool
2726 _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
2727  Buffer *subtreeparent, OffsetNumber *poffset,
2728  BlockNumber *topparent, BlockNumber *topparentrightsib)
2729 {
2730  BlockNumber parent,
2731  leftsibparent;
2732  OffsetNumber parentoffset,
2733  maxoff;
2734  Buffer pbuf;
2735  Page page;
2736  BTPageOpaque opaque;
2737 
2738  /*
2739  * Locate the pivot tuple whose downlink points to "child". Write lock
2740  * the parent page itself.
2741  */
2742  pbuf = _bt_getstackbuf(rel, stack, child);
2743  if (pbuf == InvalidBuffer)
2744  ereport(ERROR,
2745  (errcode(ERRCODE_INDEX_CORRUPTED),
2746  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2747  RelationGetRelationName(rel), child)));
2748  parent = stack->bts_blkno;
2749  parentoffset = stack->bts_offset;
2750 
2751  page = BufferGetPage(pbuf);
2752  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2753  maxoff = PageGetMaxOffsetNumber(page);
2754  leftsibparent = opaque->btpo_prev;
2755 
2756  /*
2757  * _bt_getstackbuf() completes page splits on returned parent buffer when
2758  * required.
2759  *
2760  * In general it's a bad idea for VACUUM to use up more disk space, which
2761  * is why page deletion does not finish incomplete page splits most of the
2762  * time. We allow this limited exception because the risk is much lower,
2763  * and the potential downside of not proceeding is much higher: A single
2764  * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2765  * prevent us from deleting hundreds of empty leaf pages from one level
2766  * down.
2767  */
2768  Assert(!P_INCOMPLETE_SPLIT(opaque));
2769 
2770  if (parentoffset < maxoff)
2771  {
2772  /*
2773  * Child is not the rightmost child in parent, so it's safe to delete
2774  * the subtree whose root/topparent is child page
2775  */
2776  *subtreeparent = pbuf;
2777  *poffset = parentoffset;
2778  return true;
2779  }
2780 
2781  /*
2782  * Child is the rightmost child of parent.
2783  *
2784  * Since it's the rightmost child of parent, deleting the child (or
2785  * deleting the subtree whose root/topparent is the child page) is only
2786  * safe when it's also possible to delete the parent.
2787  */
2788  Assert(parentoffset == maxoff);
2789  if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2790  {
2791  /*
2792  * Child isn't parent's only child, or parent is rightmost on its
2793  * entire level. Definitely cannot delete any pages.
2794  */
2795  _bt_relbuf(rel, pbuf);
2796  return false;
2797  }
2798 
2799  /*
2800  * Now make sure that the parent deletion is itself safe by examining the
2801  * child's grandparent page. Recurse, passing the parent page as the
2802  * child page (child's grandparent is the parent on the next level up). If
2803  * parent deletion is unsafe, then child deletion must also be unsafe (in
2804  * which case caller cannot delete any pages at all).
2805  */
2806  *topparent = parent;
2807  *topparentrightsib = opaque->btpo_next;
2808 
2809  /*
2810  * Release lock on parent before recursing.
2811  *
2812  * It's OK to release page locks on parent before recursive call locks
2813  * grandparent. An internal page can only acquire an entry if the child
2814  * is split, but that cannot happen as long as we still hold a lock on the
2815  * leafbuf page.
2816  */
2817  _bt_relbuf(rel, pbuf);
2818 
2819  /*
2820  * Before recursing, check that the left sibling of parent (if any) is not
2821  * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2822  * parent lock).
2823  *
2824  * Note: We deliberately avoid completing incomplete splits here.
2825  */
2826  if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2827  return false;
2828 
2829  /* Recurse to examine child page's grandparent page */
2830  return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2831  subtreeparent, poffset,
2832  topparent, topparentrightsib);
2833 }
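 /*
  * Illustrative recursion trace: if child is its parent's only child, we
  * set *topparent to the parent and recurse with the parent as the new
  * child.  If that parent is likewise the only child of the grandparent,
  * *topparent moves up another level, and so on, until we reach a level
  * where the deletion is safe (or return false all the way back down).
  */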