/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *	  BTree-specific page management code for the Postgres btree access
 *	  method.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtpage.c
 *
 * NOTES
 *	  Postgres btree pages look like ordinary relation pages.  The opaque
 *	  data at high addresses includes pointers to left and right siblings
 *	  and flag data describing page state.  The first page in a btree, page
 *	  zero, is special -- it stores meta-information describing the tree.
 *	  Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     bool *rightsib_empty);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                                     OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
                                   BTStack stack, Buffer *topparent,
                                   OffsetNumber *topoff,
                                   BlockNumber *target, BlockNumber *rightsib);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               TransactionId latestRemovedXid);

/*
 *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                 bool allequalimage)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    metad->btm_allequalimage = allequalimage;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
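
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical helper, not part of nbtpage.c):
 * how a caller might use _bt_initmetapage() to set up the metapage of a
 * new, empty index, loosely modeled on nbtree's empty-index build path.
 * The palloc'd page would still need to be written out and WAL-logged by
 * the caller.
 */
static Page
example_build_empty_metapage(bool allequalimage)
{
    Page        metapage = (Page) palloc(BLCKSZ);

    /* no root page exists yet, so pass P_NONE at level zero */
    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
    return metapage;
}
#endif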
83 
84 /*
85  * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
86  * 3, the last version that can be updated without broadly affecting
87  * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
88  *
89  * This routine does purely in-memory image upgrade. Caller is
90  * responsible for locking, WAL-logging etc.
91  */
92 void
94 {
95  BTMetaPageData *metad;
97 
98  metad = BTPageGetMeta(page);
99  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
100 
101  /* It must be really a meta page of upgradable version */
102  Assert(metaopaque->btpo_flags & BTP_META);
105 
106  /* Set version number and fill extra fields added into version 3 */
109  metad->btm_last_cleanup_num_heap_tuples = -1.0;
110  /* Only a REINDEX can set this field */
111  Assert(!metad->btm_allequalimage);
112  metad->btm_allequalimage = false;
113 
114  /* Adjust pd_lower (see _bt_initmetapage() for details) */
115  ((PageHeader) page)->pd_lower =
116  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
117 }

/*
 * Get metadata from share-locked buffer containing metapage, while performing
 * standard sanity checks.
 *
 * Callers that cache data returned here in local cache should note that an
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page        metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    return metad;
}

/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *		the metapage.
 *
 *		This routine checks whether the provided cleanup-related information
 *		matches what is stored in the metapage.  On mismatch, the metapage is
 *		overwritten.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
                             float8 numHeapTuples)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;
    bool        needsRewrite = false;
    XLogRecPtr  recptr;

    /* read the metapage and check if it needs rewrite */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /* outdated version of metapage always needs rewrite */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        needsRewrite = true;
    else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
             metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
        needsRewrite = true;

    if (!needsRewrite)
    {
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    LockBuffer(metabuf, BT_WRITE);

    START_CRIT_SECTION();

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_oldest_btpo_xact = oldestBtpoXact;
    metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.oldest_btpo_xact = oldestBtpoXact;
        md.last_cleanup_num_heap_tuples = numHeapTuples;
        md.allequalimage = metad->btm_allequalimage;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();
    _bt_relbuf(rel, metabuf);
}

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(!metad->btm_allequalimage ||
               metad->btm_version > BTREE_NOVAC_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page        metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition: if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_oldest_btpo_xact = InvalidTransactionId;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.oldest_btpo_xact = InvalidTransactionId;
            md.last_cleanup_num_heap_tuples = -1.0;
            md.allequalimage = metad->btm_allequalimage;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
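
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * typical use of _bt_getroot() for a read-only descent.  Per the comments
 * above, a BT_READ caller must be prepared for InvalidBuffer when the
 * index has no root page yet.
 */
static void
example_visit_root(Relation rel)
{
    Buffer      rootbuf = _bt_getroot(rel, BT_READ);

    if (!BufferIsValid(rootbuf))
        return;                 /* empty index, no root page yet */

    /* ... inspect the pinned, read-locked (fast) root page here ... */

    _bt_relbuf(rel, rootbuf);
}
#endif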

/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 *	_bt_getrootheight() -- Get the height of the btree search tree.
 *
 *		We return the level (counting from zero) of the current fast root.
 *		This represents the number of tree levels we'd have to descend through
 *		to start any btree index search.
 *
 *		This is used by the planner for cost-estimation purposes.  Since it's
 *		only an estimate, slightly-stale data is fine, hence we don't worry
 *		about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}
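
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * the planner treats tree height as a cheap cost input, so a slightly
 * stale cached value is acceptable.  The per-level cost factor below is a
 * made-up number, purely for illustration.
 */
static double
example_descent_cost(Relation indexrel)
{
    int         height = _bt_getrootheight(indexrel);

    /* pretend each level descended costs 50 "cpu units" (invented figure) */
    return 50.0 * height;
}
#endif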

/*
 *	_bt_metaversion() -- Get version/status info from metapage.
 *
 *		Sets caller's *heapkeyspace and *allequalimage arguments using data
 *		from the B-Tree metapage (could be locally-cached version).  This
 *		information needs to be stashed in insertion scankey, so we provide a
 *		single function that fetches both at once.
 *
 *		This is used to determine the rules that must be used to descend a
 *		btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *		pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *		performance when inserting a new BTScanInsert-wise duplicate tuple
 *		among many leaf pages already full of such duplicates.
 *
 *		Also sets allequalimage field, which indicates whether or not it is
 *		safe to apply deduplication.  We rely on the assumption that
 *		btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 *		pg_upgrade'd from Postgres 12.
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}
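
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * fetching both metapage facts at once, as an insertion-scankey builder
 * would, since the descent rules depend on them.
 */
static void
example_check_version(Relation rel)
{
    bool        heapkeyspace;
    bool        allequalimage;

    _bt_metaversion(rel, &heapkeyspace, &allequalimage);

    if (!heapkeyspace)
        elog(DEBUG2, "pg_upgrade'd pre-v4 index; heap TID is not a tiebreaker");
    if (allequalimage)
        elog(DEBUG2, "deduplication is safe for this index");
}
#endif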

/*
 *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page.  This record only exists to provide
     * a conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case).
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable.  Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than btpo.xact.  This can only apply if the page does
                     * have a valid btpo.xact value, ie not if it's new.  (We
                     * must check that because an all-zero page has no special
                     * space.)
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
                        !PageIsNew(page))
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}
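
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * the two ways of calling _bt_getbuf().  P_NEW requires BT_WRITE and hands
 * back an initialized page, recycled from the FSM when possible.
 */
static void
example_getbuf_modes(Relation rel, BlockNumber blkno)
{
    Buffer      buf;

    /* read and lock an existing page; _bt_checkpage() has vetted it */
    buf = _bt_getbuf(rel, blkno, BT_READ);
    _bt_relbuf(rel, buf);

    /* allocate (or recycle) a fresh page, returned write-locked */
    buf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    _bt_relbuf(rel, buf);
}
#endif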

/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}
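
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * _bt_relandgetbuf() is the idiomatic way to walk right along a level,
 * dropping each page before locking its right sibling.
 */
static void
example_step_right(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    while (!P_RIGHTMOST(opaque))
    {
        buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }
    _bt_relbuf(rel, buf);
}
#endif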

/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *	_bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.  But note that _bt_getbuf
 * knows enough to distinguish the PageIsNew condition from the other one.
 * At some point it might be appropriate to redesign this to have a three-way
 * result value.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page.  If we find
     * a zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
        return true;
    return false;
}

/*
 * Delete item(s) from a btree leaf page during VACUUM.
 *
 * This routine assumes that the caller has a super-exclusive write lock on
 * the buffer.  Also, the given deletable and updatable arrays *must* be
 * sorted in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed by VACUUM.  This works
 * by updating/overwriting an existing item with caller's new version of the
 * item (a version that lacks the TIDs that are to be deleted).
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
 * generate their own latestRemovedXid by accessing the heap directly, whereas
 * VACUUMs rely on the initial heap scan taking care of it indirectly.  Also,
 * only VACUUM can perform granular deletes of individual TIDs in posting list
 * tuples.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    Size        itemsz;
    char       *updatedbuf = NULL;
    Size        updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    for (int i = 0; i < nupdatable; i++)
    {
        /* Replace work area IndexTuple with updated version */
        _bt_update_posting(updatable[i]);

        /* Maintain array of updatable page offsets for WAL record */
        updatedoffsets[i] = updatable[i]->updatedoffset;
    }

    /* XLOG stuff -- allocate and fill buffer before critical section */
    if (nupdatable > 0 && RelationNeedsWAL(rel))
    {
        Size        offset = 0;

        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];

            itemsz = SizeOfBtreeUpdate +
                vacposting->ndeletedtids * sizeof(uint16);
            updatedbuflen += itemsz;
        }

        updatedbuf = palloc(updatedbuflen);
        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];
            xl_btree_update update;

            update.ndeletedtids = vacposting->ndeletedtids;
            memcpy(updatedbuf + offset, &update.ndeletedtids,
                   SizeOfBtreeUpdate);
            offset += SizeOfBtreeUpdate;

            itemsz = update.ndeletedtids * sizeof(uint16);
            memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
            offset += itemsz;
        }
    }

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Handle posting tuple updates.
     *
     * Deliberately do this before handling simple deletes.  If we did it the
     * other way around (i.e. WAL record order -- simple deletes before
     * updates) then we'd have to make compensating changes to the 'updatable'
     * array of offset numbers.
     *
     * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
     * happens to already be set.  Although we unset the BTP_HAS_GARBAGE page
     * level flag, unsetting individual LP_DEAD bits should still be avoided.
     */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple  itup;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    /* Now handle simple deletes of entire tuples */
    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't targeted by VACUUM's heap scan), but it will be true often
     * enough.  VACUUM does not delete items purely because they have their
     * LP_DEAD bit set, since doing so would necessitate explicitly logging a
     * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
     *
     * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
     * limited, since we never falsely unset an LP_DEAD bit.  Workloads that
     * are particularly dependent on LP_DEAD bits being set quickly will
     * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
     * again anyway.  Furthermore, attempting a deduplication pass will remove
     * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
     * is set or not.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.ndeleted = ndeletable;
        xlrec_vacuum.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples generated by calling _bt_update_posting() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}
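
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * the shape of a VACUUM-side call.  The caller gathers sorted-ascending
 * arrays of whole-tuple deletions and posting-list updates for one leaf
 * page, then applies them in a single WAL-logged operation.
 */
static void
example_vacuum_one_page(Relation rel, Buffer buf,
                        OffsetNumber *deletable, int ndeletable,
                        BTVacuumPosting *updatable, int nupdatable)
{
    /* _bt_delitems_vacuum() must not be called with nothing to do */
    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);
}
#endif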

/*
 * Delete item(s) from a btree leaf page during single-page cleanup.
 *
 * This routine assumes that the caller has pinned and write locked the
 * buffer.  Also, the given deletable array *must* be sorted in ascending
 * order.
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but it needs to generate its own latestRemovedXid by accessing
 * the heap.  This is used by the REDO routine to generate recovery conflicts.
 * Also, it doesn't handle posting list tuples unless the entire tuple can be
 * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
 */
void
_bt_delitems_delete(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    TransactionId latestRemovedXid = InvalidTransactionId;

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0);

    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
        latestRemovedXid =
            _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.  Just clear the BTP_HAS_GARBAGE
     * page flag, since we deleted all items with their LP_DEAD bit set.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.ndeleted = ndeletable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        /*
         * The deletable array is not in the buffer, but pretend that it is.
         * When XLogInsert stores the whole buffer, the array need not be
         * stored too.
         */
        XLogRegisterBufData(0, (char *) deletable,
                            ndeletable * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Get the latestRemovedXid from the table entries pointed to by the non-pivot
 * tuples being deleted.
 *
 * This is a specialized version of index_compute_xid_horizon_for_tuples().
 * It's needed because btree tuples don't always store table TID using the
 * standard index tuple header field.
 */
static TransactionId
_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                OffsetNumber *deletable, int ndeletable)
{
    TransactionId latestRemovedXid = InvalidTransactionId;
    int         spacenhtids;
    int         nhtids;
    ItemPointer htids;

    /* Array will grow iff there are posting list tuples to consider */
    spacenhtids = ndeletable;
    nhtids = 0;
    htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
    for (int i = 0; i < ndeletable; i++)
    {
        ItemId      itemid;
        IndexTuple  itup;

        itemid = PageGetItemId(page, deletable[i]);
        itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(ItemIdIsDead(itemid));
        Assert(!BTreeTupleIsPivot(itup));

        if (!BTreeTupleIsPosting(itup))
        {
            if (nhtids + 1 > spacenhtids)
            {
                spacenhtids *= 2;
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            Assert(ItemPointerIsValid(&itup->t_tid));
            ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
            nhtids++;
        }
        else
        {
            int         nposting = BTreeTupleGetNPosting(itup);

            if (nhtids + nposting > spacenhtids)
            {
                spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            for (int j = 0; j < nposting; j++)
            {
                ItemPointer htid = BTreeTupleGetPostingN(itup, j);

                Assert(ItemPointerIsValid(htid));
                ItemPointerCopy(htid, &htids[nhtids]);
                nhtids++;
            }
        }
    }

    Assert(nhtids >= ndeletable);

    latestRemovedXid =
        table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);

    pfree(htids);

    return latestRemovedXid;
}

/*
 * Returns true if the given block has the half-dead flag set.
 */
static bool
_bt_is_page_halfdead(Relation rel, BlockNumber blk)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    bool        result;

    buf = _bt_getbuf(rel, blk, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    result = P_ISHALFDEAD(opaque);
    _bt_relbuf(rel, buf);

    return result;
}

/*
 * Subroutine to find the parent of the branch we're deleting.  This climbs
 * up the tree until it finds a page with more than one child, i.e. a page
 * that will not be totally emptied by the deletion.  The chain of pages below
 * it, with one downlink each, will form the branch that we need to delete.
 *
 * If we cannot remove the downlink from the parent, because it's the
 * rightmost entry, returns false.  On success, *topparent and *topoff are set
 * to the buffer holding the parent, and the offset of the downlink in it.
 * *topparent is write-locked, the caller is responsible for releasing it when
 * done.  *target is set to the topmost page in the branch to-be-deleted, i.e.
 * the page whose downlink *topparent / *topoff point to, and *rightsib to its
 * right sibling.
 *
 * "child" is the leaf page we wish to delete, and "stack" is a search stack
 * leading to it (it actually leads to the leftmost leaf page with a high key
 * matching that of the page to be deleted in !heapkeyspace indexes).  Note
 * that we will update the stack entry(s) to reflect current downlink
 * positions --- this is essentially the same as the corresponding step of
 * splitting, and is not expected to affect caller.  The caller should
 * initialize *target and *rightsib to the leaf page and its right sibling.
 *
 * Note: it's OK to release page locks on any internal pages between the leaf
 * and *topparent, because a safe deletion can't become unsafe due to
 * concurrent activity.  An internal page can only acquire an entry if the
 * child is split, but that cannot happen as long as we hold a lock on the
 * leaf.
 */
static bool
_bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
                       Buffer *topparent, OffsetNumber *topoff,
                       BlockNumber *target, BlockNumber *rightsib)
{
    BlockNumber parent;
    OffsetNumber poffset,
                maxoff;
    Buffer      pbuf;
    Page        page;
    BTPageOpaque opaque;
    BlockNumber leftsib;

    /*
     * Locate the downlink of "child" in the parent, updating the stack entry
     * if needed.  This is how !heapkeyspace indexes deal with having
     * non-unique high keys in leaf level pages.  Even heapkeyspace indexes
     * can have a stale stack due to insertions into the parent.
     */
    pbuf = _bt_getstackbuf(rel, stack, child);
    if (pbuf == InvalidBuffer)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
                                 RelationGetRelationName(rel), child)));
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child.
     */
    if (poffset >= maxoff)
    {
        /* It's rightmost child... */
        if (poffset == P_FIRSTDATAKEY(opaque))
        {
            /*
             * It's only child, so safe if parent would itself be removable.
             * We have to check the parent itself, and then recurse to test
             * the conditions at the parent's parent.
             */
            if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
                P_INCOMPLETE_SPLIT(opaque))
            {
                _bt_relbuf(rel, pbuf);
                return false;
            }

            *target = parent;
            *rightsib = opaque->btpo_next;
            leftsib = opaque->btpo_prev;

            _bt_relbuf(rel, pbuf);

            /*
             * Like in _bt_pagedel, check that the left sibling is not marked
             * with INCOMPLETE_SPLIT flag.  That would mean that there is no
             * downlink to the page to be deleted, and the page deletion
             * algorithm isn't prepared to handle that.
             */
            if (leftsib != P_NONE)
            {
                Buffer      lbuf;
                Page        lpage;
                BTPageOpaque lopaque;

                lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                lpage = BufferGetPage(lbuf);
                lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                /*
                 * If the left sibling was concurrently split, so that its
                 * next-pointer doesn't point to the current page anymore, the
                 * split that created the current page must be completed.  (We
                 * don't allow splitting an incompletely split page again
                 * until the previous split has been completed.)
                 */
                if (lopaque->btpo_next == parent &&
                    P_INCOMPLETE_SPLIT(lopaque))
                {
                    _bt_relbuf(rel, lbuf);
                    return false;
                }
                _bt_relbuf(rel, lbuf);
            }

            return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
                                          topparent, topoff, target, rightsib);
        }
        else
        {
            /* Unsafe to delete */
            _bt_relbuf(rel, pbuf);
            return false;
        }
    }
    else
    {
        /* Not rightmost child, so safe to delete */
        *topparent = pbuf;
        *topoff = poffset;
        return true;
    }
}

/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent or sibling pages were
 * deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf)
{
    int         ndeleted = 0;
    BlockNumber rightsib;
    bool        rightsib_empty;
    Page        page;
    BTPageOpaque opaque;

    /*
     * "stack" is a search stack leading (approximately) to the target page.
     * It is initially NULL, but when iterating, we keep it to avoid
     * duplicated search effort.
     *
     * Also, when "stack" is not NULL, we have already checked that the
     * current page is not the right half of an incomplete split, i.e. the
     * left sibling does not have its INCOMPLETE_SPLIT flag set.
     */
    BTStack     stack = NULL;

    for (;;)
    {
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * Internal pages are never deleted directly, only as part of deleting
         * the whole branch all the way down to leaf level.
         */
        if (!P_ISLEAF(opaque))
        {
            /*
             * Pre-9.4 page deletion only marked internal pages as half-dead,
             * but now we only use that flag on leaf pages.  The old algorithm
             * was never supposed to leave half-dead pages in the tree, it was
             * just a transient state, but it was nevertheless possible in
             * error scenarios.  We don't know how to deal with them here.
             * They are harmless as far as searches are considered, but
             * inserts into the deleted keyspace could add out-of-order
             * downlinks in the upper levels.  Log a notice, hopefully the
             * admin will notice and reindex.
             */
            if (P_ISHALFDEAD(opaque))
                ereport(LOG,
                        (errcode(ERRCODE_INDEX_CORRUPTED),
                         errmsg("index \"%s\" contains a half-dead internal page",
                                RelationGetRelationName(rel)),
                         errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * We can never delete rightmost pages nor root pages.  While at it,
         * check that page is not already deleted and is empty.
         *
         * To keep the algorithm simple, we also never delete an incompletely
         * split page (they should be rare enough that this doesn't make any
         * meaningful difference to disk usage):
         *
         * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
         * left half of an incomplete split, but ensuring that it's not the
         * right half is more complicated.  For that, we have to check that
         * the left sibling doesn't have its INCOMPLETE_SPLIT flag set.  On
         * the first iteration, we temporarily release the lock on the current
         * page, check the left sibling, and also construct a search stack to
         * the current page.  On subsequent iterations, we know we stepped
         * right from a page that passed these tests, so it's OK.
         */
        if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
            P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
            P_INCOMPLETE_SPLIT(opaque))
        {
            /* Should never fail to delete a half-dead page */
            Assert(!P_ISHALFDEAD(opaque));

            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * First, remove downlink pointing to the page (or a parent of the
         * page, if we are going to delete a taller branch), and mark the page
         * as half-dead.
         */
        if (!P_ISHALFDEAD(opaque))
        {
            /*
             * We need an approximate pointer to the page's parent page.  We
             * use a variant of the standard search mechanism to search for
             * the page's high key; this will give us a link to either the
             * current parent or someplace to its left (if there are multiple
             * equal high keys, which is possible with !heapkeyspace indexes).
             *
             * Also check if this is the right-half of an incomplete split
             * (see comment above).
             */
            if (!stack)
            {
                BTScanInsert itup_key;
                ItemId      itemid;
                IndexTuple  targetkey;
                Buffer      lbuf;
                BlockNumber leftsib;

                itemid = PageGetItemId(page, P_HIKEY);
                targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

                leftsib = opaque->btpo_prev;

                /*
                 * To avoid deadlocks, we'd better drop the leaf page lock
                 * before going further.
                 */
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);

                /*
                 * Fetch the left sibling, to check that it's not marked with
                 * INCOMPLETE_SPLIT flag.  That would mean that the page
                 * to-be-deleted doesn't have a downlink, and the page
                 * deletion algorithm isn't prepared to handle that.
                 */
                if (!P_LEFTMOST(opaque))
                {
                    BTPageOpaque lopaque;
                    Page        lpage;

                    lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                    lpage = BufferGetPage(lbuf);
                    lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                    /*
                     * If the left sibling is split again by another backend,
                     * after we released the lock, we know that the first
                     * split must have finished, because we don't allow an
                     * incompletely-split page to be split again.  So we don't
                     * need to walk right here.
                     */
                    if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
                        P_INCOMPLETE_SPLIT(lopaque))
                    {
                        ReleaseBuffer(buf);
                        _bt_relbuf(rel, lbuf);
                        return ndeleted;
                    }
                    _bt_relbuf(rel, lbuf);
                }

                /* we need an insertion scan key for the search, so build one */
                itup_key = _bt_mkscankey(rel, targetkey);
                /* find the leftmost leaf page with matching pivot/high key */
                itup_key->pivotsearch = true;
                stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
                /* don't need a lock or second pin on the page */
                _bt_relbuf(rel, lbuf);

                /*
                 * Re-lock the leaf page, and start over, to re-check that the
                 * page can still be deleted.
                 */
                LockBuffer(buf, BT_WRITE);
                continue;
            }

            if (!_bt_mark_page_halfdead(rel, buf, stack))
            {
                _bt_relbuf(rel, buf);
                return ndeleted;
            }
        }

        /*
         * Then unlink it from its siblings.  Each call to
         * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
         * making it shallower.  Iterate until the leaf page is gone.
         */
        rightsib_empty = false;
        while (P_ISHALFDEAD(opaque))
        {
            /* will check for interrupts, once lock is released */
            if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
            {
                /* _bt_unlink_halfdead_page already released buffer */
                return ndeleted;
            }
            ndeleted++;
        }

        rightsib = opaque->btpo_next;

        _bt_relbuf(rel, buf);

        /*
         * Check here, as calling loops will have locks held, preventing
         * interrupts from being processed.
         */
        CHECK_FOR_INTERRUPTS();

        /*
         * The page has now been deleted.  If its right sibling is completely
         * empty, it's possible that the reason we haven't deleted it earlier
         * is that it was the rightmost child of the parent.  Now that we
         * removed the downlink for this page, the right sibling might now be
         * the only child of the parent, and could be removed.  It would be
         * picked up by the next vacuum anyway, but might as well try to
         * remove it now, so loop back to process the right sibling.
         */
        if (!rightsib_empty)
            break;

        buf = _bt_getbuf(rel, rightsib, BT_WRITE);
    }

    return ndeleted;
}
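
#ifdef NOT_USED
/*
 * Illustrative sketch only (hypothetical caller, not part of nbtpage.c):
 * VACUUM attempting deletion of an empty leaf page.  _bt_pagedel() consumes
 * the pin and lock either way, and the count may cover extra parent or
 * sibling pages.  Since _bt_pagedel() can leak memory, a real caller would
 * run it in a short-lived memory context that gets reset frequently.
 */
static int
example_try_delete_page(Relation rel, Buffer buf)
{
    /* buf is pinned and locked on entry; _bt_pagedel() releases it */
    return _bt_pagedel(rel, buf);
}
#endif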
1707 
1708 /*
1709  * First stage of page deletion. Remove the downlink to the top of the
1710  * branch being deleted, and mark the leaf page as half-dead.
1711  */
1712 static bool
1714 {
1715  BlockNumber leafblkno;
1716  BlockNumber leafrightsib;
1717  BlockNumber target;
1718  BlockNumber rightsib;
1719  ItemId itemid;
1720  Page page;
1721  BTPageOpaque opaque;
1722  Buffer topparent;
1723  OffsetNumber topoff;
1724  OffsetNumber nextoffset;
1725  IndexTuple itup;
1726  IndexTupleData trunctuple;
1727 
1728  page = BufferGetPage(leafbuf);
1729  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1730 
1731  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) &&
1732  !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) &&
1733  P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1734 
1735  /*
1736  * Save info about the leaf page.
1737  */
1738  leafblkno = BufferGetBlockNumber(leafbuf);
1739  leafrightsib = opaque->btpo_next;
1740 
1741  /*
1742  * Before attempting to lock the parent page, check that the right sibling
1743  * is not in half-dead state. A half-dead right sibling would have no
1744  * downlink in the parent, which would be highly confusing later when we
1745  * delete the downlink that follows the current page's downlink. (I
1746  * believe the deletion would work correctly, but it would fail the
1747  * cross-check we make that the following downlink points to the right
1748  * sibling of the delete page.)
1749  */
1750  if (_bt_is_page_halfdead(rel, leafrightsib))
1751  {
1752  elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
1753  leafblkno, leafrightsib);
1754  return false;
1755  }
1756 
1757  /*
1758  * We cannot delete a page that is the rightmost child of its immediate
1759  * parent, unless it is the only child --- in which case the parent has to
1760  * be deleted too, and the same condition applies recursively to it. We
1761  * have to check this condition all the way up before trying to delete,
1762  * and lock the final parent of the to-be-deleted subtree.
1763  *
1764  * However, we won't need to repeat the above _bt_is_page_halfdead() check
1765  * for parent/ancestor pages because of the rightmost restriction. The
1766  * leaf check will apply to a right "cousin" leaf page rather than a
1767  * simple right sibling leaf page in cases where we actually go on to
1768  * perform internal page deletion. The right cousin leaf page is
1769  * representative of the left edge of the subtree to the right of the
1770  * to-be-deleted subtree as a whole. (Besides, internal pages are never
1771  * marked half-dead, so it isn't even possible to directly assess if an
1772  * internal page is part of some other to-be-deleted subtree.)
1773  */
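 /*
  * (Rough sketch of the "right cousin" case described above, where the
  * leaf is its parent's only child and the whole branch must die:
  *
  *              grandparent
  *             /           \
  *     parent (to die)   parent's right sibling
  *         |                     |
  *       leaf  --right-link-->  right "cousin" leaf
  *
  * The leaf's right sibling then sits under a different parent, so the
  * half-dead check above was made against a cousin rather than a sibling
  * sharing the to-be-deleted parent.)
  */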
1774  rightsib = leafrightsib;
1775  target = leafblkno;
1776  if (!_bt_lock_branch_parent(rel, leafblkno, stack,
1777  &topparent, &topoff, &target, &rightsib))
1778  return false;
1779 
1780  /*
1781  * Check that the parent-page index items we're about to delete/overwrite
1782  * contain what we expect. This can fail if the index has become corrupt
1783  * for some reason. We want to throw any error before entering the
1784  * critical section --- otherwise it'd be a PANIC.
1785  *
1786  * The test on the target item is just an Assert because
1787  * _bt_lock_branch_parent should have guaranteed it has the expected
1788  * contents. The test on the next-child downlink is known to sometimes
1789  * fail in the field, though.
1790  */
1791  page = BufferGetPage(topparent);
1792  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1793 
1794 #ifdef USE_ASSERT_CHECKING
1795  itemid = PageGetItemId(page, topoff);
1796  itup = (IndexTuple) PageGetItem(page, itemid);
1797  Assert(BTreeTupleGetDownLink(itup) == target);
1798 #endif
1799 
1800  nextoffset = OffsetNumberNext(topoff);
1801  itemid = PageGetItemId(page, nextoffset);
1802  itup = (IndexTuple) PageGetItem(page, itemid);
1803  if (BTreeTupleGetDownLink(itup) != rightsib)
1804  ereport(ERROR,
1805  (errcode(ERRCODE_INDEX_CORRUPTED),
1806  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1807  rightsib, target, BTreeTupleGetDownLink(itup),
1808  BufferGetBlockNumber(topparent), RelationGetRelationName(rel))));
1809 
1810  /*
1811  * Any insert which would have gone on the leaf block will now go to its
1812  * right sibling.
1813  */
1814  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
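 /*
  * (PredicateLockPageCombine() merges any SIREAD predicate locks held on
  * the leaf into locks on leafrightsib, so serializable-conflict
  * detection keeps covering the key range that migrates rightward.)
  */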
1815 
1816  /* No ereport(ERROR) until changes are logged */
1817  START_CRIT_SECTION();
1818
1819  /*
1820  * Update parent. The normal case is a tad tricky because we want to
1821  * delete the target's downlink and the *following* key. Easiest way is
1822  * to copy the right sibling's downlink over the target downlink, and then
1823  * delete the following item.
1824  */
1825  page = BufferGetPage(topparent);
1826  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1827 
1828  itemid = PageGetItemId(page, topoff);
1829  itup = (IndexTuple) PageGetItem(page, itemid);
1830  BTreeTupleSetDownLink(itup, rightsib);
1831 
1832  nextoffset = OffsetNumberNext(topoff);
1833  PageIndexTupleDelete(page, nextoffset);
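 /*
  * (Sketch of the parent update just performed, writing D(x) for a
  * downlink to block x:
  *
  *    before:  ... [key1 | D(target)] [key2 | D(rightsib)] ...
  *    after:   ... [key1 | D(rightsib)] ...
  *
  * The target's old downlink item now routes to the right sibling, and
  * the following item, whose downlink it duplicated, is gone.)
  */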
1834 
1835  /*
1836  * Mark the leaf page as half-dead, and stamp it with a pointer to the
1837  * highest internal page in the branch we're deleting. We use the tid of
1838  * the high key to store it.
1839  */
1840  page = BufferGetPage(leafbuf);
1841  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1842  opaque->btpo_flags |= BTP_HALF_DEAD;
1843
1844  Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
1845  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
1846  trunctuple.t_info = sizeof(IndexTupleData);
1847  if (target != leafblkno)
1848  BTreeTupleSetTopParent(&trunctuple, target);
1849  else
1850  BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
1851
1852  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
1853  IndexTupleSize(&trunctuple)))
1854  elog(ERROR, "could not overwrite high key in half-dead page");
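 /*
  * (The replacement high key is a minimal pivot tuple with zero key
  * columns; per the comment above, the top parent's block number travels
  * in the tuple's t_tid.  An InvalidBlockNumber there means the branch
  * being deleted is just this leaf.)
  */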
1855 
1856  /* Must mark buffers dirty before XLogInsert */
1857  MarkBufferDirty(topparent);
1858  MarkBufferDirty(leafbuf);
1859 
1860  /* XLOG stuff */
1861  if (RelationNeedsWAL(rel))
1862  {
1863  xl_btree_mark_page_halfdead xlrec;
1864  XLogRecPtr recptr;
1865 
1866  xlrec.poffset = topoff;
1867  xlrec.leafblk = leafblkno;
1868  if (target != leafblkno)
1869  xlrec.topparent = target;
1870  else
1871  xlrec.topparent = InvalidBlockNumber;
1872 
1873  XLogBeginInsert();
1874  XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
1875  XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);
1876 
1877  page = BufferGetPage(leafbuf);
1878  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1879  xlrec.leftblk = opaque->btpo_prev;
1880  xlrec.rightblk = opaque->btpo_next;
1881 
1882  XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
1883
1884  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
1885 
1886  page = BufferGetPage(topparent);
1887  PageSetLSN(page, recptr);
1888  page = BufferGetPage(leafbuf);
1889  PageSetLSN(page, recptr);
1890  }
1891 
1892  END_CRIT_SECTION();
1893 
1894  _bt_relbuf(rel, topparent);
1895  return true;
1896 }
1897 
1898 /*
1899  * Unlink a page in a branch of half-dead pages from its siblings.
1900  *
1901  * If the leaf page still has a downlink pointing to it, unlinks the highest
1902  * parent in the to-be-deleted branch instead of the leaf page. To get rid
1903  * of the whole branch, including the leaf page itself, iterate until the
1904  * leaf page is deleted.
1905  *
1906  * Returns 'false' if the page could not be unlinked (shouldn't happen).
1907  * If the (new) right sibling of the page is empty, *rightsib_empty is set
1908  * to true.
1909  *
1910  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
1911  * On success exit, we'll be holding pin and write lock. On failure exit,
1912  * we'll release both pin and lock before returning (we define it that way
1913  * to avoid having to reacquire a lock we already released).
1914  */
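 /*
  * Shape of a half-dead branch, roughly, as this function sees it:
  *
  *     topparent   <- internal page unlinked by this call
  *         |
  *     nextchild   <- becomes the leaf high key's new top parent
  *         |
  *        ...
  *         |
  *     leafbuf     <- half-dead leaf, unlinked by the final call
  *
  * Each internal page in the chain retains exactly one downlink, so
  * repeated calls peel pages off the top until only the leaf remains.
  */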
1915 static bool
1916 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
1917 {
1918  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
1919  BlockNumber leafleftsib;
1920  BlockNumber leafrightsib;
1921  BlockNumber target;
1922  BlockNumber leftsib;
1923  BlockNumber rightsib;
1924  Buffer lbuf = InvalidBuffer;
1925  Buffer buf;
1926  Buffer rbuf;
1927  Buffer metabuf = InvalidBuffer;
1928  Page metapg = NULL;
1929  BTMetaPageData *metad = NULL;
1930  ItemId itemid;
1931  Page page;
1932  BTPageOpaque opaque;
1933  bool rightsib_is_rightmost;
1934  int targetlevel;
1935  IndexTuple leafhikey;
1936  BlockNumber nextchild;
1937 
1938  page = BufferGetPage(leafbuf);
1939  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1940 
1941  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
1942 
1943  /*
1944  * Remember some information about the leaf page.
1945  */
1946  itemid = PageGetItemId(page, P_HIKEY);
1947  leafhikey = (IndexTuple) PageGetItem(page, itemid);
1948  leafleftsib = opaque->btpo_prev;
1949  leafrightsib = opaque->btpo_next;
1950 
1951  LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
1952 
1953  /*
1954  * Check here, as calling loops will have locks held, preventing
1955  * interrupts from being processed.
1956  */
1957  CHECK_FOR_INTERRUPTS();
1958
1959  /*
1960  * If the leaf page still has a parent pointing to it (or a chain of
1961  * parents), we don't unlink the leaf page yet, but the topmost remaining
1962  * parent in the branch. Set 'target' and 'buf' to reference the page
1963  * actually being unlinked.
1964  */
1965  target = BTreeTupleGetTopParent(leafhikey);
1966 
1967  if (target != InvalidBlockNumber)
1968  {
1969  Assert(target != leafblkno);
1970 
1971  /* fetch the block number of the topmost parent's left sibling */
1972  buf = _bt_getbuf(rel, target, BT_READ);
1973  page = BufferGetPage(buf);
1974  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1975  leftsib = opaque->btpo_prev;
1976  targetlevel = opaque->btpo.level;
1977 
1978  /*
1979  * To avoid deadlocks, we'd better drop the target page lock before
1980  * going further.
1981  */
1982  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1983  }
1984  else
1985  {
1986  target = leafblkno;
1987 
1988  buf = leafbuf;
1989  leftsib = leafleftsib;
1990  targetlevel = 0;
1991  }
1992 
1993  /*
1994  * We have to lock the pages we need to modify in the standard order:
1995  * moving right, then up. Else we will deadlock against other writers.
1996  *
1997  * So, first lock the leaf page, if it's not the target. Then find and
1998  * write-lock the current left sibling of the target page. The sibling
1999  * that was current a moment ago could have split, so we may have to move
2000  * right. This search could fail if either the sibling or the target page
2001  * was deleted by someone else meanwhile; if so, give up. (Right now,
2002  * that should never happen, since page deletion is only done in VACUUM
2003  * and there shouldn't be multiple VACUUMs concurrently on the same
2004  * table.)
2005  */
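 /*
  * (Concretely, the write-lock order from here on is, roughly: leafbuf
  * first, when the target is a parent rather than the leaf itself; then
  * the target's left sibling, moving right as needed; then the target;
  * then the target's right sibling; and finally the metapage, only when
  * the fast root may need updating.)
  */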
2006  if (target != leafblkno)
2007  LockBuffer(leafbuf, BT_WRITE);
2008  if (leftsib != P_NONE)
2009  {
2010  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2011  page = BufferGetPage(lbuf);
2012  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2013  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2014  {
2015  /* step right one page */
2016  leftsib = opaque->btpo_next;
2017  _bt_relbuf(rel, lbuf);
2018 
2019  /*
2020  * It'd be good to check for interrupts here, but it's not easy to
2021  * do so because a lock is always held. This block isn't
2022  * frequently reached, so hopefully the consequences of not
2023  * checking interrupts aren't too bad.
2024  */
2025 
2026  if (leftsib == P_NONE)
2027  {
2028  elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
2029  target,
2030  RelationGetRelationName(rel));
2031  if (target != leafblkno)
2032  {
2033  /* we have only a pin on target, but pin+lock on leafbuf */
2034  ReleaseBuffer(buf);
2035  _bt_relbuf(rel, leafbuf);
2036  }
2037  else
2038  {
2039  /* we have only a pin on leafbuf */
2040  ReleaseBuffer(leafbuf);
2041  }
2042  return false;
2043  }
2044  lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2045  page = BufferGetPage(lbuf);
2046  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2047  }
2048  }
2049  else
2050  lbuf = InvalidBuffer;
2051 
2052  /*
2053  * Next write-lock the target page itself. It should be okay to take just
2054  * a write lock not a superexclusive lock, since no scans would stop on an
2055  * empty page.
2056  */
2057  LockBuffer(buf, BT_WRITE);
2058  page = BufferGetPage(buf);
2059  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2060 
2061  /*
2062  * Check page is still empty etc, else abandon deletion. This is just for
2063  * paranoia's sake; a half-dead page cannot resurrect because there can be
2064  * only one vacuum process running at a time.
2065  */
2066  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2067  {
2068  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2069  target, RelationGetRelationName(rel));
2070  }
2071  if (opaque->btpo_prev != leftsib)
2072  ereport(ERROR,
2073  (errcode(ERRCODE_INDEX_CORRUPTED),
2074  errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
2075  target, RelationGetRelationName(rel))));
2076 
2077  if (target == leafblkno)
2078  {
2079  if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2080  !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2081  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2082  target, RelationGetRelationName(rel));
2083  nextchild = InvalidBlockNumber;
2084  }
2085  else
2086  {
2087  if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2088  P_ISLEAF(opaque))
2089  elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
2090  target, RelationGetRelationName(rel));
2091 
2092  /* remember the next non-leaf child down in the branch. */
2093  itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2094  nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
2095  if (nextchild == leafblkno)
2096  nextchild = InvalidBlockNumber;
2097  }
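 /*
  * (An internal branch page reaching this point holds exactly one data
  * item -- the downlink just read -- which is why P_FIRSTDATAKEY had to
  * equal the page's max offset in the check above.)
  */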
2098 
2099  /*
2100  * And next write-lock the (current) right sibling.
2101  */
2102  rightsib = opaque->btpo_next;
2103  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2104  page = BufferGetPage(rbuf);
2105  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2106  if (opaque->btpo_prev != target)
2107  ereport(ERROR,
2108  (errcode(ERRCODE_INDEX_CORRUPTED),
2109  errmsg_internal("right sibling's left-link doesn't match: "
2110  "block %u links to %u instead of expected %u in index \"%s\"",
2111  rightsib, opaque->btpo_prev, target,
2112  RelationGetRelationName(rel))));
2113  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2114  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2115 
2116  /*
2117  * If we are deleting the next-to-last page on the target's level, then
2118  * the rightsib is a candidate to become the new fast root. (In theory, it
2119  * might be possible to push the fast root even further down, but the odds
2120  * of doing so are slim, and the locking considerations daunting.)
2121  *
2122  * We can safely acquire a lock on the metapage here --- see comments for
2123  * _bt_newroot().
2124  */
2125  if (leftsib == P_NONE && rightsib_is_rightmost)
2126  {
2127  page = BufferGetPage(rbuf);
2128  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2129  if (P_RIGHTMOST(opaque))
2130  {
2131  /* rightsib will be the only one left on the level */
2132  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2133  metapg = BufferGetPage(metabuf);
2134  metad = BTPageGetMeta(metapg);
2135 
2136  /*
2137  * The expected case here is btm_fastlevel == targetlevel+1; if
2138  * the fastlevel is <= targetlevel, something is wrong, and we
2139  * choose to overwrite it to fix it.
2140  */
2141  if (metad->btm_fastlevel > targetlevel + 1)
2142  {
2143  /* no update wanted */
2144  _bt_relbuf(rel, metabuf);
2145  metabuf = InvalidBuffer;
2146  }
2147  }
2148  }
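 /*
  * (If metabuf is still valid here, rightsib is about to become the only
  * page on targetlevel, so it can serve as the new fast root; the actual
  * btm_fastroot/btm_fastlevel update is deferred to the critical section
  * below.)
  */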
2149 
2150  /*
2151  * Here we begin doing the deletion.
2152  */
2153 
2154  /* No ereport(ERROR) until changes are logged */
2155  START_CRIT_SECTION();
2156
2157  /*
2158  * Update siblings' side-links. Note the target page's side-links will
2159  * continue to point to the siblings. Asserts here are just rechecking
2160  * things we already verified above.
2161  */
2162  if (BufferIsValid(lbuf))
2163  {
2164  page = BufferGetPage(lbuf);
2165  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2166  Assert(opaque->btpo_next == target);
2167  opaque->btpo_next = rightsib;
2168  }
2169  page = BufferGetPage(rbuf);
2170  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2171  Assert(opaque->btpo_prev == target);
2172  opaque->btpo_prev = leftsib;
2173 
2174  /*
2175  * If we deleted a parent of the targeted leaf page, instead of the leaf
2176  * itself, update the leaf to point to the next remaining child in the
2177  * branch.
2178  */
2179  if (target != leafblkno)
2180  BTreeTupleSetTopParent(leafhikey, nextchild);
2181 
2182  /*
2183  * Mark the page itself deleted. It can be recycled when all current
2184  * transactions are gone. Storing GetTopTransactionId() would work, but
2185  * we're in VACUUM and would not otherwise have an XID. Having already
2186  * updated links to the target, ReadNewTransactionId() suffices as an
2187  * upper bound. Any scan having retained a now-stale link is advertising
2188  * in its PGXACT an xmin less than or equal to the value we read here. It
2189  * will continue to do so, holding back RecentGlobalXmin, for the duration
2190  * of that scan.
2191  */
2192  page = BufferGetPage(buf);
2193  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2194  opaque->btpo_flags &= ~BTP_HALF_DEAD;
2195  opaque->btpo_flags |= BTP_DELETED;
2196  opaque->btpo.xact = ReadNewTransactionId();
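 /*
  * (A later VACUUM may actually recycle the page once this XID is older
  * than every running snapshot; compare _bt_page_recyclable(), which
  * treats a page as reusable roughly when
  *     P_ISDELETED(opaque) &&
  *     TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)
  * holds.)
  */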
2197 
2198  /* And update the metapage, if needed */
2199  if (BufferIsValid(metabuf))
2200  {
2201  /* upgrade metapage if needed */
2202  if (metad->btm_version < BTREE_NOVAC_VERSION)
2203  _bt_upgrademetapage(metapg);
2204  metad->btm_fastroot = rightsib;
2205  metad->btm_fastlevel = targetlevel;
2206  MarkBufferDirty(metabuf);
2207  }
2208 
2209  /* Must mark buffers dirty before XLogInsert */
2210  MarkBufferDirty(rbuf);
2211  MarkBufferDirty(buf);
2212  if (BufferIsValid(lbuf))
2213  MarkBufferDirty(lbuf);
2214  if (target != leafblkno)
2215  MarkBufferDirty(leafbuf);
2216 
2217  /* XLOG stuff */
2218  if (RelationNeedsWAL(rel))
2219  {
2220  xl_btree_unlink_page xlrec;
2221  xl_btree_metadata xlmeta;
2222  uint8 xlinfo;
2223  XLogRecPtr recptr;
2224 
2225  XLogBeginInsert();
2226 
2227  XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2228  if (BufferIsValid(lbuf))
2229  XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2230  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2231  if (target != leafblkno)
2232  XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2233 
2234  /* information on the unlinked block */
2235  xlrec.leftsib = leftsib;
2236  xlrec.rightsib = rightsib;
2237  xlrec.btpo_xact = opaque->btpo.xact;
2238 
2239  /* information needed to recreate the leaf block (if not the target) */
2240  xlrec.leafleftsib = leafleftsib;
2241  xlrec.leafrightsib = leafrightsib;
2242  xlrec.topparent = nextchild;
2243 
2244  XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2245 
2246  if (BufferIsValid(metabuf))
2247  {
2248  XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2249
2250  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2251  xlmeta.version = metad->btm_version;
2252  xlmeta.root = metad->btm_root;
2253  xlmeta.level = metad->btm_level;
2254  xlmeta.fastroot = metad->btm_fastroot;
2255  xlmeta.fastlevel = metad->btm_fastlevel;
2256  xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
2257  xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2258  xlmeta.allequalimage = metad->btm_allequalimage;
2259 
2260  XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2261  xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2262  }
2263  else
2264  xlinfo = XLOG_BTREE_UNLINK_PAGE;
2265 
2266  recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2267 
2268  if (BufferIsValid(metabuf))
2269  {
2270  PageSetLSN(metapg, recptr);
2271  }
2272  page = BufferGetPage(rbuf);
2273  PageSetLSN(page, recptr);
2274  page = BufferGetPage(buf);
2275  PageSetLSN(page, recptr);
2276  if (BufferIsValid(lbuf))
2277  {
2278  page = BufferGetPage(lbuf);
2279  PageSetLSN(page, recptr);
2280  }
2281  if (target != leafblkno)
2282  {
2283  page = BufferGetPage(leafbuf);
2284  PageSetLSN(page, recptr);
2285  }
2286  }
2287 
2288  END_CRIT_SECTION();
2289 
2290  /* release metapage */
2291  if (BufferIsValid(metabuf))
2292  _bt_relbuf(rel, metabuf);
2293 
2294  /* release siblings */
2295  if (BufferIsValid(lbuf))
2296  _bt_relbuf(rel, lbuf);
2297  _bt_relbuf(rel, rbuf);
2298 
2299  /*
2300  * Release the target, if it was not the leaf block. The leaf is always
2301  * kept locked.
2302  */
2303  if (target != leafblkno)
2304  _bt_relbuf(rel, buf);
2305 
2306  return true;
2307 }