1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/progress.h"
26 #include "commands/vacuum.h"
27 #include "miscadmin.h"
28 #include "nodes/execnodes.h"
29 #include "pgstat.h"
30 #include "postmaster/autovacuum.h"
31 #include "storage/condition_variable.h"
32 #include "storage/indexfsm.h"
33 #include "storage/ipc.h"
34 #include "storage/lmgr.h"
35 #include "storage/smgr.h"
36 #include "utils/builtins.h"
37 #include "utils/index_selfuncs.h"
38 #include "utils/memutils.h"
39 
40 
41 /* Working state needed by btvacuumpage */
42 typedef struct
43 {
44  IndexVacuumInfo *info;
45  IndexBulkDeleteResult *stats;
46  IndexBulkDeleteCallback callback;
47  void *callback_state;
48  BTCycleId cycleid;
49  BlockNumber totFreePages; /* true total # of free pages */
50  TransactionId oldestBtpoXact;
51  MemoryContext pagedelcontext;
52 } BTVacState;
53 
54 /*
55  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
56  *
57  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
58  * a new page; others must wait.
59  *
60  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
61  * to a new page; some process can start doing that.
62  *
63  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
64  * We reach this state once for every distinct combination of array keys.
65  */
66 typedef enum
67 {
68  BTPARALLEL_NOT_INITIALIZED,
69  BTPARALLEL_ADVANCING,
70  BTPARALLEL_IDLE,
71  BTPARALLEL_DONE
72 } BTPS_State;
73 
74 /*
75  * BTParallelScanDescData contains btree specific shared information required
76  * for parallel scan.
77  */
78 typedef struct BTParallelScanDescData
79 {
80  BlockNumber btps_scanPage; /* latest or next page to be scanned */
81  BTPS_State btps_pageStatus; /* indicates whether next page is
82  * available for scan. see above for
83  * possible states of parallel scan. */
84  int btps_arrayKeyCount; /* count indicating number of array scan
85  * keys processed by parallel scan */
86  slock_t btps_mutex; /* protects above variables */
87  ConditionVariable btps_cv; /* used to synchronize parallel scan */
88 } BTParallelScanDescData;
89 
90 typedef struct BTParallelScanDescData *BTParallelScanDesc;
91 
92 
93 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
94  IndexBulkDeleteCallback callback, void *callback_state,
95  BTCycleId cycleid);
96 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
97 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
98  IndexTuple posting,
99  OffsetNumber updatedoffset,
100  int *nremaining);
101 
102 
103 /*
104  * Btree handler function: return IndexAmRoutine with access method parameters
105  * and callbacks.
106  */
107 Datum
108 bthandler(PG_FUNCTION_ARGS)
109 {
110  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
111 
112  amroutine->amstrategies = BTMaxStrategyNumber;
113  amroutine->amsupport = BTNProcs;
114  amroutine->amoptsprocnum = BTOPTIONS_PROC;
115  amroutine->amcanorder = true;
116  amroutine->amcanorderbyop = false;
117  amroutine->amcanbackward = true;
118  amroutine->amcanunique = true;
119  amroutine->amcanmulticol = true;
120  amroutine->amoptionalkey = true;
121  amroutine->amsearcharray = true;
122  amroutine->amsearchnulls = true;
123  amroutine->amstorage = false;
124  amroutine->amclusterable = true;
125  amroutine->ampredlocks = true;
126  amroutine->amcanparallel = true;
127  amroutine->amcaninclude = true;
128  amroutine->amusemaintenanceworkmem = false;
129  amroutine->amparallelvacuumoptions =
130  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
131  amroutine->amkeytype = InvalidOid;
132 
133  amroutine->ambuild = btbuild;
134  amroutine->ambuildempty = btbuildempty;
135  amroutine->aminsert = btinsert;
136  amroutine->ambulkdelete = btbulkdelete;
137  amroutine->amvacuumcleanup = btvacuumcleanup;
138  amroutine->amcanreturn = btcanreturn;
139  amroutine->amcostestimate = btcostestimate;
140  amroutine->amoptions = btoptions;
141  amroutine->amproperty = btproperty;
142  amroutine->ambuildphasename = btbuildphasename;
143  amroutine->amvalidate = btvalidate;
144  amroutine->amadjustmembers = btadjustmembers;
145  amroutine->ambeginscan = btbeginscan;
146  amroutine->amrescan = btrescan;
147  amroutine->amgettuple = btgettuple;
148  amroutine->amgetbitmap = btgetbitmap;
149  amroutine->amendscan = btendscan;
150  amroutine->ammarkpos = btmarkpos;
151  amroutine->amrestrpos = btrestrpos;
152  amroutine->amestimateparallelscan = btestimateparallelscan;
153  amroutine->aminitparallelscan = btinitparallelscan;
154  amroutine->amparallelrescan = btparallelrescan;
155 
156  PG_RETURN_POINTER(amroutine);
157 }
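The IndexAmRoutine built here is what core code obtains whenever it needs to call into the btree AM. Roughly what the core's GetIndexAmRoutine() does is sketched below; this is a simplified illustration of the handler calling convention, not nbtree code, and the function name is hypothetical.

#include "postgres.h"

#include "access/amapi.h"
#include "fmgr.h"
#include "nodes/nodes.h"

/*
 * Sketch: invoke an index AM handler (such as bthandler) through the fmgr
 * and recover the IndexAmRoutine it returns.
 */
static IndexAmRoutine *
get_am_routine(Oid amhandler)
{
	Datum		datum;
	IndexAmRoutine *routine;

	/* call the handler; it returns a palloc'd IndexAmRoutine node */
	datum = OidFunctionCall0(amhandler);
	routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "handler %u did not return an IndexAmRoutine", amhandler);

	return routine;
}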
158 
159 /*
160  * btbuildempty() -- build an empty btree index in the initialization fork
161  */
162 void
163 btbuildempty(Relation index)
164 {
165  Page metapage;
166 
167  /* Construct metapage. */
168  metapage = (Page) palloc(BLCKSZ);
169  _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
170 
171  /*
172  * Write the page and log it. It might seem that an immediate sync would
173  * be sufficient to guarantee that the file exists on disk, but recovery
174  * itself might remove it while replaying, for example, an
175  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
176  * this even when wal_level=minimal.
177  */
178  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
179  smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
180  (char *) metapage, true);
181  log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
182  BTREE_METAPAGE, metapage, true);
183 
184  /*
185  * An immediate sync is required even if we xlog'd the page, because the
186  * write did not go through shared_buffers and therefore a concurrent
187  * checkpoint may have moved the redo pointer past our xlog record.
188  */
189  smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
190 }
191 
192 /*
193  * btinsert() -- insert an index tuple into a btree.
194  *
195  * Descend the tree recursively, find the appropriate location for our
196  * new tuple, and put it there.
197  */
198 bool
199 btinsert(Relation rel, Datum *values, bool *isnull,
200  ItemPointer ht_ctid, Relation heapRel,
201  IndexUniqueCheck checkUnique,
202  bool indexUnchanged,
203  IndexInfo *indexInfo)
204 {
205  bool result;
206  IndexTuple itup;
207 
208  /* generate an index tuple */
209  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
210  itup->t_tid = *ht_ctid;
211 
212  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
213 
214  pfree(itup);
215 
216  return result;
217 }
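btinsert() is not normally called directly; the executor reaches it through the generic index_insert() entry point, which dispatches via amroutine->aminsert. A minimal sketch of such a call follows, assuming the caller has already opened both relations, built an IndexInfo, and computed the column values (the wrapper function and variable names are hypothetical).

#include "postgres.h"

#include "access/genam.h"
#include "nodes/execnodes.h"
#include "storage/itemptr.h"

/*
 * Sketch only: insert one already-formed index entry pointing at heap TID
 * 'htid'.  'values'/'isnull' must hold one datum per index column.
 */
static void
insert_one_entry(Relation indexRel, Relation heapRel, IndexInfo *indexInfo,
				 Datum *values, bool *isnull, ItemPointer htid)
{
	/* For a unique index, UNIQUE_CHECK_YES makes btinsert() enforce it */
	(void) index_insert(indexRel, values, isnull, htid, heapRel,
						UNIQUE_CHECK_YES,
						false,	/* indexUnchanged hint */
						indexInfo);
}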
218 
219 /*
220  * btgettuple() -- Get the next tuple in the scan.
221  */
222 bool
223 btgettuple(IndexScanDesc scan, ScanDirection dir)
224 {
225  BTScanOpaque so = (BTScanOpaque) scan->opaque;
226  bool res;
227 
228  /* btree indexes are never lossy */
229  scan->xs_recheck = false;
230 
231  /*
232  * If we have any array keys, initialize them during first call for a
233  * scan. We can't do this in btrescan because we don't know the scan
234  * direction at that time.
235  */
236  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
237  {
238  /* punt if we have any unsatisfiable array keys */
239  if (so->numArrayKeys < 0)
240  return false;
241 
242  _bt_start_array_keys(scan, dir);
243  }
244 
245  /* This loop handles advancing to the next array elements, if any */
246  do
247  {
248  /*
249  * If we've already initialized this scan, we can just advance it in
250  * the appropriate direction. If we haven't done so yet, we call
251  * _bt_first() to get the first item in the scan.
252  */
253  if (!BTScanPosIsValid(so->currPos))
254  res = _bt_first(scan, dir);
255  else
256  {
257  /*
258  * Check to see if we should kill the previously-fetched tuple.
259  */
260  if (scan->kill_prior_tuple)
261  {
262  /*
263  * Yes, remember it for later. (We'll deal with all such
264  * tuples at once right before leaving the index page.) The
265  * test for numKilled overrun is not just paranoia: if the
266  * caller reverses direction in the indexscan then the same
267  * item might get entered multiple times. It's not worth
268  * trying to optimize that, so we don't detect it, but instead
269  * just forget any excess entries.
270  */
271  if (so->killedItems == NULL)
272  so->killedItems = (int *)
273  palloc(MaxTIDsPerBTreePage * sizeof(int));
274  if (so->numKilled < MaxTIDsPerBTreePage)
275  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
276  }
277 
278  /*
279  * Now continue the scan.
280  */
281  res = _bt_next(scan, dir);
282  }
283 
284  /* If we have a tuple, return it ... */
285  if (res)
286  break;
287  /* ... otherwise see if we have more array keys to deal with */
288  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
289 
290  return res;
291 }
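In ordinary use btgettuple() is driven through the generic scan API in indexam.c, and it is the heap-fetch layer of that API that later sets scan->kill_prior_tuple when a returned tuple turns out to be dead. A hedged sketch of such a caller loop, with snapshot setup and error handling omitted and the helper name invented for illustration:

#include "postgres.h"

#include "access/genam.h"
#include "access/skey.h"
#include "access/stratnum.h"
#include "utils/fmgroids.h"

/*
 * Sketch: count index entries whose first key column equals 42, driving the
 * AM's amgettuple (btgettuple for a btree) through index_getnext_tid().
 */
static int64
count_matches(Relation heapRel, Relation indexRel, Snapshot snapshot)
{
	IndexScanDesc scan;
	ScanKeyData skey;
	int64		n = 0;

	ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ,
				Int32GetDatum(42));

	scan = index_beginscan(heapRel, indexRel, snapshot, 1, 0);
	index_rescan(scan, &skey, 1, NULL, 0);

	while (index_getnext_tid(scan, ForwardScanDirection) != NULL)
		n++;

	index_endscan(scan);
	return n;
}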
292 
293 /*
294  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
295  */
296 int64
297 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
298 {
299  BTScanOpaque so = (BTScanOpaque) scan->opaque;
300  int64 ntids = 0;
301  ItemPointer heapTid;
302 
303  /*
304  * If we have any array keys, initialize them.
305  */
306  if (so->numArrayKeys)
307  {
308  /* punt if we have any unsatisfiable array keys */
309  if (so->numArrayKeys < 0)
310  return ntids;
311 
312  _bt_start_array_keys(scan, ForwardScanDirection);
313  }
314 
315  /* This loop handles advancing to the next array elements, if any */
316  do
317  {
318  /* Fetch the first page & tuple */
319  if (_bt_first(scan, ForwardScanDirection))
320  {
321  /* Save tuple ID, and continue scanning */
322  heapTid = &scan->xs_heaptid;
323  tbm_add_tuples(tbm, heapTid, 1, false);
324  ntids++;
325 
326  for (;;)
327  {
328  /*
329  * Advance to next tuple within page. This is the same as the
330  * easy case in _bt_next().
331  */
332  if (++so->currPos.itemIndex > so->currPos.lastItem)
333  {
334  /* let _bt_next do the heavy lifting */
335  if (!_bt_next(scan, ForwardScanDirection))
336  break;
337  }
338 
339  /* Save tuple ID, and continue scanning */
340  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
341  tbm_add_tuples(tbm, heapTid, 1, false);
342  ntids++;
343  }
344  }
345  /* Now see if we have more array keys to deal with */
346  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
347 
348  return ntids;
349 }
350 
351 /*
352  * btbeginscan() -- start a scan on a btree index
353  */
354 IndexScanDesc
355 btbeginscan(Relation rel, int nkeys, int norderbys)
356 {
357  IndexScanDesc scan;
358  BTScanOpaque so;
359 
360  /* no order by operators allowed */
361  Assert(norderbys == 0);
362 
363  /* get the scan */
364  scan = RelationGetIndexScan(rel, nkeys, norderbys);
365 
366  /* allocate private workspace */
367  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
368  BTScanPosInvalidate(so->currPos);
369  BTScanPosInvalidate(so->markPos);
370  if (scan->numberOfKeys > 0)
371  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
372  else
373  so->keyData = NULL;
374 
375  so->arrayKeyData = NULL; /* assume no array keys for now */
376  so->numArrayKeys = 0;
377  so->arrayKeys = NULL;
378  so->arrayContext = NULL;
379 
380  so->killedItems = NULL; /* until needed */
381  so->numKilled = 0;
382 
383  /*
384  * We don't know yet whether the scan will be index-only, so we do not
385  * allocate the tuple workspace arrays until btrescan. However, we set up
386  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
387  */
388  so->currTuples = so->markTuples = NULL;
389 
390  scan->xs_itupdesc = RelationGetDescr(rel);
391 
392  scan->opaque = so;
393 
394  return scan;
395 }
396 
397 /*
398  * btrescan() -- rescan an index relation
399  */
400 void
401 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
402  ScanKey orderbys, int norderbys)
403 {
404  BTScanOpaque so = (BTScanOpaque) scan->opaque;
405 
406  /* we aren't holding any read locks, but gotta drop the pins */
407  if (BTScanPosIsValid(so->currPos))
408  {
409  /* Before leaving current page, deal with any killed items */
410  if (so->numKilled > 0)
411  _bt_killitems(scan);
412  BTScanPosUnpinIfPinned(so->currPos);
413  BTScanPosInvalidate(so->currPos);
414  }
415 
416  so->markItemIndex = -1;
417  so->arrayKeyCount = 0;
418  BTScanPosUnpinIfPinned(so->markPos);
419  BTScanPosInvalidate(so->markPos);
420 
421  /*
422  * Allocate tuple workspace arrays, if needed for an index-only scan and
423  * not already done in a previous rescan call. To save on palloc
424  * overhead, both workspaces are allocated as one palloc block; only this
425  * function and btendscan know that.
426  *
427  * NOTE: this data structure also makes it safe to return data from a
428  * "name" column, even though btree name_ops uses an underlying storage
429  * datatype of cstring. The risk there is that "name" is supposed to be
430  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
431  * However, since we only return data out of tuples sitting in the
432  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
433  * data out of the markTuples array --- running off the end of memory for
434  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
435  * adding special-case treatment for name_ops elsewhere.
436  */
437  if (scan->xs_want_itup && so->currTuples == NULL)
438  {
439  so->currTuples = (char *) palloc(BLCKSZ * 2);
440  so->markTuples = so->currTuples + BLCKSZ;
441  }
442 
443  /*
444  * Reset the scan keys
445  */
446  if (scankey && scan->numberOfKeys > 0)
447  memmove(scan->keyData,
448  scankey,
449  scan->numberOfKeys * sizeof(ScanKeyData));
450  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
451 
452  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
453  _bt_preprocess_array_keys(scan);
454 }
455 
456 /*
457  * btendscan() -- close down a scan
458  */
459 void
460 btendscan(IndexScanDesc scan)
461 {
462  BTScanOpaque so = (BTScanOpaque) scan->opaque;
463 
464  /* we aren't holding any read locks, but gotta drop the pins */
465  if (BTScanPosIsValid(so->currPos))
466  {
467  /* Before leaving current page, deal with any killed items */
468  if (so->numKilled > 0)
469  _bt_killitems(scan);
470  BTScanPosUnpinIfPinned(so->currPos);
471  }
472 
473  so->markItemIndex = -1;
474  BTScanPosUnpinIfPinned(so->markPos);
475 
476  /* No need to invalidate positions, the RAM is about to be freed. */
477 
478  /* Release storage */
479  if (so->keyData != NULL)
480  pfree(so->keyData);
481  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
482  if (so->arrayContext != NULL)
483  MemoryContextDelete(so->arrayContext);
484  if (so->killedItems != NULL)
485  pfree(so->killedItems);
486  if (so->currTuples != NULL)
487  pfree(so->currTuples);
488  /* so->markTuples should not be pfree'd, see btrescan */
489  pfree(so);
490 }
491 
492 /*
493  * btmarkpos() -- save current scan position
494  */
495 void
496 btmarkpos(IndexScanDesc scan)
497 {
498  BTScanOpaque so = (BTScanOpaque) scan->opaque;
499 
500  /* There may be an old mark with a pin (but no lock). */
501  BTScanPosUnpinIfPinned(so->markPos);
502 
503  /*
504  * Just record the current itemIndex. If we later step to next page
505  * before releasing the marked position, _bt_steppage makes a full copy of
506  * the currPos struct in markPos. If (as often happens) the mark is moved
507  * before we leave the page, we don't have to do that work.
508  */
509  if (BTScanPosIsValid(so->currPos))
510  so->markItemIndex = so->currPos.itemIndex;
511  else
512  {
513  BTScanPosInvalidate(so->markPos);
514  so->markItemIndex = -1;
515  }
516 
517  /* Also record the current positions of any array keys */
518  if (so->numArrayKeys)
519  _bt_mark_array_keys(scan);
520 }
521 
522 /*
523  * btrestrpos() -- restore scan to last saved position
524  */
525 void
526 btrestrpos(IndexScanDesc scan)
527 {
528  BTScanOpaque so = (BTScanOpaque) scan->opaque;
529 
530  /* Restore the marked positions of any array keys */
531  if (so->numArrayKeys)
532  _bt_restore_array_keys(scan);
533 
534  if (so->markItemIndex >= 0)
535  {
536  /*
537  * The scan has never moved to a new page since the last mark. Just
538  * restore the itemIndex.
539  *
540  * NB: In this case we can't count on anything in so->markPos to be
541  * accurate.
542  */
543  so->currPos.itemIndex = so->markItemIndex;
544  }
545  else
546  {
547  /*
548  * The scan moved to a new page after last mark or restore, and we are
549  * now restoring to the marked page. We aren't holding any read
550  * locks, but if we're still holding the pin for the current position,
551  * we must drop it.
552  */
553  if (BTScanPosIsValid(so->currPos))
554  {
555  /* Before leaving current page, deal with any killed items */
556  if (so->numKilled > 0)
557  _bt_killitems(scan);
558  BTScanPosUnpinIfPinned(so->currPos);
559  }
560 
561  if (BTScanPosIsValid(so->markPos))
562  {
563  /* bump pin on mark buffer for assignment to current buffer */
564  if (BTScanPosIsPinned(so->markPos))
565  IncrBufferRefCount(so->markPos.buf);
566  memcpy(&so->currPos, &so->markPos,
567  offsetof(BTScanPosData, items[1]) +
568  so->markPos.lastItem * sizeof(BTScanPosItem));
569  if (so->currTuples)
570  memcpy(so->currTuples, so->markTuples,
571  so->markPos.nextTupleOffset);
572  }
573  else
574  BTScanPosInvalidate(so->currPos);
575  }
576 }
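btmarkpos() and btrestrpos() are reached through the generic index_markpos()/index_restrpos() calls, which merge join uses to rewind the inner index scan. A minimal, purely illustrative sketch follows; 'scan' is an IndexScanDesc set up as in the earlier fetch-loop sketch and the function name is hypothetical.

#include "postgres.h"

#include "access/genam.h"

/*
 * Sketch: peek at the next two matches, then rewind so the caller sees them
 * again on the following fetches.
 */
static void
peek_two_then_rewind(IndexScanDesc scan)
{
	index_markpos(scan);		/* dispatches to btmarkpos() */

	(void) index_getnext_tid(scan, ForwardScanDirection);
	(void) index_getnext_tid(scan, ForwardScanDirection);

	index_restrpos(scan);		/* dispatches to btrestrpos() */
}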
577 
578 /*
579  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
580  */
581 Size
582 btestimateparallelscan(void)
583 {
584  return sizeof(BTParallelScanDescData);
585 }
586 
587 /*
588  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
589  */
590 void
591 btinitparallelscan(void *target)
592 {
593  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
594 
595  SpinLockInit(&bt_target->btps_mutex);
596  bt_target->btps_scanPage = InvalidBlockNumber;
597  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
598  bt_target->btps_arrayKeyCount = 0;
599  ConditionVariableInit(&bt_target->btps_cv);
600 }
601 
602 /*
603  * btparallelrescan() -- reset parallel scan
604  */
605 void
606 btparallelrescan(IndexScanDesc scan)
607 {
608  BTParallelScanDesc btscan;
609  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
610 
611  Assert(parallel_scan);
612 
613  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
614  parallel_scan->ps_offset);
615 
616  /*
617  * In theory, we don't need to acquire the spinlock here, because there
618  * shouldn't be any other workers running at this point, but we do so for
619  * consistency.
620  */
621  SpinLockAcquire(&btscan->btps_mutex);
622  btscan->btps_scanPage = InvalidBlockNumber;
623  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
624  btscan->btps_arrayKeyCount = 0;
625  SpinLockRelease(&btscan->btps_mutex);
626 }
627 
628 /*
629  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
630  * page. Other scans must wait until we call _bt_parallel_release()
631  * or _bt_parallel_done().
632  *
633  * The return value is true if we successfully seized the scan and false
634  * if we did not. The latter case occurs if no pages remain for the current
635  * set of scankeys.
636  *
637  * If the return value is true, *pageno returns the next or current page
638  * of the scan (depending on the scan direction). An invalid block number
639  * means the scan hasn't yet started, and P_NONE means we've reached the end.
640  * The first time a participating process reaches the last page, it will return
641  * true and set *pageno to P_NONE; after that, further attempts to seize the
642  * scan will return false.
643  *
644  * Callers should ignore the value of pageno if the return value is false.
645  */
646 bool
647 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
648 {
649  BTScanOpaque so = (BTScanOpaque) scan->opaque;
650  BTPS_State pageStatus;
651  bool exit_loop = false;
652  bool status = true;
653  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
654  BTParallelScanDesc btscan;
655 
656  *pageno = P_NONE;
657 
658  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
659  parallel_scan->ps_offset);
660 
661  while (1)
662  {
663  SpinLockAcquire(&btscan->btps_mutex);
664  pageStatus = btscan->btps_pageStatus;
665 
666  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
667  {
668  /* Parallel scan has already advanced to a new set of scankeys. */
669  status = false;
670  }
671  else if (pageStatus == BTPARALLEL_DONE)
672  {
673  /*
674  * We're done with this set of scankeys. This may be the end, or
675  * there could be more sets to try.
676  */
677  status = false;
678  }
679  else if (pageStatus != BTPARALLEL_ADVANCING)
680  {
681  /*
682  * We have successfully seized control of the scan for the purpose
683  * of advancing it to a new page!
684  */
685  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
686  *pageno = btscan->btps_scanPage;
687  exit_loop = true;
688  }
689  SpinLockRelease(&btscan->btps_mutex);
690  if (exit_loop || !status)
691  break;
692  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
693  }
694  ConditionVariableCancelSleep();
695 
696  return status;
697 }
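The seize/release/done functions form a small per-page protocol that the scan code in nbtsearch.c follows during a parallel scan. The sketch below shows the calling pattern under the assumption that the caller reads the page and knows its right sibling (btpo_next); it is a simplified illustration, not the actual nbtsearch.c code.

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"

/*
 * Sketch of the parallel-scan page protocol.  Returns false when the scan
 * is finished for the current set of array keys.
 */
static bool
scan_one_parallel_page(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next_blkno;

	/* Try to become the backend that advances the shared scan position */
	if (!_bt_parallel_seize(scan, &blkno))
		return false;			/* scan already finished for these keys */

	if (blkno == P_NONE)
	{
		/* shared position says there are no pages left */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * blkno may also be InvalidBlockNumber on the very first seize, in
	 * which case the real code performs the initial descent in _bt_first().
	 *
	 * ... read and process page 'blkno' here, remembering its right
	 * sibling in next_blkno (P_NONE at the rightmost page) ...
	 */
	next_blkno = P_NONE;		/* placeholder for the real right link */

	/* Publish the next page so some other backend can advance the scan */
	_bt_parallel_release(scan, next_blkno);
	return true;
}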
698 
699 /*
700  * _bt_parallel_release() -- Complete the process of advancing the scan to a
701  * new page. We now have the new value btps_scanPage; some other backend
702  * can now begin advancing the scan.
703  */
704 void
705 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
706 {
707  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
708  BTParallelScanDesc btscan;
709 
710  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
711  parallel_scan->ps_offset);
712 
713  SpinLockAcquire(&btscan->btps_mutex);
714  btscan->btps_scanPage = scan_page;
715  btscan->btps_pageStatus = BTPARALLEL_IDLE;
716  SpinLockRelease(&btscan->btps_mutex);
717  ConditionVariableSignal(&btscan->btps_cv);
718 }
719 
720 /*
721  * _bt_parallel_done() -- Mark the parallel scan as complete.
722  *
723  * When there are no pages left to scan, this function should be called to
724  * notify other workers. Otherwise, they might wait forever for the scan to
725  * advance to the next page.
726  */
727 void
728 _bt_parallel_done(IndexScanDesc scan)
729 {
730  BTScanOpaque so = (BTScanOpaque) scan->opaque;
731  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
732  BTParallelScanDesc btscan;
733  bool status_changed = false;
734 
735  /* Do nothing, for non-parallel scans */
736  if (parallel_scan == NULL)
737  return;
738 
739  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
740  parallel_scan->ps_offset);
741 
742  /*
743  * Mark the parallel scan as done for this combination of scan keys,
744  * unless some other process already did so. See also
745  * _bt_advance_array_keys.
746  */
747  SpinLockAcquire(&btscan->btps_mutex);
748  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
749  btscan->btps_pageStatus != BTPARALLEL_DONE)
750  {
751  btscan->btps_pageStatus = BTPARALLEL_DONE;
752  status_changed = true;
753  }
754  SpinLockRelease(&btscan->btps_mutex);
755 
756  /* wake up all the workers associated with this parallel scan */
757  if (status_changed)
758  ConditionVariableBroadcast(&btscan->btps_cv);
759 }
760 
761 /*
762  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
763  * keys.
764  *
765  * Updates the count of array keys processed for both local and parallel
766  * scans.
767  */
768 void
769 _bt_parallel_advance_array_keys(IndexScanDesc scan)
770 {
771  BTScanOpaque so = (BTScanOpaque) scan->opaque;
772  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
773  BTParallelScanDesc btscan;
774 
775  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
776  parallel_scan->ps_offset);
777 
778  so->arrayKeyCount++;
779  SpinLockAcquire(&btscan->btps_mutex);
780  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
781  {
782  btscan->btps_scanPage = InvalidBlockNumber;
783  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
784  btscan->btps_arrayKeyCount++;
785  }
786  SpinLockRelease(&btscan->btps_mutex);
787 }
788 
789 /*
790  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
791  *
792  * Called by btvacuumcleanup when btbulkdelete was never called because no
793  * tuples need to be deleted.
794  *
795  * When we return false, VACUUM can even skip the cleanup-only call to
796  * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
797  * all). Otherwise, a cleanup-only btvacuumscan call is required.
798  */
799 static bool
800 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
801 {
802  Buffer metabuf;
803  Page metapg;
804  BTMetaPageData *metad;
805  bool result = false;
806 
807  metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
808  metapg = BufferGetPage(metabuf);
809  metad = BTPageGetMeta(metapg);
810 
811  /*
812  * XXX: If IndexVacuumInfo contained the heap relation, we could be more
813  * aggressive about vacuuming non catalog relations by passing the table
814  * to GlobalVisCheckRemovableXid().
815  */
816 
817  if (metad->btm_version < BTREE_NOVAC_VERSION)
818  {
819  /*
820  * Do cleanup if metapage needs upgrade, because we don't have
821  * cleanup-related meta-information yet.
822  */
823  result = true;
824  }
825  else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
826  GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact))
827  {
828  /*
829  * If any oldest btpo.xact from a previously deleted page in the index
830  * is visible to everyone, then at least one deleted page can be
831  * recycled -- don't skip cleanup.
832  */
833  result = true;
834  }
835  else
836  {
837  BTOptions *relopts;
838  float8 cleanup_scale_factor;
839  float8 prev_num_heap_tuples;
840 
841  /*
842  * If table receives enough insertions and no cleanup was performed,
843  * then the index would appear to have stale statistics. If scale factor is
844  * set, we avoid that by performing cleanup if the number of inserted
845  * tuples exceeds vacuum_cleanup_index_scale_factor fraction of
846  * original tuples count.
847  */
848  relopts = (BTOptions *) info->index->rd_options;
849  cleanup_scale_factor = (relopts &&
850  relopts->vacuum_cleanup_index_scale_factor >= 0)
851  ? relopts->vacuum_cleanup_index_scale_factor
852  : vacuum_cleanup_index_scale_factor;
853  prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
854 
855  if (cleanup_scale_factor <= 0 ||
856  info->num_heap_tuples < 0 ||
857  prev_num_heap_tuples <= 0 ||
858  (info->num_heap_tuples - prev_num_heap_tuples) /
859  prev_num_heap_tuples >= cleanup_scale_factor)
860  result = true;
861  }
862 
863  _bt_relbuf(info->index, metabuf);
864  return result;
865 }
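The scale-factor branch above boils down to a single growth test. As a worked example with hypothetical numbers: with vacuum_cleanup_index_scale_factor = 0.1 and 1,000,000 heap tuples recorded at the last cleanup, a cleanup-only scan is triggered once the table grows past 1,100,000 tuples. The snippet below just restates the condition with those numbers plugged in.

#include "postgres.h"

/*
 * Worked example of the growth test used above (hypothetical numbers):
 * (1150000 - 1000000) / 1000000 = 0.15 >= 0.1, so cleanup is triggered.
 */
static bool
needs_cleanup_example(void)
{
	float8		cleanup_scale_factor = 0.1;
	float8		prev_num_heap_tuples = 1000000.0;
	double		num_heap_tuples = 1150000.0;

	return (cleanup_scale_factor <= 0 ||
			num_heap_tuples < 0 ||
			prev_num_heap_tuples <= 0 ||
			(num_heap_tuples - prev_num_heap_tuples) /
			prev_num_heap_tuples >= cleanup_scale_factor);
}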
866 
867 /*
868  * Bulk deletion of all index entries pointing to a set of heap tuples.
869  * The set of target tuples is specified via a callback routine that tells
870  * whether any given heap tuple (identified by ItemPointer) is being deleted.
871  *
872  * Result: a palloc'd struct containing statistical info for VACUUM displays.
873  */
874 IndexBulkDeleteResult *
875 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
876  IndexBulkDeleteCallback callback, void *callback_state)
877 {
878  Relation rel = info->index;
879  BTCycleId cycleid;
880 
881  /* allocate stats if first time through, else re-use existing struct */
882  if (stats == NULL)
883  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
884 
885  /* Establish the vacuum cycle ID to use for this scan */
886  /* The ENSURE stuff ensures we clean up shared memory on failure */
887  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
888  {
889  cycleid = _bt_start_vacuum(rel);
890 
891  btvacuumscan(info, stats, callback, callback_state, cycleid);
892  }
893  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
894  _bt_end_vacuum(rel);
895 
896  return stats;
897 }
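VACUUM reaches btbulkdelete() through the generic index_bulk_delete() wrapper, passing a callback that reports which heap TIDs are being removed; returning true from the callback asks the AM to drop the corresponding index entries. A hedged sketch of that call shape, with a placeholder callback that keeps everything (both function names here are hypothetical):

#include "postgres.h"

#include "access/genam.h"
#include "storage/itemptr.h"

/* Hypothetical callback: real VACUUM consults its dead-TID array here */
static bool
dead_tid_callback(ItemPointer itemptr, void *state)
{
	(void) itemptr;
	(void) state;
	return false;				/* false keeps the index entry */
}

/* Sketch: how VACUUM-style code drives ambulkdelete (btbulkdelete here) */
static IndexBulkDeleteResult *
bulk_delete_example(IndexVacuumInfo *ivinfo)
{
	IndexBulkDeleteResult *stats = NULL;

	/* dispatches to btbulkdelete() for a btree index */
	stats = index_bulk_delete(ivinfo, stats, dead_tid_callback, NULL);
	return stats;
}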
898 
899 /*
900  * Post-VACUUM cleanup.
901  *
902  * Result: a palloc'd struct containing statistical info for VACUUM displays.
903  */
904 IndexBulkDeleteResult *
905 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
906 {
907  /* No-op in ANALYZE ONLY mode */
908  if (info->analyze_only)
909  return stats;
910 
911  /*
912  * If btbulkdelete was called, we need not do anything, just return the
913  * stats from the latest btbulkdelete call. If it wasn't called, we might
914  * still need to do a pass over the index, to recycle any newly-recyclable
915  * pages or to obtain index statistics. _bt_vacuum_needs_cleanup
916  * determines if either are needed.
917  *
918  * Since we aren't going to actually delete any leaf items, there's no
919  * need to go through all the vacuum-cycle-ID pushups.
920  */
921  if (stats == NULL)
922  {
923  /* Check if we need a cleanup */
924  if (!_bt_vacuum_needs_cleanup(info))
925  return NULL;
926 
927  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
928  btvacuumscan(info, stats, NULL, NULL, 0);
929  }
930 
931  /*
932  * It's quite possible for us to be fooled by concurrent page splits into
933  * double-counting some index tuples, so disbelieve any total that exceeds
934  * the underlying heap's count ... if we know that accurately. Otherwise
935  * this might just make matters worse.
936  *
937  * Posting list tuples are another source of inaccuracy. Cleanup-only
938  * btvacuumscan calls assume that the number of index tuples can be used
939  * as num_index_tuples, even though num_index_tuples is supposed to
940  * represent the number of TIDs in the index. This naive approach can
941  * underestimate the number of tuples in the index.
942  */
943  if (!info->estimated_count)
944  {
945  if (stats->num_index_tuples > info->num_heap_tuples)
946  stats->num_index_tuples = info->num_heap_tuples;
947  }
948 
949  return stats;
950 }
951 
952 /*
953  * btvacuumscan --- scan the index for VACUUMing purposes
954  *
955  * This combines the functions of looking for leaf tuples that are deletable
956  * according to the vacuum callback, looking for empty pages that can be
957  * deleted, and looking for old deleted pages that can be recycled. Both
958  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
959  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
960  * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
961  * is maintained.
962  *
963  * The caller is responsible for initially allocating/zeroing a stats struct
964  * and for obtaining a vacuum cycle ID if necessary.
965  */
966 static void
967 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
968  IndexBulkDeleteCallback callback, void *callback_state,
969  BTCycleId cycleid)
970 {
971  Relation rel = info->index;
972  BTVacState vstate;
973  BlockNumber num_pages;
974  BlockNumber scanblkno;
975  bool needLock;
976 
977  /*
978  * Reset counts that will be incremented during the scan; needed in case
979  * of multiple scans during a single VACUUM command
980  */
981  stats->estimated_count = false;
982  stats->num_index_tuples = 0;
983  stats->pages_deleted = 0;
984 
985  /* Set up info to pass down to btvacuumpage */
986  vstate.info = info;
987  vstate.stats = stats;
988  vstate.callback = callback;
989  vstate.callback_state = callback_state;
990  vstate.cycleid = cycleid;
991  vstate.totFreePages = 0;
992  vstate.oldestBtpoXact = InvalidTransactionId;
993 
994  /* Create a temporary memory context to run _bt_pagedel in */
995  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
996  "_bt_pagedel",
997  ALLOCSET_DEFAULT_SIZES);
998 
999  /*
1000  * The outer loop iterates over all index pages except the metapage, in
1001  * physical order (we hope the kernel will cooperate in providing
1002  * read-ahead for speed). It is critical that we visit all leaf pages,
1003  * including ones added after we start the scan, else we might fail to
1004  * delete some deletable tuples. Hence, we must repeatedly check the
1005  * relation length. We must acquire the relation-extension lock while
1006  * doing so to avoid a race condition: if someone else is extending the
1007  * relation, there is a window where bufmgr/smgr have created a new
1008  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1009  * we manage to scan such a page here, we'll improperly assume it can be
1010  * recycled. Taking the lock synchronizes things enough to prevent a
1011  * problem: either num_pages won't include the new page, or _bt_getbuf
1012  * already has write lock on the buffer and it will be fully initialized
1013  * before we can examine it. (See also vacuumlazy.c, which has the same
1014  * issue.) Also, we need not worry if a page is added immediately after
1015  * we look; the page splitting code already has write-lock on the left
1016  * page before it adds a right page, so we must already have processed any
1017  * tuples due to be moved into such a page.
1018  *
1019  * We can skip locking for new or temp relations, however, since no one
1020  * else could be accessing them.
1021  */
1022  needLock = !RELATION_IS_LOCAL(rel);
1023 
1024  scanblkno = BTREE_METAPAGE + 1;
1025  for (;;)
1026  {
1027  /* Get the current relation length */
1028  if (needLock)
1029  LockRelationForExtension(rel, ExclusiveLock);
1030  num_pages = RelationGetNumberOfBlocks(rel);
1031  if (needLock)
1032  UnlockRelationForExtension(rel, ExclusiveLock);
1033 
1034  if (info->report_progress)
1035  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1036  num_pages);
1037 
1038  /* Quit if we've scanned the whole relation */
1039  if (scanblkno >= num_pages)
1040  break;
1041  /* Iterate over pages, then loop back to recheck length */
1042  for (; scanblkno < num_pages; scanblkno++)
1043  {
1044  btvacuumpage(&vstate, scanblkno);
1045  if (info->report_progress)
1046  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1047  scanblkno);
1048  }
1049  }
1050 
1051  MemoryContextDelete(vstate.pagedelcontext);
1052 
1053  /*
1054  * If we found any recyclable pages (and recorded them in the FSM), then
1055  * forcibly update the upper-level FSM pages to ensure that searchers can
1056  * find them. It's possible that the pages were also found during
1057  * previous scans and so this is a waste of time, but it's cheap enough
1058  * relative to scanning the index that it shouldn't matter much, and
1059  * making sure that free pages are available sooner not later seems
1060  * worthwhile.
1061  *
1062  * Note that if no recyclable pages exist, we don't bother vacuuming the
1063  * FSM at all.
1064  */
1065  if (vstate.totFreePages > 0)
1066  IndexFreeSpaceMapVacuum(rel);
1067 
1068  /*
1069  * Maintain the oldest btpo.xact and a count of the current number of heap
1070  * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
1071  *
1072  * The page with the oldest btpo.xact is typically a page deleted by this
1073  * VACUUM operation, since pages deleted by a previous VACUUM operation
1074  * tend to be placed in the FSM (by the current VACUUM operation) -- such
1075  * pages are not candidates to be the oldest btpo.xact. (Note that pages
1076  * placed in the FSM are reported as deleted pages in the bulk delete
1077  * statistics, despite not counting as deleted pages for the purposes of
1078  * determining the oldest btpo.xact.)
1079  */
1080  _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
1081  info->num_heap_tuples);
1082 
1083  /* update statistics */
1084  stats->num_pages = num_pages;
1085  stats->pages_free = vstate.totFreePages;
1086 }
1087 
1088 /*
1089  * btvacuumpage --- VACUUM one page
1090  *
1091  * This processes a single page for btvacuumscan(). In some cases we must
1092  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1093  * a previous call here. This is how we handle page splits (that happened
1094  * after our cycleid was acquired) whose right half page happened to reuse
1095  * a block that we might have processed at some point before it was
1096  * recycled (i.e. before the page split).
1097  */
1098 static void
1099 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1100 {
1101  IndexVacuumInfo *info = vstate->info;
1102  IndexBulkDeleteResult *stats = vstate->stats;
1103  IndexBulkDeleteCallback callback = vstate->callback;
1104  void *callback_state = vstate->callback_state;
1105  Relation rel = info->index;
1106  bool attempt_pagedel;
1107  BlockNumber blkno,
1108  backtrack_to;
1109  Buffer buf;
1110  Page page;
1111  BTPageOpaque opaque;
1112 
1113  blkno = scanblkno;
1114 
1115 backtrack:
1116 
1117  attempt_pagedel = false;
1118  backtrack_to = P_NONE;
1119 
1120  /* call vacuum_delay_point while not holding any buffer lock */
1121  vacuum_delay_point();
1122 
1123  /*
1124  * We can't use _bt_getbuf() here because it always applies
1125  * _bt_checkpage(), which will barf on an all-zero page. We want to
1126  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1127  * buffer access strategy.
1128  */
1129  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1130  info->strategy);
1131  _bt_lockbuf(rel, buf, BT_READ);
1132  page = BufferGetPage(buf);
1133  opaque = NULL;
1134  if (!PageIsNew(page))
1135  {
1136  _bt_checkpage(rel, buf);
1137  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1138  }
1139 
1140  Assert(blkno <= scanblkno);
1141  if (blkno != scanblkno)
1142  {
1143  /*
1144  * We're backtracking.
1145  *
1146  * We followed a right link to a sibling leaf page (a page that
1147  * happens to be from a block located before scanblkno). The only
1148  * case we want to do anything with is a live leaf page having the
1149  * current vacuum cycle ID.
1150  *
1151  * The page had better be in a state that's consistent with what we
1152  * expect. Check for conditions that imply corruption in passing. It
1153  * can't be half-dead because only an interrupted VACUUM process can
1154  * leave pages in that state, so we'd definitely have dealt with it
1155  * back when the page was the scanblkno page (half-dead pages are
1156  * always marked fully deleted by _bt_pagedel()). This assumes that
1157  * there can be only one vacuum process running at a time.
1158  */
1159  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1160  {
1161  Assert(false);
1162  ereport(LOG,
1163  (errcode(ERRCODE_INDEX_CORRUPTED),
1164  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1165  blkno, scanblkno, RelationGetRelationName(rel))));
1166  _bt_relbuf(rel, buf);
1167  return;
1168  }
1169 
1170  /*
1171  * We may have already processed the page in an earlier call, when the
1172  * page was scanblkno. This happens when the leaf page split occurred
1173  * after the scan began, but before the right sibling page became the
1174  * scanblkno.
1175  *
1176  * Page may also have been deleted by current btvacuumpage() call,
1177  * since _bt_pagedel() sometimes deletes the right sibling page of
1178  * scanblkno in passing (it does so after we decided where to
1179  * backtrack to). We don't need to process this page as a deleted
1180  * page a second time now (in fact, it would be wrong to count it as a
1181  * deleted page in the bulk delete statistics a second time).
1182  */
1183  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1184  {
1185  /* Done with current scanblkno (and all lower split pages) */
1186  _bt_relbuf(rel, buf);
1187  return;
1188  }
1189  }
1190 
1191  /* Page is valid, see what to do with it */
1192  if (_bt_page_recyclable(page))
1193  {
1194  /* Okay to recycle this page (which could be leaf or internal) */
1195  RecordFreeIndexPage(rel, blkno);
1196  vstate->totFreePages++;
1197  stats->pages_deleted++;
1198  }
1199  else if (P_ISDELETED(opaque))
1200  {
1201  /*
1202  * Already deleted page (which could be leaf or internal). Can't
1203  * recycle yet.
1204  */
1205  stats->pages_deleted++;
1206 
1207  /* Maintain the oldest btpo.xact */
1208  if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1209  TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1210  vstate->oldestBtpoXact = opaque->btpo.xact;
1211  }
1212  else if (P_ISHALFDEAD(opaque))
1213  {
1214  /*
1215  * Half-dead leaf page. Try to delete now. Might update
1216  * oldestBtpoXact and pages_deleted below.
1217  */
1218  attempt_pagedel = true;
1219  }
1220  else if (P_ISLEAF(opaque))
1221  {
1222  OffsetNumber deletable[MaxIndexTuplesPerPage];
1223  int ndeletable;
1224  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1225  int nupdatable;
1226  OffsetNumber offnum,
1227  minoff,
1228  maxoff;
1229  int nhtidsdead,
1230  nhtidslive;
1231 
1232  /*
1233  * Trade in the initial read lock for a super-exclusive write lock on
1234  * this page. We must get such a lock on every leaf page over the
1235  * course of the vacuum scan, whether or not it actually contains any
1236  * deletable tuples --- see nbtree/README.
1237  */
1238  _bt_upgradelockbufcleanup(rel, buf);
1239 
1240  /*
1241  * Check whether we need to backtrack to earlier pages. What we are
1242  * concerned about is a page split that happened since we started the
1243  * vacuum scan. If the split moved tuples on the right half of the
1244  * split (i.e. the tuples that sort high) to a block that we already
1245  * passed over, then we might have missed the tuples. We need to
1246  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1247  * or deleting scanblkno page below!)
1248  */
1249  if (vstate->cycleid != 0 &&
1250  opaque->btpo_cycleid == vstate->cycleid &&
1251  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1252  !P_RIGHTMOST(opaque) &&
1253  opaque->btpo_next < scanblkno)
1254  backtrack_to = opaque->btpo_next;
1255 
1256  /*
1257  * When each VACUUM begins, it determines an OldestXmin cutoff value.
1258  * Tuples before the cutoff are removed by VACUUM. Scan over all
1259  * items to see which ones need to be deleted according to cutoff
1260  * point using callback.
1261  */
1262  ndeletable = 0;
1263  nupdatable = 0;
1264  minoff = P_FIRSTDATAKEY(opaque);
1265  maxoff = PageGetMaxOffsetNumber(page);
1266  nhtidsdead = 0;
1267  nhtidslive = 0;
1268  if (callback)
1269  {
1270  for (offnum = minoff;
1271  offnum <= maxoff;
1272  offnum = OffsetNumberNext(offnum))
1273  {
1274  IndexTuple itup;
1275 
1276  itup = (IndexTuple) PageGetItem(page,
1277  PageGetItemId(page, offnum));
1278 
1279  /*
1280  * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
1281  * records do not produce their own conflicts. This is safe
1282  * as long as the callback function only considers whether the
1283  * index tuple refers to pre-cutoff heap tuples that were
1284  * certainly already pruned away during VACUUM's initial heap
1285  * scan by the time we get here. (heapam's XLOG_HEAP2_CLEAN
1286  * and XLOG_HEAP2_CLEANUP_INFO records produce conflicts using
1287  * a latestRemovedXid value for the pointed-to heap tuples, so
1288  * there is no need to produce our own conflict now.)
1289  *
1290  * Backends with snapshots acquired after a VACUUM starts but
1291  * before it finishes could have visibility cutoff with a
1292  * later xid than VACUUM's OldestXmin cutoff. These backends
1293  * might happen to opportunistically mark some index tuples
1294  * LP_DEAD before we reach them, even though they may be after
1295  * our cutoff. We don't try to kill these "extra" index
1296  * tuples in _bt_delitems_vacuum(). This keeps things simple,
1297  * and allows us to always avoid generating our own conflicts.
1298  */
1299  Assert(!BTreeTupleIsPivot(itup));
1300  if (!BTreeTupleIsPosting(itup))
1301  {
1302  /* Regular tuple, standard table TID representation */
1303  if (callback(&itup->t_tid, callback_state))
1304  {
1305  deletable[ndeletable++] = offnum;
1306  nhtidsdead++;
1307  }
1308  else
1309  nhtidslive++;
1310  }
1311  else
1312  {
1313  BTVacuumPosting vacposting;
1314  int nremaining;
1315 
1316  /* Posting list tuple */
1317  vacposting = btreevacuumposting(vstate, itup, offnum,
1318  &nremaining);
1319  if (vacposting == NULL)
1320  {
1321  /*
1322  * All table TIDs from the posting tuple remain, so no
1323  * delete or update required
1324  */
1325  Assert(nremaining == BTreeTupleGetNPosting(itup));
1326  }
1327  else if (nremaining > 0)
1328  {
1329 
1330  /*
1331  * Store metadata about posting list tuple in
1332  * updatable array for entire page. Existing tuple
1333  * will be updated during the later call to
1334  * _bt_delitems_vacuum().
1335  */
1336  Assert(nremaining < BTreeTupleGetNPosting(itup));
1337  updatable[nupdatable++] = vacposting;
1338  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1339  }
1340  else
1341  {
1342  /*
1343  * All table TIDs from the posting list must be
1344  * deleted. We'll delete the index tuple completely
1345  * (no update required).
1346  */
1347  Assert(nremaining == 0);
1348  deletable[ndeletable++] = offnum;
1349  nhtidsdead += BTreeTupleGetNPosting(itup);
1350  pfree(vacposting);
1351  }
1352 
1353  nhtidslive += nremaining;
1354  }
1355  }
1356  }
1357 
1358  /*
1359  * Apply any needed deletes or updates. We issue just one
1360  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1361  */
1362  if (ndeletable > 0 || nupdatable > 0)
1363  {
1364  Assert(nhtidsdead >= ndeletable + nupdatable);
1365  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1366  nupdatable);
1367 
1368  stats->tuples_removed += nhtidsdead;
1369  /* must recompute maxoff */
1370  maxoff = PageGetMaxOffsetNumber(page);
1371 
1372  /* can't leak memory here */
1373  for (int i = 0; i < nupdatable; i++)
1374  pfree(updatable[i]);
1375  }
1376  else
1377  {
1378  /*
1379  * If the leaf page has been split during this vacuum cycle, it
1380  * seems worth expending a write to clear btpo_cycleid even if we
1381  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1382  * takes care of this.) This ensures we won't process the page
1383  * again.
1384  *
1385  * We treat this like a hint-bit update because there's no need to
1386  * WAL-log it.
1387  */
1388  Assert(nhtidsdead == 0);
1389  if (vstate->cycleid != 0 &&
1390  opaque->btpo_cycleid == vstate->cycleid)
1391  {
1392  opaque->btpo_cycleid = 0;
1393  MarkBufferDirtyHint(buf, true);
1394  }
1395  }
1396 
1397  /*
1398  * If the leaf page is now empty, try to delete it; else count the
1399  * live tuples (live table TIDs in posting lists are counted as
1400  * separate live tuples). We don't delete when backtracking, though,
1401  * since that would require teaching _bt_pagedel() about backtracking
1402  * (doesn't seem worth adding more complexity to deal with that).
1403  *
1404  * We don't count the number of live TIDs during cleanup-only calls to
1405  * btvacuumscan (i.e. when callback is not set). We count the number
1406  * of index tuples directly instead. This avoids the expense of
1407  * directly examining all of the tuples on each page.
1408  */
1409  if (minoff > maxoff)
1410  attempt_pagedel = (blkno == scanblkno);
1411  else if (callback)
1412  stats->num_index_tuples += nhtidslive;
1413  else
1414  stats->num_index_tuples += maxoff - minoff + 1;
1415 
1416  Assert(!attempt_pagedel || nhtidslive == 0);
1417  }
1418 
1419  if (attempt_pagedel)
1420  {
1421  MemoryContext oldcontext;
1422 
1423  /* Run pagedel in a temp context to avoid memory leakage */
1424  MemoryContextReset(vstate->pagedelcontext);
1425  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1426 
1427  /*
1428  * We trust the _bt_pagedel return value because it does not include
1429  * any page that a future call here from btvacuumscan is expected to
1430  * count. There will be no double-counting.
1431  */
1432  Assert(blkno == scanblkno);
1433  stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
1434 
1435  MemoryContextSwitchTo(oldcontext);
1436  /* pagedel released buffer, so we shouldn't */
1437  }
1438  else
1439  _bt_relbuf(rel, buf);
1440 
1441  if (backtrack_to != P_NONE)
1442  {
1443  blkno = backtrack_to;
1444  goto backtrack;
1445  }
1446 }
1447 
1448 /*
1449  * btreevacuumposting --- determine TIDs still needed in posting list
1450  *
1451  * Returns metadata describing how to build replacement tuple without the TIDs
1452  * that VACUUM needs to delete. Returned value is NULL in the common case
1453  * where no changes are needed to caller's posting list tuple (we avoid
1454  * allocating memory here as an optimization).
1455  *
1456  * The number of TIDs that should remain in the posting list tuple is set for
1457  * caller in *nremaining.
1458  */
1459 static BTVacuumPosting
1460 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1461  OffsetNumber updatedoffset, int *nremaining)
1462 {
1463  int live = 0;
1464  int nitem = BTreeTupleGetNPosting(posting);
1465  ItemPointer items = BTreeTupleGetPosting(posting);
1466  BTVacuumPosting vacposting = NULL;
1467 
1468  for (int i = 0; i < nitem; i++)
1469  {
1470  if (!vstate->callback(items + i, vstate->callback_state))
1471  {
1472  /* Live table TID */
1473  live++;
1474  }
1475  else if (vacposting == NULL)
1476  {
1477  /*
1478  * First dead table TID encountered.
1479  *
1480  * It's now clear that we need to delete one or more dead table
1481  * TIDs, so start maintaining metadata describing how to update
1482  * existing posting list tuple.
1483  */
1484  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1485  nitem * sizeof(uint16));
1486 
1487  vacposting->itup = posting;
1488  vacposting->updatedoffset = updatedoffset;
1489  vacposting->ndeletedtids = 0;
1490  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1491  }
1492  else
1493  {
1494  /* Second or subsequent dead table TID */
1495  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1496  }
1497  }
1498 
1499  *nremaining = live;
1500  return vacposting;
1501 }
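To make the return contract concrete: suppose a posting list tuple carries four heap TIDs and the callback reports the TIDs at positions 1 and 3 (0-based) as dead. The sketch below hand-builds the BTVacuumPosting the caller would observe in that case; it is an illustration of the output shape only, not a real call (the replacement tuple itself is built later by _bt_delitems_vacuum()).

#include "postgres.h"

#include "access/nbtree.h"

/*
 * Illustration of btreevacuumposting()'s result for a 4-TID posting list
 * tuple whose 2nd and 4th TIDs are dead.  *nremaining would be set to 2.
 */
static void
posting_result_example(IndexTuple posting, OffsetNumber offnum)
{
	BTVacuumPosting vacposting;

	vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
						4 * sizeof(uint16));
	vacposting->itup = posting;			/* original tuple, unmodified */
	vacposting->updatedoffset = offnum;	/* where the replacement goes */
	vacposting->ndeletedtids = 2;
	vacposting->deletetids[0] = 1;		/* positions within the posting list */
	vacposting->deletetids[1] = 3;

	pfree(vacposting);
}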
1502 
1503 /*
1504  * btcanreturn() -- Check whether btree indexes support index-only scans.
1505  *
1506  * btrees always do, so this is trivial.
1507  */
1508 bool
1509 btcanreturn(Relation index, int attno)
1510 {
1511  return true;
1512 }
Definition: nbtree.c:199
ammarkpos_function ammarkpos
Definition: amapi.h:276
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:119
bool amcanorder
Definition: amapi.h:220
IndexBulkDeleteResult * stats
Definition: nbtree.c:45
ambuildphasename_function ambuildphasename
Definition: amapi.h:268
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:45
bool btcanreturn(Relation index, int attno)
Definition: nbtree.c:1509
void btparallelrescan(IndexScanDesc scan)
Definition: nbtree.c:606
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:280
#define BTNProcs
Definition: nbtree.h:581
void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
Definition: nbtree.c:705
ItemPointerData heapTid
Definition: nbtree.h:820
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:859
struct ScanKeyData * keyData
Definition: relscan.h:119
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:769
static Datum values[MAXATTR]
Definition: bootstrap.c:165
uint16 amstrategies
Definition: amapi.h:214
BTScanPosData currPos
Definition: nbtree.h:944
#define PageIsNew(page)
Definition: bufpage.h:229
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:950
void btinitparallelscan(void *target)
Definition: nbtree.c:591
ScanKey keyData
Definition: nbtree.h:911
uint16 amoptsprocnum
Definition: amapi.h:218
XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std)
Definition: xloginsert.c:996
ambuildempty_function ambuildempty
Definition: amapi.h:260
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:919
int i
bool kill_prior_tuple
Definition: relscan.h:125
void _bt_preprocess_array_keys(IndexScanDesc scan)
Definition: nbtutils.c:203
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:387
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:81
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1454
bytea * btoptions(Datum reloptions, bool validate)
Definition: nbtutils.c:2104
#define BTMaxStrategyNumber
Definition: stratnum.h:35
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:108
Buffer buf
Definition: nbtree.h:827
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:1969
#define TransactionIdIsValid(xid)
Definition: transam.h:41
void vacuum_delay_point(void)
Definition: vacuum.c:2047
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1052
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:2692
uint16 btpo_flags
Definition: nbtree.h:66
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
double num_index_tuples
Definition: genam.h:77
int Buffer
Definition: buf.h:23
amcanreturn_function amcanreturn
Definition: amapi.h:264
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:214
void smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:660
bool estimated_count
Definition: genam.h:76
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:3738
bytea * rd_options
Definition: rel.h:158
#define offsetof(type, field)
Definition: c.h:715
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:875
Pointer Page
Definition: bufpage.h:78
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:84
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:281
bool estimated_count
Definition: genam.h:49
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: nbtree.c:875
amrestrpos_function amrestrpos
Definition: amapi.h:277
#define P_ISLEAF(opaque)
Definition: nbtree.h:215