vacuumlazy.c
1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  * Concurrent ("lazy") vacuuming.
5  *
6  * The major space usage for vacuuming is storage for the array of dead TIDs
7  * that are to be removed from indexes. We want to ensure we can vacuum even
8  * the very largest relations with finite memory space usage. To do that, we
9  * set upper bounds on the number of TIDs we can keep track of at once.
10  *
11  * We are willing to use at most maintenance_work_mem (or perhaps
12  * autovacuum_work_mem) memory space to keep track of dead TIDs. We initially
13  * allocate an array of TIDs of that size, with an upper limit that depends on
14  * table size (this limit ensures we don't allocate a huge area uselessly for
15  * vacuuming small tables). If the array threatens to overflow, we must call
16  * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned).
17  * This frees up the memory space dedicated to storing dead TIDs.
18  *
19  * In practice VACUUM will often complete its initial pass over the target
20  * heap relation without ever running out of space to store TIDs. This means
21  * that there only needs to be one call to lazy_vacuum, after the initial pass
22  * completes.
23  *
24  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
25  * Portions Copyright (c) 1994, Regents of the University of California
26  *
27  *
28  * IDENTIFICATION
29  * src/backend/access/heap/vacuumlazy.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include <math.h>
36 
37 #include "access/amapi.h"
38 #include "access/genam.h"
39 #include "access/heapam.h"
40 #include "access/heapam_xlog.h"
41 #include "access/htup_details.h"
42 #include "access/multixact.h"
43 #include "access/parallel.h"
44 #include "access/transam.h"
45 #include "access/visibilitymap.h"
46 #include "access/xact.h"
47 #include "access/xlog.h"
48 #include "catalog/index.h"
49 #include "catalog/storage.h"
50 #include "commands/dbcommands.h"
51 #include "commands/progress.h"
52 #include "commands/vacuum.h"
53 #include "executor/instrument.h"
54 #include "miscadmin.h"
55 #include "optimizer/paths.h"
56 #include "pgstat.h"
57 #include "portability/instr_time.h"
58 #include "postmaster/autovacuum.h"
59 #include "storage/bufmgr.h"
60 #include "storage/freespace.h"
61 #include "storage/lmgr.h"
62 #include "tcop/tcopprot.h"
63 #include "utils/lsyscache.h"
64 #include "utils/memutils.h"
65 #include "utils/pg_rusage.h"
66 #include "utils/timestamp.h"
67 
68 
69 /*
70  * Space/time tradeoff parameters: do these need to be user-tunable?
71  *
72  * To consider truncating the relation, we want there to be at least
73  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
74  * is less) potentially-freeable pages.
75  */
76 #define REL_TRUNCATE_MINIMUM 1000
77 #define REL_TRUNCATE_FRACTION 16
78 
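/*
 * As a rough sketch of how these two thresholds combine (variable names here
 * are illustrative; see should_attempt_truncation() for the actual test):
 *
 *     BlockNumber possibly_freeable = rel_pages - nonempty_pages;
 *
 *     if (possibly_freeable > 0 &&
 *         (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
 *          possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION))
 *         ... consider truncating the relation ...
 */
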
79 /*
80  * Timing parameters for truncate locking heuristics.
81  *
82  * These were not exposed as user tunable GUC values because it didn't seem
83  * that the potential for improvement was great enough to merit the cost of
84  * supporting them.
85  */
86 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
87 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
88 #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
89 
90 /*
91  * Threshold that controls whether we bypass index vacuuming and heap
92  * vacuuming as an optimization
93  */
94 #define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */
95 
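/*
 * For illustration, the bypass decision made in lazy_vacuum() boils down to
 * a comparison of this form (a sketch only; variable names are illustrative
 * and the real test combines additional conditions):
 *
 *     double threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
 *
 *     bypass = (vacrel->lpdead_item_pages < threshold);
 */
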
96 /*
97  * Perform a failsafe check every 4GB during the heap scan, approximately
98  */
99 #define FAILSAFE_EVERY_PAGES \
100  ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
101 
102 /*
103  * When a table has no indexes, vacuum the FSM after every 8GB, approximately
104  * (it won't be exact because we only vacuum FSM after processing a heap page
105  * that has some removable tuples). When there are indexes, this is ignored,
106  * and we vacuum FSM after each index/heap cleaning pass.
107  */
108 #define VACUUM_FSM_EVERY_PAGES \
109  ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
110 
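/*
 * With the default 8 kB BLCKSZ these work out to concrete block counts:
 * FAILSAFE_EVERY_PAGES is 4 GB / 8 kB = 524288 blocks, and
 * VACUUM_FSM_EVERY_PAGES is 8 GB / 8 kB = 1048576 blocks.
 */
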
111 /*
112  * Before we consider skipping a page that's marked as clean in the
113  * visibility map, we must've seen at least this many clean pages.
114  */
115 #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
116 
117 /*
118  * Size of the prefetch window for lazy vacuum backwards truncation scan.
119  * Needs to be a power of 2.
120  */
121 #define PREFETCH_SIZE ((BlockNumber) 32)
122 
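/*
 * Because PREFETCH_SIZE is a power of 2, the truncation scan can round a
 * block number down to the start of its prefetch window with a simple mask.
 * A minimal sketch of that idiom (blkno and rel are illustrative here):
 *
 *     BlockNumber prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
 *
 *     for (BlockNumber pblkno = prefetchStart; pblkno <= blkno; pblkno++)
 *         PrefetchBuffer(rel, MAIN_FORKNUM, pblkno);
 */
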
123 /*
124  * DSM keys for parallel vacuum. Unlike other parallel execution code, we
125  * don't need to worry about DSM keys conflicting with plan_node_id, so we can
126  * use small integers.
127  */
128 #define PARALLEL_VACUUM_KEY_SHARED 1
129 #define PARALLEL_VACUUM_KEY_DEAD_ITEMS 2
130 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 3
131 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4
132 #define PARALLEL_VACUUM_KEY_WAL_USAGE 5
133 
134 /*
135  * Macro to check if we are in a parallel vacuum. If true, we are in the
136  * parallel mode and the DSM segment is initialized.
137  */
138 #define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
139 
140 /* Phases of vacuum during which we report error context. */
141 typedef enum
142 {
150 
151 /*
152  * LVDeadItems stores TIDs whose index tuples are deleted by index vacuuming.
153  * Each TID points to an LP_DEAD line pointer from a heap page that has been
154  * processed by lazy_scan_prune.
155  *
156  * Also needed by lazy_vacuum_heap_rel, which marks the same LP_DEAD line
157  * pointers as LP_UNUSED during second heap pass.
158  */
159 typedef struct LVDeadItems
160 {
161  int max_items; /* # slots allocated in array */
162  int num_items; /* current # of entries */
163 
164  /* Sorted array of TIDs to delete from indexes */
167 
168 #define MAXDEADITEMS(avail_mem) \
169  (((avail_mem) - offsetof(LVDeadItems, items)) / sizeof(ItemPointerData))
170 
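/*
 * For example, with the default maintenance_work_mem of 64 MB and 6-byte
 * ItemPointerData entries, MAXDEADITEMS(64 * 1024 * 1024) comes out to
 * roughly 11 million TIDs:
 *
 *     (64 * 1024 * 1024 - offsetof(LVDeadItems, items)) / 6 = ~11.1 million
 */
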
171 /*
172  * Shared information among parallel workers, so this is allocated in the DSM
173  * segment.
174  */
175 typedef struct LVShared
176 {
177  /*
178  * Target table relid and log level. These fields are not modified during
179  * the lazy vacuum.
180  */
182  int elevel;
183 
184  /*
185  * An indication for vacuum workers to perform either index vacuum or
186  * index cleanup. first_time is true only if for_cleanup is true and
187  * bulk-deletion has not been performed yet.
188  */
191 
192  /*
193  * Fields for both index vacuum and cleanup.
194  *
195  * reltuples is the total number of input heap tuples. We set it to the old
196  * live tuple count in the index vacuum case, or to the new live tuple count
197  * in the index cleanup case.
198  *
199  * estimated_count is true if reltuples is an estimated value. (Note that
200  * reltuples could be -1 in this case, indicating we have no idea.)
201  */
202  double reltuples;
204 
205  /*
206  * In a single-process lazy vacuum we can consume memory during index
207  * vacuuming or cleanup beyond the memory used for heap scanning. In a
208  * parallel vacuum, since individual vacuum workers can each consume memory
209  * equal to maintenance_work_mem, the new maintenance_work_mem for each
210  * worker is set such that the parallel operation doesn't consume more
211  * memory overall than a single-process lazy vacuum.
212  */
214 
215  /*
216  * Shared vacuum cost balance. During parallel vacuum,
217  * VacuumSharedCostBalance points to this value and it accumulates the
218  * balance of each parallel vacuum worker.
219  */
221 
222  /*
223  * Number of active parallel workers. This is used for computing the
224  * minimum threshold of the vacuum cost balance before a worker sleeps for
225  * cost-based delay.
226  */
228 
229  /*
230  * Variables to control parallel vacuum. We have a bitmap to indicate
231  * which indexes have stats in shared memory. A set bit in the map
232  * indicates that the particular index supports a parallel vacuum.
233  */
234  pg_atomic_uint32 idx; /* counter for vacuuming and clean up */
235  uint32 offset; /* sizeof header incl. bitmap */
236  bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]; /* bit map of NULLs */
237 
238  /* Shared index statistics data follows at end of struct */
240 
241 #define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
242 #define GetSharedIndStats(s) \
243  ((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
244 #define IndStatsIsNull(s, i) \
245  (!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
246 
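/*
 * A sketch of how these macros fit together (see parallel_stats_for_idx()
 * for the actual lookup): stats entries are only stored for indexes whose
 * bitmap bit is set, so locating the entry for index getidx means skipping
 * over the preceding non-NULL entries:
 *
 *     if (IndStatsIsNull(lvshared, getidx))
 *         return NULL;
 *     p = (char *) GetSharedIndStats(lvshared);
 *     for (int i = 0; i < getidx; i++)
 *         if (!IndStatsIsNull(lvshared, i))
 *             p += sizeof(LVSharedIndStats);
 *     return (LVSharedIndStats *) p;
 */
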
247 /*
248  * Struct for an index bulk-deletion statistic used for parallel vacuum. This
249  * is allocated in the DSM segment.
250  */
251 typedef struct LVSharedIndStats
252 {
253  bool updated; /* are the stats updated? */
256 
257 /* Struct for maintaining a parallel vacuum state. */
258 typedef struct LVParallelState
259 {
261 
262  /* Shared information among parallel vacuum workers */
264 
265  /* Points to buffer usage area in DSM */
267 
268  /* Points to WAL usage area in DSM */
270 
271  /*
272  * The number of indexes that support parallel index bulk-deletion and
273  * parallel index cleanup respectively.
274  */
279 
280 typedef struct LVRelState
281 {
282  /* Target heap relation and its indexes */
285  int nindexes;
286 
287  /* Wraparound failsafe has been triggered? */
289  /* Consider index vacuuming bypass optimization? */
291 
292  /* Doing index vacuuming, index cleanup, rel truncation? */
296 
297  /* Buffer access strategy and parallel state */
300 
301  /* rel's initial relfrozenxid and relminmxid */
304  double old_live_tuples; /* previous value of pg_class.reltuples */
305 
306  /* VACUUM operation's cutoff for pruning */
308  /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
311 
312  /* Error reporting state */
314  char *relname;
315  char *indname;
316  BlockNumber blkno; /* used only for heap operations */
317  OffsetNumber offnum; /* used only for heap operations */
319 
320  /*
321  * State managed by lazy_scan_heap() follows
322  */
323  LVDeadItems *dead_items; /* TIDs whose index tuples we'll delete */
324  BlockNumber rel_pages; /* total number of pages */
325  BlockNumber scanned_pages; /* number of pages we examined */
326  BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */
327  BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
328  BlockNumber tupcount_pages; /* pages whose tuples we counted */
329  BlockNumber pages_removed; /* pages removed by truncation */
330  BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */
331  BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
332 
333  /* Statistics output by us, for table */
334  double new_rel_tuples; /* new estimated total # of tuples */
335  double new_live_tuples; /* new estimated total # of live tuples */
336  /* Statistics output by index AMs */
338 
339  /* Instrumentation counters */
341  int64 tuples_deleted; /* # deleted from table */
342  int64 lpdead_items; /* # deleted from indexes */
343  int64 new_dead_tuples; /* new estimated total # of dead items in
344  * table */
345  int64 num_tuples; /* total number of nonremovable tuples */
346  int64 live_tuples; /* live tuples (reltuples estimate) */
348 
349 /*
350  * State returned by lazy_scan_prune()
351  */
352 typedef struct LVPagePruneState
353 {
354  bool hastup; /* Page prevents rel truncation? */
355  bool has_lpdead_items; /* includes existing LP_DEAD items */
356 
357  /*
358  * State describes the proper VM bit states to set for the page following
359  * pruning and freezing. all_visible implies !has_lpdead_items, but don't
360  * trust all_frozen result unless all_visible is also set to true.
361  */
362  bool all_visible; /* Every item visible to all? */
363  bool all_frozen; /* provided all_visible is also true */
364  TransactionId visibility_cutoff_xid; /* For recovery conflicts */
366 
367 /* Struct for saving and restoring vacuum error information. */
368 typedef struct LVSavedErrInfo
369 {
374 
375 /* elevel controls whole VACUUM's verbosity */
376 static int elevel = -1;
377 
378 
379 /* non-export function prototypes */
380 static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
381  bool aggressive);
382 static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
383  BlockNumber blkno, Page page,
384  GlobalVisState *vistest,
385  LVPagePruneState *prunestate);
386 static void lazy_vacuum(LVRelState *vacrel);
387 static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
388 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
389 static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
390  Buffer buffer, int index, Buffer *vmbuffer);
391 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
392  LVRelState *vacrel);
393 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
396 static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
397 static void do_parallel_processing(LVRelState *vacrel,
398  LVShared *lvshared);
400  LVShared *lvshared);
402  IndexBulkDeleteResult *istat,
403  LVShared *lvshared,
404  LVSharedIndStats *shared_indstats,
405  LVRelState *vacrel);
406 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
408  IndexBulkDeleteResult *istat,
409  double reltuples,
410  LVRelState *vacrel);
412  IndexBulkDeleteResult *istat,
413  double reltuples,
414  bool estimated_count,
415  LVRelState *vacrel);
416 static bool should_attempt_truncation(LVRelState *vacrel);
417 static void lazy_truncate_heap(LVRelState *vacrel);
419  bool *lock_waiter_detected);
420 static int dead_items_max_items(LVRelState *vacrel);
421 static inline Size max_items_to_alloc_size(int max_items);
422 static void dead_items_alloc(LVRelState *vacrel, int nworkers);
423 static void dead_items_cleanup(LVRelState *vacrel);
424 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
425 static int vac_cmp_itemptr(const void *left, const void *right);
426 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
427  TransactionId *visibility_cutoff_xid, bool *all_frozen);
429  int nrequested,
430  bool *will_parallel_vacuum);
431 static void update_index_statistics(LVRelState *vacrel);
432 static void begin_parallel_vacuum(LVRelState *vacrel, int nrequested);
433 static void end_parallel_vacuum(LVRelState *vacrel);
434 static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
435 static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
436 static void vacuum_error_callback(void *arg);
437 static void update_vacuum_error_info(LVRelState *vacrel,
438  LVSavedErrInfo *saved_vacrel,
439  int phase, BlockNumber blkno,
440  OffsetNumber offnum);
441 static void restore_vacuum_error_info(LVRelState *vacrel,
442  const LVSavedErrInfo *saved_vacrel);
443 
444 
445 /*
446  * heap_vacuum_rel() -- perform VACUUM for one heap relation
447  *
448  * This routine sets things up for and then calls lazy_scan_heap, where
449  * almost all of the work actually takes place. It finalizes everything after
450  * the call returns by managing rel truncation and updating pg_class statistics.
451  *
452  * At entry, we have already established a transaction and opened
453  * and locked the relation.
454  */
455 void
457  BufferAccessStrategy bstrategy)
458 {
459  LVRelState *vacrel;
460  PGRUsage ru0;
461  TimestampTz starttime = 0;
462  WalUsage walusage_start = pgWalUsage;
463  WalUsage walusage = {0, 0, 0};
464  long secs;
465  int usecs;
466  double read_rate,
467  write_rate;
468  bool aggressive; /* should we scan all unfrozen pages? */
469  bool scanned_all_unfrozen; /* actually scanned all such pages? */
470  char **indnames = NULL;
471  TransactionId xidFullScanLimit;
472  MultiXactId mxactFullScanLimit;
473  BlockNumber new_rel_pages;
474  BlockNumber new_rel_allvisible;
475  double new_live_tuples;
476  TransactionId new_frozen_xid;
477  MultiXactId new_min_multi;
478  ErrorContextCallback errcallback;
479  PgStat_Counter startreadtime = 0;
480  PgStat_Counter startwritetime = 0;
481  TransactionId OldestXmin;
482  TransactionId FreezeLimit;
483  MultiXactId MultiXactCutoff;
484 
485  /* measure elapsed time iff autovacuum logging requires it */
486  if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
487  {
488  pg_rusage_init(&ru0);
489  starttime = GetCurrentTimestamp();
490  if (track_io_timing)
491  {
492  startreadtime = pgStatBlockReadTime;
493  startwritetime = pgStatBlockWriteTime;
494  }
495  }
496 
497  if (params->options & VACOPT_VERBOSE)
498  elevel = INFO;
499  else
500  elevel = DEBUG2;
501 
503  RelationGetRelid(rel));
504 
506  params->freeze_min_age,
507  params->freeze_table_age,
508  params->multixact_freeze_min_age,
510  &OldestXmin, &FreezeLimit, &xidFullScanLimit,
511  &MultiXactCutoff, &mxactFullScanLimit);
512 
513  /*
514  * We request an aggressive scan if the table's frozen Xid is now older
515  * than or equal to the requested Xid full-table scan limit; or if the
516  * table's minimum MultiXactId is older than or equal to the requested
517  * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
518  */
519  aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
520  xidFullScanLimit);
521  aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
522  mxactFullScanLimit);
524  aggressive = true;
525 
526  vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
527 
528  /* Set up high level stuff about rel */
529  vacrel->rel = rel;
530  vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
531  &vacrel->indrels);
532  vacrel->failsafe_active = false;
533  vacrel->consider_bypass_optimization = true;
534 
535  /*
536  * The index_cleanup param either disables index vacuuming and cleanup or
537  * forces it to go ahead when we would otherwise apply the index bypass
538  * optimization. The default is 'auto', which leaves the final decision
539  * up to lazy_vacuum().
540  *
541  * The truncate param allows the user to avoid attempting relation truncation,
542  * though it can't force truncation to happen.
543  */
546  params->truncate != VACOPTVALUE_AUTO);
547  vacrel->do_index_vacuuming = true;
548  vacrel->do_index_cleanup = true;
549  vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
550  if (params->index_cleanup == VACOPTVALUE_DISABLED)
551  {
552  /* Force disable index vacuuming up-front */
553  vacrel->do_index_vacuuming = false;
554  vacrel->do_index_cleanup = false;
555  }
556  else if (params->index_cleanup == VACOPTVALUE_ENABLED)
557  {
558  /* Force index vacuuming. Note that failsafe can still bypass. */
559  vacrel->consider_bypass_optimization = false;
560  }
561  else
562  {
563  /* Default/auto, make all decisions dynamically */
565  }
566 
567  vacrel->bstrategy = bstrategy;
568  vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
569  vacrel->relminmxid = rel->rd_rel->relminmxid;
570  vacrel->old_live_tuples = rel->rd_rel->reltuples;
571 
572  /* Set cutoffs for entire VACUUM */
573  vacrel->OldestXmin = OldestXmin;
574  vacrel->FreezeLimit = FreezeLimit;
575  vacrel->MultiXactCutoff = MultiXactCutoff;
576 
578  vacrel->relname = pstrdup(RelationGetRelationName(rel));
579  vacrel->indname = NULL;
581 
582  /* Save index names iff autovacuum logging requires it */
583  if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
584  vacrel->nindexes > 0)
585  {
586  indnames = palloc(sizeof(char *) * vacrel->nindexes);
587  for (int i = 0; i < vacrel->nindexes; i++)
588  indnames[i] =
590  }
591 
592  /*
593  * Setup error traceback support for ereport(). The idea is to set up an
594  * error context callback to display additional information on any error
595  * during a vacuum. During different phases of vacuum (heap scan, heap
596  * vacuum, index vacuum, index clean up, heap truncate), we update the
597  * error context callback to display appropriate information.
598  *
599  * Note that the index vacuum and heap vacuum phases may be called
600  * multiple times in the middle of the heap scan phase. So the old phase
601  * information is restored at the end of those phases.
602  */
603  errcallback.callback = vacuum_error_callback;
604  errcallback.arg = vacrel;
605  errcallback.previous = error_context_stack;
606  error_context_stack = &errcallback;
607 
608  /*
609  * Call lazy_scan_heap to perform all required heap pruning, index
610  * vacuuming, and heap vacuuming (plus related processing)
611  */
612  lazy_scan_heap(vacrel, params, aggressive);
613 
614  /* Done with indexes */
615  vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
616 
617  /*
618  * Compute whether we actually scanned all of the unfrozen pages. If we did,
619  * we can adjust relfrozenxid and relminmxid.
620  *
621  * NB: We need to check this before truncating the relation, because that
622  * will change ->rel_pages.
623  */
624  if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
625  < vacrel->rel_pages)
626  {
627  Assert(!aggressive);
628  scanned_all_unfrozen = false;
629  }
630  else
631  scanned_all_unfrozen = true;
632 
633  /*
634  * Optionally truncate the relation.
635  */
636  if (should_attempt_truncation(vacrel))
637  {
638  /*
639  * Update error traceback information. This is the last phase during
640  * which we add context information to errors, so we don't need to
641  * revert to the previous phase.
642  */
644  vacrel->nonempty_pages,
646  lazy_truncate_heap(vacrel);
647  }
648 
649  /* Pop the error context stack */
650  error_context_stack = errcallback.previous;
651 
652  /* Report that we are now doing final cleanup */
655 
656  /*
657  * Update statistics in pg_class.
658  *
659  * In principle new_live_tuples could be -1 indicating that we (still)
660  * don't know the tuple count. In practice that probably can't happen,
661  * since we'd surely have scanned some pages if the table is new and
662  * nonempty.
663  *
664  * For safety, clamp relallvisible to be not more than what we're setting
665  * relpages to.
666  *
667  * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
668  * since then we don't know for certain that all tuples have a newer xmin.
669  */
670  new_rel_pages = vacrel->rel_pages;
671  new_live_tuples = vacrel->new_live_tuples;
672 
673  visibilitymap_count(rel, &new_rel_allvisible, NULL);
674  if (new_rel_allvisible > new_rel_pages)
675  new_rel_allvisible = new_rel_pages;
676 
677  new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
678  new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
679 
681  new_rel_pages,
682  new_live_tuples,
683  new_rel_allvisible,
684  vacrel->nindexes > 0,
685  new_frozen_xid,
686  new_min_multi,
687  false);
688 
689  /*
690  * Report results to the stats collector, too.
691  *
692  * Deliberately avoid telling the stats collector about LP_DEAD items that
693  * remain in the table due to VACUUM bypassing index and heap vacuuming.
694  * ANALYZE will consider the remaining LP_DEAD items to be dead "tuples".
695  * It seems like a good idea to err on the side of not vacuuming again too
696  * soon in cases where the failsafe prevented significant amounts of heap
697  * vacuuming.
698  */
700  rel->rd_rel->relisshared,
701  Max(new_live_tuples, 0),
702  vacrel->new_dead_tuples);
704 
705  /* and log the action if appropriate */
706  if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
707  {
708  TimestampTz endtime = GetCurrentTimestamp();
709 
710  if (params->log_min_duration == 0 ||
711  TimestampDifferenceExceeds(starttime, endtime,
712  params->log_min_duration))
713  {
715  char *msgfmt;
716  BlockNumber orig_rel_pages;
717 
718  TimestampDifference(starttime, endtime, &secs, &usecs);
719 
720  memset(&walusage, 0, sizeof(WalUsage));
721  WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
722 
723  read_rate = 0;
724  write_rate = 0;
725  if ((secs > 0) || (usecs > 0))
726  {
727  read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
728  (secs + usecs / 1000000.0);
729  write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
730  (secs + usecs / 1000000.0);
731  }
732 
733  /*
734  * This is pretty messy, but we split it up so that we can skip
735  * emitting individual parts of the message when not applicable.
736  */
738  if (params->is_wraparound)
739  {
740  /*
741  * While it's possible for a VACUUM to be both is_wraparound
742  * and !aggressive, that's just a corner-case -- is_wraparound
743  * implies aggressive. Produce distinct output for the corner
744  * case all the same, just in case.
745  */
746  if (aggressive)
747  msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
748  else
749  msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
750  }
751  else
752  {
753  if (aggressive)
754  msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
755  else
756  msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
757  }
758  appendStringInfo(&buf, msgfmt,
760  vacrel->relnamespace,
761  vacrel->relname,
762  vacrel->num_index_scans);
763  appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
764  vacrel->pages_removed,
765  vacrel->rel_pages,
766  vacrel->pinskipped_pages,
767  vacrel->frozenskipped_pages);
769  _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
770  (long long) vacrel->tuples_deleted,
771  (long long) vacrel->new_rel_tuples,
772  (long long) vacrel->new_dead_tuples,
773  OldestXmin);
774  orig_rel_pages = vacrel->rel_pages + vacrel->pages_removed;
775  if (orig_rel_pages > 0)
776  {
777  if (vacrel->do_index_vacuuming)
778  {
779  if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
780  appendStringInfoString(&buf, _("index scan not needed: "));
781  else
782  appendStringInfoString(&buf, _("index scan needed: "));
783 
784  msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
785  }
786  else
787  {
788  if (!vacrel->failsafe_active)
789  appendStringInfoString(&buf, _("index scan bypassed: "));
790  else
791  appendStringInfoString(&buf, _("index scan bypassed by failsafe: "));
792 
793  msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
794  }
795  appendStringInfo(&buf, msgfmt,
796  vacrel->lpdead_item_pages,
797  100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
798  (long long) vacrel->lpdead_items);
799  }
800  for (int i = 0; i < vacrel->nindexes; i++)
801  {
802  IndexBulkDeleteResult *istat = vacrel->indstats[i];
803 
804  if (!istat)
805  continue;
806 
808  _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
809  indnames[i],
810  istat->num_pages,
811  istat->pages_newly_deleted,
812  istat->pages_deleted,
813  istat->pages_free);
814  }
815  if (track_io_timing)
816  {
817  double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000;
818  double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000;
819 
820  appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
821  read_ms, write_ms);
822  }
823  appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
824  read_rate, write_rate);
826  _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
827  (long long) VacuumPageHit,
828  (long long) VacuumPageMiss,
829  (long long) VacuumPageDirty);
831  _("WAL usage: %lld records, %lld full page images, %llu bytes\n"),
832  (long long) walusage.wal_records,
833  (long long) walusage.wal_fpi,
834  (unsigned long long) walusage.wal_bytes);
835  appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
836 
837  ereport(LOG,
838  (errmsg_internal("%s", buf.data)));
839  pfree(buf.data);
840  }
841  }
842 
843  /* Cleanup index statistics and index names */
844  for (int i = 0; i < vacrel->nindexes; i++)
845  {
846  if (vacrel->indstats[i])
847  pfree(vacrel->indstats[i]);
848 
849  if (indnames && indnames[i])
850  pfree(indnames[i]);
851  }
852 }
853 
854 /*
855  * lazy_scan_heap() -- workhorse function for VACUUM
856  *
857  * This routine prunes each page in the heap, and considers the need to
858  * freeze remaining tuples with storage (not including pages that can be
859  * skipped using the visibility map). Also performs related maintenance
860  * of the FSM and visibility map. These steps all take place during an
861  * initial pass over the target heap relation.
862  *
863  * Also invokes lazy_vacuum_all_indexes to vacuum indexes, which largely
864  * consists of deleting index tuples that point to LP_DEAD items left in
865  * heap pages following pruning. The earlier initial pass over the heap will
866  * have collected the TIDs whose index tuples need to be removed.
867  *
868  * Finally, invokes lazy_vacuum_heap_rel to vacuum heap pages, which
869  * largely consists of marking LP_DEAD items (from collected TID array)
870  * as LP_UNUSED. This has to happen in a second, final pass over the
871  * heap, to preserve a basic invariant that all index AMs rely on: no
872  * extant index tuple can ever be allowed to contain a TID that points to
873  * an LP_UNUSED line pointer in the heap. We must disallow premature
874  * recycling of line pointers to avoid index scans that get confused
875  * about which TID points to which tuple immediately after recycling.
876  * (Actually, this isn't a concern when the target heap relation happens to
877  * have no indexes, which allows us to safely apply the one-pass strategy
878  * as an optimization).
879  *
880  * In practice we often have enough space to fit all TIDs, and so won't
881  * need to call lazy_vacuum more than once, after our initial pass over
882  * the heap has totally finished. Otherwise things are slightly more
883  * complicated: our "initial pass" over the heap applies only to those
884  * pages that were pruned before we needed to call lazy_vacuum, and our
885  * "final pass" over the heap only vacuums these same heap pages.
886  * However, we process indexes in full every time lazy_vacuum is called,
887  * which makes index processing very inefficient when memory is in short
888  * supply.
889  */
890 static void
891 lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
892 {
893  LVDeadItems *dead_items;
894  BlockNumber nblocks,
895  blkno,
896  next_unskippable_block,
897  next_failsafe_block,
898  next_fsm_block_to_vacuum;
899  PGRUsage ru0;
900  Buffer vmbuffer = InvalidBuffer;
901  bool skipping_blocks;
903  const int initprog_index[] = {
907  };
908  int64 initprog_val[3];
909  GlobalVisState *vistest;
910 
911  pg_rusage_init(&ru0);
912 
913  if (aggressive)
914  ereport(elevel,
915  (errmsg("aggressively vacuuming \"%s.%s\"",
916  vacrel->relnamespace,
917  vacrel->relname)));
918  else
919  ereport(elevel,
920  (errmsg("vacuuming \"%s.%s\"",
921  vacrel->relnamespace,
922  vacrel->relname)));
923 
924  nblocks = RelationGetNumberOfBlocks(vacrel->rel);
925  next_unskippable_block = 0;
926  next_failsafe_block = 0;
927  next_fsm_block_to_vacuum = 0;
928  vacrel->rel_pages = nblocks;
929  vacrel->scanned_pages = 0;
930  vacrel->pinskipped_pages = 0;
931  vacrel->frozenskipped_pages = 0;
932  vacrel->tupcount_pages = 0;
933  vacrel->pages_removed = 0;
934  vacrel->lpdead_item_pages = 0;
935  vacrel->nonempty_pages = 0;
936 
937  /* Initialize instrumentation counters */
938  vacrel->num_index_scans = 0;
939  vacrel->tuples_deleted = 0;
940  vacrel->lpdead_items = 0;
941  vacrel->new_dead_tuples = 0;
942  vacrel->num_tuples = 0;
943  vacrel->live_tuples = 0;
944 
945  vistest = GlobalVisTestFor(vacrel->rel);
946 
947  vacrel->indstats = (IndexBulkDeleteResult **)
948  palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
949 
950  /*
951  * Do failsafe precheck before calling dead_items_alloc. This ensures
952  * that parallel VACUUM won't be attempted when relfrozenxid is already
953  * dangerously old.
954  */
956 
957  /*
958  * Allocate the space for dead_items. Note that this handles parallel
959  * VACUUM initialization as part of allocating shared memory space used
960  * for dead_items.
961  */
962  dead_items_alloc(vacrel, params->nworkers);
963  dead_items = vacrel->dead_items;
964 
965  /* Report that we're scanning the heap, advertising total # of blocks */
966  initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
967  initprog_val[1] = nblocks;
968  initprog_val[2] = dead_items->max_items;
969  pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
970 
971  /*
972  * Except when aggressive is set, we want to skip pages that are
973  * all-visible according to the visibility map, but only when we can skip
974  * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
975  * sequentially, the OS should be doing readahead for us, so there's no
976  * gain in skipping a page now and then; that's likely to disable
977  * readahead and so be counterproductive. Also, skipping even a single
978  * page means that we can't update relfrozenxid, so we only want to do it
979  * if we can skip a goodly number of pages.
980  *
981  * When aggressive is set, we can't skip pages just because they are
982  * all-visible, but we can still skip pages that are all-frozen, since
983  * such pages do not need freezing and do not affect the value that we can
984  * safely set for relfrozenxid or relminmxid.
985  *
986  * Before entering the main loop, establish the invariant that
987  * next_unskippable_block is the next block number >= blkno that we can't
988  * skip based on the visibility map, either all-visible for a regular scan
989  * or all-frozen for an aggressive scan. We set it to nblocks if there's
990  * no such block. We also set up the skipping_blocks flag correctly at
991  * this stage.
992  *
993  * Note: The value returned by visibilitymap_get_status could be slightly
994  * out-of-date, since we make this test before reading the corresponding
995  * heap page or locking the buffer. This is OK. If we mistakenly think
996  * that the page is all-visible or all-frozen when in fact the flag's just
997  * been cleared, we might fail to vacuum the page. It's easy to see that
998  * skipping a page when aggressive is not set is not a very big deal; we
999  * might leave some dead tuples lying around, but the next vacuum will
1000  * find them. But even when aggressive *is* set, it's still OK if we miss
1001  * a page whose all-frozen marking has just been cleared. Any new XIDs
1002  * just added to that page are necessarily newer than the GlobalXmin we
1003  * computed, so they'll have no effect on the value to which we can safely
1004  * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
1005  *
1006  * We will scan the table's last page, at least to the extent of
1007  * determining whether it has tuples or not, even if it should be skipped
1008  * according to the above rules; except when we've already determined that
1009  * it's not worth trying to truncate the table. This avoids having
1010  * lazy_truncate_heap() take access-exclusive lock on the table to attempt
1011  * a truncation that just fails immediately because there are tuples in
1012  * the last page. This is worth avoiding mainly because such a lock must
1013  * be replayed on any hot standby, where it can be disruptive.
1014  */
1015  if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1016  {
1017  while (next_unskippable_block < nblocks)
1018  {
1019  uint8 vmstatus;
1020 
1021  vmstatus = visibilitymap_get_status(vacrel->rel,
1022  next_unskippable_block,
1023  &vmbuffer);
1024  if (aggressive)
1025  {
1026  if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
1027  break;
1028  }
1029  else
1030  {
1031  if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
1032  break;
1033  }
1035  next_unskippable_block++;
1036  }
1037  }
1038 
1039  if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
1040  skipping_blocks = true;
1041  else
1042  skipping_blocks = false;
1043 
1044  for (blkno = 0; blkno < nblocks; blkno++)
1045  {
1046  Buffer buf;
1047  Page page;
1048  bool all_visible_according_to_vm = false;
1049  LVPagePruneState prunestate;
1050 
1051  /*
1052  * Consider need to skip blocks. See note above about forcing
1053  * scanning of last page.
1054  */
1055 #define FORCE_CHECK_PAGE() \
1056  (blkno == nblocks - 1 && should_attempt_truncation(vacrel))
1057 
1059 
1061  blkno, InvalidOffsetNumber);
1062 
1063  if (blkno == next_unskippable_block)
1064  {
1065  /* Time to advance next_unskippable_block */
1066  next_unskippable_block++;
1067  if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1068  {
1069  while (next_unskippable_block < nblocks)
1070  {
1071  uint8 vmskipflags;
1072 
1073  vmskipflags = visibilitymap_get_status(vacrel->rel,
1074  next_unskippable_block,
1075  &vmbuffer);
1076  if (aggressive)
1077  {
1078  if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
1079  break;
1080  }
1081  else
1082  {
1083  if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
1084  break;
1085  }
1087  next_unskippable_block++;
1088  }
1089  }
1090 
1091  /*
1092  * We know we can't skip the current block. But set up
1093  * skipping_blocks to do the right thing at the following blocks.
1094  */
1095  if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
1096  skipping_blocks = true;
1097  else
1098  skipping_blocks = false;
1099 
1100  /*
1101  * Normally, the fact that we can't skip this block must mean that
1102  * it's not all-visible. But in an aggressive vacuum we know only
1103  * that it's not all-frozen, so it might still be all-visible.
1104  */
1105  if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1106  all_visible_according_to_vm = true;
1107  }
1108  else
1109  {
1110  /*
1111  * The current block is potentially skippable; if we've seen a
1112  * long enough run of skippable blocks to justify skipping it, and
1113  * we're not forced to check it, then go ahead and skip.
1114  * Otherwise, the page must be at least all-visible if not
1115  * all-frozen, so we can set all_visible_according_to_vm = true.
1116  */
1117  if (skipping_blocks && !FORCE_CHECK_PAGE())
1118  {
1119  /*
1120  * Tricky, tricky. If this is in aggressive vacuum, the page
1121  * must have been all-frozen at the time we checked whether it
1122  * was skippable, but it might not be any more. We must be
1123  * careful to count it as a skipped all-frozen page in that
1124  * case, or else we'll think we can't update relfrozenxid and
1125  * relminmxid. If it's not an aggressive vacuum, we don't
1126  * know whether it was all-frozen, so we have to recheck; but
1127  * in this case an approximate answer is OK.
1128  */
1129  if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1130  vacrel->frozenskipped_pages++;
1131  continue;
1132  }
1133  all_visible_according_to_vm = true;
1134  }
1135 
1137 
1138  /*
1139  * Regularly check if wraparound failsafe should trigger.
1140  *
1141  * There is a similar check inside lazy_vacuum_all_indexes(), but
1142  * relfrozenxid might start to look dangerously old before we reach
1143  * that point. This check also provides failsafe coverage for the
1144  * one-pass strategy, and the two-pass strategy with the index_cleanup
1145  * param set to 'off'.
1146  */
1147  if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES)
1148  {
1150  next_failsafe_block = blkno;
1151  }
1152 
1153  /*
1154  * Consider whether we definitely have enough space to process the TIDs on
1155  * this page already. If we are close to overrunning the available space for
1156  * dead_items TIDs, pause and do a cycle of vacuuming before we tackle
1157  * this page.
1158  */
1159  Assert(dead_items->max_items >= MaxHeapTuplesPerPage);
1160  if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage)
1161  {
1162  /*
1163  * Before beginning index vacuuming, we release any pin we may
1164  * hold on the visibility map page. This isn't necessary for
1165  * correctness, but we do it anyway to avoid holding the pin
1166  * across a lengthy, unrelated operation.
1167  */
1168  if (BufferIsValid(vmbuffer))
1169  {
1170  ReleaseBuffer(vmbuffer);
1171  vmbuffer = InvalidBuffer;
1172  }
1173 
1174  /* Perform a round of index and heap vacuuming */
1175  vacrel->consider_bypass_optimization = false;
1176  lazy_vacuum(vacrel);
1177 
1178  /*
1179  * Vacuum the Free Space Map to make newly-freed space visible on
1180  * upper-level FSM pages. Note we have not yet processed blkno.
1181  */
1182  FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1183  blkno);
1184  next_fsm_block_to_vacuum = blkno;
1185 
1186  /* Report that we are once again scanning the heap */
1189  }
1190 
1191  /*
1192  * Set up visibility map page as needed.
1193  *
1194  * Pin the visibility map page in case we need to mark the page
1195  * all-visible. In most cases this will be very cheap, because we'll
1196  * already have the correct page pinned anyway. However, it's
1197  * possible that (a) next_unskippable_block is covered by a different
1198  * VM page than the current block or (b) we released our pin and did a
1199  * cycle of index vacuuming.
1200  */
1201  visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
1202 
1203  buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
1204  RBM_NORMAL, vacrel->bstrategy);
1205 
1206  /*
1207  * We need buffer cleanup lock so that we can prune HOT chains and
1208  * defragment the page.
1209  */
1211  {
1212  bool hastup;
1213 
1214  /*
1215  * If we're not performing an aggressive scan to guard against XID
1216  * wraparound, and we don't want to forcibly check the page, then
1217  * it's OK to skip vacuuming pages we get a lock conflict on. They
1218  * will be dealt with in some future vacuum.
1219  */
1220  if (!aggressive && !FORCE_CHECK_PAGE())
1221  {
1222  ReleaseBuffer(buf);
1223  vacrel->pinskipped_pages++;
1224  continue;
1225  }
1226 
1227  /*
1228  * Read the page with share lock to see if any xids on it need to
1229  * be frozen. If not we just skip the page, after updating our
1230  * scan statistics. If there are some, we wait for cleanup lock.
1231  *
1232  * We could defer the lock request further by remembering the page
1233  * and coming back to it later, or we could even register
1234  * ourselves for multiple buffers and then service whichever one
1235  * is received first. For now, this seems good enough.
1236  *
1237  * If we get here with aggressive false, then we're just forcibly
1238  * checking the page, and so we don't want to insist on getting
1239  * the lock; we only need to know if the page contains tuples, so
1240  * that we can update nonempty_pages correctly. It's convenient
1241  * to use lazy_check_needs_freeze() for both situations, though.
1242  */
1244  if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
1245  {
1247  vacrel->scanned_pages++;
1248  vacrel->pinskipped_pages++;
1249  if (hastup)
1250  vacrel->nonempty_pages = blkno + 1;
1251  continue;
1252  }
1253  if (!aggressive)
1254  {
1255  /*
1256  * Here, we must not advance scanned_pages; that would amount
1257  * to claiming that the page contains no freezable tuples.
1258  */
1260  vacrel->pinskipped_pages++;
1261  if (hastup)
1262  vacrel->nonempty_pages = blkno + 1;
1263  continue;
1264  }
1267  /* drop through to normal processing */
1268  }
1269 
1270  /*
1271  * By here we definitely have enough dead_items space for whatever
1272  * LP_DEAD tids are on this page, we have the visibility map page set
1273  * up in case we need to set this page's all_visible/all_frozen bit,
1274  * and we have a cleanup lock. Any tuples on this page are now sure
1275  * to be "counted" by this VACUUM.
1276  *
1277  * One last piece of preamble needs to take place before we can prune:
1278  * we need to consider new and empty pages.
1279  */
1280  vacrel->scanned_pages++;
1281  vacrel->tupcount_pages++;
1282 
1283  page = BufferGetPage(buf);
1284 
1285  if (PageIsNew(page))
1286  {
1287  /*
1288  * All-zeroes pages can be left over if either a backend extends
1289  * the relation by a single page, but crashes before the newly
1290  * initialized page has been written out, or when bulk-extending
1291  * the relation (which creates a number of empty pages at the tail
1292  * end of the relation, but enters them into the FSM).
1293  *
1294  * Note we do not enter the page into the visibilitymap. That has
1295  * the downside that we repeatedly visit this page in subsequent
1296  * vacuums, but if we did enter it, we would never discover the space on a
1297  * promoted standby. The harm of repeated checking ought to
1298  * normally not be too bad - the space usually should be used at
1299  * some point, otherwise there wouldn't be any regular vacuums.
1300  *
1301  * Make sure these pages are in the FSM, to ensure they can be
1302  * reused. Do that by testing if there's any space recorded for
1303  * the page. If not, enter it. We do so after releasing the lock
1304  * on the heap page, the FSM is approximate, after all.
1305  */
1307 
1308  if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1309  {
1310  Size freespace = BLCKSZ - SizeOfPageHeaderData;
1311 
1312  RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1313  }
1314  continue;
1315  }
1316 
1317  if (PageIsEmpty(page))
1318  {
1319  Size freespace = PageGetHeapFreeSpace(page);
1320 
1321  /*
1322  * Empty pages are always all-visible and all-frozen (note that
1323  * the same is currently not true for new pages, see above).
1324  */
1325  if (!PageIsAllVisible(page))
1326  {
1328 
1329  /* mark buffer dirty before writing a WAL record */
1331 
1332  /*
1333  * It's possible that another backend has extended the heap,
1334  * initialized the page, and then failed to WAL-log the page
1335  * due to an ERROR. Since heap extension is not WAL-logged,
1336  * recovery might try to replay our record setting the page
1337  * all-visible and find that the page isn't initialized, which
1338  * will cause a PANIC. To prevent that, check whether the
1339  * page has been previously WAL-logged, and if not, do that
1340  * now.
1341  */
1342  if (RelationNeedsWAL(vacrel->rel) &&
1343  PageGetLSN(page) == InvalidXLogRecPtr)
1344  log_newpage_buffer(buf, true);
1345 
1346  PageSetAllVisible(page);
1347  visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1348  vmbuffer, InvalidTransactionId,
1350  END_CRIT_SECTION();
1351  }
1352 
1354  RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1355  continue;
1356  }
1357 
1358  /*
1359  * Prune and freeze tuples.
1360  *
1361  * Accumulates details of remaining LP_DEAD line pointers on page in
1362  * dead_items array. This includes LP_DEAD line pointers that we
1363  * pruned ourselves, as well as existing LP_DEAD line pointers that
1364  * were pruned some time earlier. Also considers freezing XIDs in the
1365  * tuple headers of remaining items with storage.
1366  */
1367  lazy_scan_prune(vacrel, buf, blkno, page, vistest, &prunestate);
1368 
1369  Assert(!prunestate.all_visible || !prunestate.has_lpdead_items);
1370 
1371  /* Remember the location of the last page with nonremovable tuples */
1372  if (prunestate.hastup)
1373  vacrel->nonempty_pages = blkno + 1;
1374 
1375  if (vacrel->nindexes == 0)
1376  {
1377  /*
1378  * Consider the need to do page-at-a-time heap vacuuming when
1379  * using the one-pass strategy now.
1380  *
1381  * The one-pass strategy will never call lazy_vacuum(). The steps
1382  * performed here can be thought of as the one-pass equivalent of
1383  * a call to lazy_vacuum().
1384  */
1385  if (prunestate.has_lpdead_items)
1386  {
1387  Size freespace;
1388 
1389  lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1390 
1391  /* Forget the LP_DEAD items that we just vacuumed */
1392  dead_items->num_items = 0;
1393 
1394  /*
1395  * Periodically perform FSM vacuuming to make newly-freed
1396  * space visible on upper FSM pages. Note we have not yet
1397  * performed FSM processing for blkno.
1398  */
1399  if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1400  {
1401  FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1402  blkno);
1403  next_fsm_block_to_vacuum = blkno;
1404  }
1405 
1406  /*
1407  * Now perform FSM processing for blkno, and move on to next
1408  * page.
1409  *
1410  * Our call to lazy_vacuum_heap_page() will have considered if
1411  * it's possible to set all_visible/all_frozen independently
1412  * of lazy_scan_prune(). Note that prunestate was invalidated
1413  * by lazy_vacuum_heap_page() call.
1414  */
1415  freespace = PageGetHeapFreeSpace(page);
1416 
1418  RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1419  continue;
1420  }
1421 
1422  /*
1423  * There was no call to lazy_vacuum_heap_page() because pruning
1424  * didn't encounter/create any LP_DEAD items that needed to be
1425  * vacuumed. Prune state has not been invalidated, so proceed
1426  * with prunestate-driven visibility map and FSM steps (just like
1427  * the two-pass strategy).
1428  */
1429  Assert(dead_items->num_items == 0);
1430  }
1431 
1432  /*
1433  * Handle setting visibility map bit based on what the VM said about
1434  * the page before pruning started, and using prunestate
1435  */
1436  if (!all_visible_according_to_vm && prunestate.all_visible)
1437  {
1439 
1440  if (prunestate.all_frozen)
1441  flags |= VISIBILITYMAP_ALL_FROZEN;
1442 
1443  /*
1444  * It should never be the case that the visibility map page is set
1445  * while the page-level bit is clear, but the reverse is allowed
1446  * (if checksums are not enabled). Regardless, set both bits so
1447  * that we get back in sync.
1448  *
1449  * NB: If the heap page is all-visible but the VM bit is not set,
1450  * we don't need to dirty the heap page. However, if checksums
1451  * are enabled, we do need to make sure that the heap page is
1452  * dirtied before passing it to visibilitymap_set(), because it
1453  * may be logged. Given that this situation should only happen in
1454  * rare cases after a crash, it is not worth optimizing.
1455  */
1456  PageSetAllVisible(page);
1458  visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1459  vmbuffer, prunestate.visibility_cutoff_xid,
1460  flags);
1461  }
1462 
1463  /*
1464  * As of PostgreSQL 9.2, the visibility map bit should never be set if
1465  * the page-level bit is clear. However, it's possible that the bit
1466  * got cleared after we checked it and before we took the buffer
1467  * content lock, so we must recheck before jumping to the conclusion
1468  * that something bad has happened.
1469  */
1470  else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1471  && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1472  {
1473  elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1474  vacrel->relname, blkno);
1475  visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1477  }
1478 
1479  /*
1480  * It's possible for the value returned by
1481  * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1482  * wrong for us to see tuples that appear to not be visible to
1483  * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1484  * xmin value never moves backwards, but
1485  * GetOldestNonRemovableTransactionId() is conservative and sometimes
1486  * returns a value that's unnecessarily small, so if we see that
1487  * contradiction it just means that the tuples that we think are not
1488  * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1489  * is correct.
1490  *
1491  * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE
1492  * set, however.
1493  */
1494  else if (prunestate.has_lpdead_items && PageIsAllVisible(page))
1495  {
1496  elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u",
1497  vacrel->relname, blkno);
1498  PageClearAllVisible(page);
1500  visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1502  }
1503 
1504  /*
1505  * If the all-visible page is all-frozen but not marked as such yet,
1506  * mark it as all-frozen. Note that all_frozen is only valid if
1507  * all_visible is true, so we must check both.
1508  */
1509  else if (all_visible_according_to_vm && prunestate.all_visible &&
1510  prunestate.all_frozen &&
1511  !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1512  {
1513  /*
1514  * We can pass InvalidTransactionId as the cutoff XID here,
1515  * because setting the all-frozen bit doesn't cause recovery
1516  * conflicts.
1517  */
1518  visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1519  vmbuffer, InvalidTransactionId,
1521  }
1522 
1523  /*
1524  * Final steps for block: drop cleanup lock, record free space in the
1525  * FSM
1526  */
1527  if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming)
1528  {
1529  /*
1530  * Wait until lazy_vacuum_heap_rel() to save free space. This
1531  * doesn't just save us some cycles; it also allows us to record
1532  * any additional free space that lazy_vacuum_heap_page() will
1533  * make available in cases where it's possible to truncate the
1534  * page's line pointer array.
1535  *
1536  * Note: It's not in fact 100% certain that we really will call
1537  * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
1538  * index vacuuming (and so must skip heap vacuuming). This is
1539  * deemed okay because it only happens in emergencies, or when
1540  * there is very little free space anyway. (Besides, we start
1541  * recording free space in the FSM once index vacuuming has been
1542  * abandoned.)
1543  *
1544  * Note: The one-pass (no indexes) case is only supposed to make
1545  * it this far when there were no LP_DEAD items during pruning.
1546  */
1547  Assert(vacrel->nindexes > 0);
1549  }
1550  else
1551  {
1552  Size freespace = PageGetHeapFreeSpace(page);
1553 
1555  RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1556  }
1557  }
1558 
1559  /* report that everything is now scanned */
1561 
1562  /* Clear the block number information */
1563  vacrel->blkno = InvalidBlockNumber;
1564 
1565  /* now we can compute the new value for pg_class.reltuples */
1566  vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1567  vacrel->tupcount_pages,
1568  vacrel->live_tuples);
1569 
1570  /*
1571  * Also compute the total number of surviving heap entries. In the
1572  * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1573  */
1574  vacrel->new_rel_tuples =
1575  Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1576 
1577  /*
1578  * Release any remaining pin on visibility map page.
1579  */
1580  if (BufferIsValid(vmbuffer))
1581  {
1582  ReleaseBuffer(vmbuffer);
1583  vmbuffer = InvalidBuffer;
1584  }
1585 
1586  /* Perform a final round of index and heap vacuuming */
1587  if (dead_items->num_items > 0)
1588  lazy_vacuum(vacrel);
1589 
1590  /*
1591  * Vacuum the remainder of the Free Space Map. We must do this whether or
1592  * not there were indexes, and whether or not we bypassed index vacuuming.
1593  */
1594  if (blkno > next_fsm_block_to_vacuum)
1595  FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1596 
1597  /* report all blocks vacuumed */
1599 
1600  /* Do post-vacuum cleanup */
1601  if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1602  lazy_cleanup_all_indexes(vacrel);
1603 
1604  /*
1605  * Free resources managed by dead_items_alloc. This will end parallel
1606  * mode when needed (it must end before we update index statistics).
1607  */
1608  dead_items_cleanup(vacrel);
1609 
1610  /* Update index statistics */
1611  if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1612  update_index_statistics(vacrel);
1613 
1614  /*
1615  * When the table has no indexes (i.e. in the one-pass strategy case),
1616  * make the log report that lazy_vacuum_heap_rel would've made had there been
1617  * indexes. (As in the two-pass strategy case, only make this report when
1618  * there were LP_DEAD line pointers vacuumed in lazy_vacuum_heap_page.)
1619  */
1620  if (vacrel->nindexes == 0 && vacrel->lpdead_item_pages > 0)
1621  ereport(elevel,
1622  (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
1623  vacrel->relname, (long long) vacrel->lpdead_items,
1624  vacrel->lpdead_item_pages)));
1625 
1626  /*
1627  * Make a log report summarizing pruning and freezing.
1628  *
1629  * The autovacuum specific logging in heap_vacuum_rel summarizes an entire
1630  * VACUUM operation, whereas each VACUUM VERBOSE log report generally
1631  * summarizes a single round of index/heap vacuuming (or rel truncation).
1632  * Pruning and freezing happen only during the initial heap pass, so they
1633  * don't fit that per-round convention. Think of this log report as a
1634  * summary of our first pass over the heap.
1635  */
1636  initStringInfo(&buf);
1637  appendStringInfo(&buf,
1638  _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"),
1639  (long long) vacrel->new_dead_tuples, vacrel->OldestXmin);
1640  appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1641  "Skipped %u pages due to buffer pins, ",
1642  vacrel->pinskipped_pages),
1643  vacrel->pinskipped_pages);
1644  appendStringInfo(&buf, ngettext("%u frozen page.\n",
1645  "%u frozen pages.\n",
1646  vacrel->frozenskipped_pages),
1647  vacrel->frozenskipped_pages);
1648  appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1649 
1650  ereport(elevel,
1651  (errmsg("table \"%s.%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages",
1652  vacrel->relnamespace,
1653  vacrel->relname,
1654  (long long) vacrel->tuples_deleted,
1655  (long long) vacrel->num_tuples, vacrel->scanned_pages,
1656  nblocks),
1657  errdetail_internal("%s", buf.data)));
1658  pfree(buf.data);
1659 }
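/*
 * Editor's note: a minimal, self-contained sketch of the surviving-tuples
 * arithmetic described above. It assumes only the -1 "estimate unknown"
 * convention mentioned in the comment; names here are illustrative, not
 * PostgreSQL definitions.
 */
static double
demo_new_rel_tuples(double new_live_tuples, long long new_dead_tuples)
{
	/* treat an unknown (-1) live-tuple estimate as zero */
	if (new_live_tuples < 0)
		new_live_tuples = 0;

	return new_live_tuples + (double) new_dead_tuples;
}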
1660 
1661 /*
1662  * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1663  *
1664  * Caller must hold pin and buffer cleanup lock on the buffer.
1665  *
1666  * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
1667  * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
1668  * whether or not a tuple should be considered DEAD. This happened when an
1669  * inserting transaction concurrently aborted (after our heap_page_prune()
1670  * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot
1671  * of complexity just so we could deal with tuples that were DEAD to VACUUM,
1672  * but nevertheless were left with storage after pruning.
1673  *
1674  * The approach we take now is to restart pruning when the race condition is
1675  * detected. This allows heap_page_prune() to prune the tuples inserted by
1676  * the now-aborted transaction. This is a little crude, but it guarantees
1677  * that any items that make it into the dead_items array are simple LP_DEAD
1678  * line pointers, and that every remaining item with tuple storage is
1679  * considered as a candidate for freezing.
1680  */
1681 static void
1682  lazy_scan_prune(LVRelState *vacrel,
1683  Buffer buf,
1684  BlockNumber blkno,
1685  Page page,
1686  GlobalVisState *vistest,
1687  LVPagePruneState *prunestate)
1688 {
1689  Relation rel = vacrel->rel;
1690  OffsetNumber offnum,
1691  maxoff;
1692  ItemId itemid;
1693  HeapTupleData tuple;
1694  HTSV_Result res;
1695  int tuples_deleted,
1696  lpdead_items,
1697  new_dead_tuples,
1698  num_tuples,
1699  live_tuples;
1700  int nnewlpdead;
1701  int nfrozen;
1702  OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1703  xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
1704 
1705  maxoff = PageGetMaxOffsetNumber(page);
1706 
1707 retry:
1708 
1709  /* Initialize (or reset) page-level counters */
1710  tuples_deleted = 0;
1711  lpdead_items = 0;
1712  new_dead_tuples = 0;
1713  num_tuples = 0;
1714  live_tuples = 0;
1715 
1716  /*
1717  * Prune all HOT-update chains in this page.
1718  *
1719  * We count tuples removed by the pruning step as tuples_deleted. Its
1720  * final value can be thought of as the number of tuples that have been
1721  * deleted from the table. It should not be confused with lpdead_items;
1722  * lpdead_items's final value can be thought of as the number of tuples
1723  * that were deleted from indexes.
1724  */
1725  tuples_deleted = heap_page_prune(rel, buf, vistest,
1726  InvalidTransactionId, 0, &nnewlpdead,
1727  &vacrel->offnum);
1728 
1729  /*
1730  * Now scan the page to collect LP_DEAD items and check for tuples
1731  * requiring freezing among remaining tuples with storage
1732  */
1733  prunestate->hastup = false;
1734  prunestate->has_lpdead_items = false;
1735  prunestate->all_visible = true;
1736  prunestate->all_frozen = true;
1737  prunestate->visibility_cutoff_xid = InvalidTransactionId;
1738  nfrozen = 0;
1739 
1740  for (offnum = FirstOffsetNumber;
1741  offnum <= maxoff;
1742  offnum = OffsetNumberNext(offnum))
1743  {
1744  bool tuple_totally_frozen;
1745 
1746  /*
1747  * Set the offset number so that we can display it along with any
1748  * error that occurred while processing this tuple.
1749  */
1750  vacrel->offnum = offnum;
1751  itemid = PageGetItemId(page, offnum);
1752 
1753  if (!ItemIdIsUsed(itemid))
1754  continue;
1755 
1756  /* Redirect items mustn't be touched */
1757  if (ItemIdIsRedirected(itemid))
1758  {
1759  prunestate->hastup = true; /* page won't be truncatable */
1760  continue;
1761  }
1762 
1763  /*
1764  * LP_DEAD items are processed outside of the loop.
1765  *
1766  * Note that we deliberately don't set hastup=true in the case of an
1767  * LP_DEAD item here, which is not how lazy_check_needs_freeze() or
1768  * count_nondeletable_pages() do it -- they only consider pages empty
1769  * when they only have LP_UNUSED items, which is important for
1770  * correctness.
1771  *
1772  * Our assumption is that any LP_DEAD items we encounter here will
1773  * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1774  * call count_nondeletable_pages(). In any case our opinion of
1775  * whether or not a page 'hastup' (which is how our caller sets its
1776  * vacrel->nonempty_pages value) is inherently race-prone. It must be
1777  * treated as advisory/unreliable, so we might as well be slightly
1778  * optimistic.
1779  */
1780  if (ItemIdIsDead(itemid))
1781  {
1782  deadoffsets[lpdead_items++] = offnum;
1783  prunestate->all_visible = false;
1784  prunestate->has_lpdead_items = true;
1785  continue;
1786  }
1787 
1788  Assert(ItemIdIsNormal(itemid));
1789 
1790  ItemPointerSet(&(tuple.t_self), blkno, offnum);
1791  tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1792  tuple.t_len = ItemIdGetLength(itemid);
1793  tuple.t_tableOid = RelationGetRelid(rel);
1794 
1795  /*
1796  * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1797  * heap_page_prune(), but it's possible that the tuple state changed
1798  * since heap_page_prune() looked. Handle that here by restarting.
1799  * (See comments at the top of function for a full explanation.)
1800  */
1801  res = HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf);
1802 
1803  if (unlikely(res == HEAPTUPLE_DEAD))
1804  goto retry;
1805 
1806  /*
1807  * The criteria for counting a tuple as live in this block need to
1808  * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1809  * and ANALYZE may produce wildly different reltuples values, e.g.
1810  * when there are many recently-dead tuples.
1811  *
1812  * The logic here is a bit simpler than acquire_sample_rows(), as
1813  * VACUUM can't run inside a transaction block, which makes some cases
1814  * impossible (e.g. in-progress insert from the same transaction).
1815  *
1816  * We treat LP_DEAD items (which are the closest thing to DEAD tuples
1817  * that might be seen here) differently, too: we assume that they'll
1818  * become LP_UNUSED before VACUUM finishes. This difference is only
1819  * superficial. VACUUM effectively agrees with ANALYZE about DEAD
1820  * items, in the end. VACUUM won't remember LP_DEAD items, but only
1821  * because they're not supposed to be left behind when it is done.
1822  * (Cases where we bypass index vacuuming will violate this optimistic
1823  * assumption, but the overall impact of that should be negligible.)
1824  */
1825  switch (res)
1826  {
1827  case HEAPTUPLE_LIVE:
1828 
1829  /*
1830  * Count it as live. Not only is this natural, but it's also
1831  * what acquire_sample_rows() does.
1832  */
1833  live_tuples++;
1834 
1835  /*
1836  * Is the tuple definitely visible to all transactions?
1837  *
1838  * NB: Like with per-tuple hint bits, we can't set the
1839  * PD_ALL_VISIBLE flag if the inserter committed
1840  * asynchronously. See SetHintBits for more info. Check that
1841  * the tuple is hinted xmin-committed because of that.
1842  */
1843  if (prunestate->all_visible)
1844  {
1845  TransactionId xmin;
1846 
1847  if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1848  {
1849  prunestate->all_visible = false;
1850  break;
1851  }
1852 
1853  /*
1854  * The inserter definitely committed. But is it old enough
1855  * that everyone sees it as committed?
1856  */
1857  xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1858  if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1859  {
1860  prunestate->all_visible = false;
1861  break;
1862  }
1863 
1864  /* Track newest xmin on page. */
1865  if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid))
1866  prunestate->visibility_cutoff_xid = xmin;
1867  }
1868  break;
1869  case HEAPTUPLE_RECENTLY_DEAD:
1870 
1871  /*
1872  * If tuple is recently deleted then we must not remove it
1873  * from relation. (We only remove items that are LP_DEAD from
1874  * pruning.)
1875  */
1876  new_dead_tuples++;
1877  prunestate->all_visible = false;
1878  break;
1879  case HEAPTUPLE_INSERT_IN_PROGRESS:
1880 
1881  /*
1882  * We do not count these rows as live, because we expect the
1883  * inserting transaction to update the counters at commit, and
1884  * we assume that will happen only after we report our
1885  * results. This assumption is a bit shaky, but it is what
1886  * acquire_sample_rows() does, so be consistent.
1887  */
1888  prunestate->all_visible = false;
1889  break;
1890  case HEAPTUPLE_DELETE_IN_PROGRESS:
1891  /* This is an expected case during concurrent vacuum */
1892  prunestate->all_visible = false;
1893 
1894  /*
1895  * Count such rows as live. As above, we assume the deleting
1896  * transaction will commit and update the counters after we
1897  * report.
1898  */
1899  live_tuples++;
1900  break;
1901  default:
1902  elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1903  break;
1904  }
1905 
1906  /*
1907  * Non-removable tuple (i.e. tuple with storage).
1908  *
1909  * Check whether a tuple left behind after pruning needs to be frozen
1910  * now.
1911  */
1912  num_tuples++;
1913  prunestate->hastup = true;
1914  if (heap_prepare_freeze_tuple(tuple.t_data,
1915  vacrel->relfrozenxid,
1916  vacrel->relminmxid,
1917  vacrel->FreezeLimit,
1918  vacrel->MultiXactCutoff,
1919  &frozen[nfrozen],
1920  &tuple_totally_frozen))
1921  {
1922  /* Will execute freeze below */
1923  frozen[nfrozen++].offset = offnum;
1924  }
1925 
1926  /*
1927  * If tuple is not frozen (and not about to become frozen) then caller
1928  * had better not go on to set this page's VM bit
1929  */
1930  if (!tuple_totally_frozen)
1931  prunestate->all_frozen = false;
1932  }
1933 
1934  /*
1935  * We have now divided every item on the page into either an LP_DEAD item
1936  * that will need to be vacuumed in indexes later, or an LP_NORMAL tuple
1937  * that remains and needs to be considered for freezing now (LP_UNUSED and
1938  * LP_REDIRECT items also remain, but are of no further interest to us).
1939  */
1940  vacrel->offnum = InvalidOffsetNumber;
1941 
1942  /*
1943  * Consider the need to freeze any items with tuple storage from the page
1944  * first (arbitrary)
1945  */
1946  if (nfrozen > 0)
1947  {
1948  Assert(prunestate->hastup);
1949 
1950  /*
1951  * At least one tuple with storage needs to be frozen -- execute that
1952  * now.
1953  *
1954  * If we need to freeze any tuples we'll mark the buffer dirty, and
1955  * write a WAL record recording the changes. We must log the changes
1956  * to be crash-safe against future truncation of CLOG.
1957  */
1958  START_CRIT_SECTION();
1959 
1960  MarkBufferDirty(buf);
1961 
1962  /* execute collected freezes */
1963  for (int i = 0; i < nfrozen; i++)
1964  {
1965  HeapTupleHeader htup;
1966 
1967  itemid = PageGetItemId(page, frozen[i].offset);
1968  htup = (HeapTupleHeader) PageGetItem(page, itemid);
1969 
1970  heap_execute_freeze_tuple(htup, &frozen[i]);
1971  }
1972 
1973  /* Now WAL-log freezing if necessary */
1974  if (RelationNeedsWAL(vacrel->rel))
1975  {
1976  XLogRecPtr recptr;
1977 
1978  recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit,
1979  frozen, nfrozen);
1980  PageSetLSN(page, recptr);
1981  }
1982 
1983  END_CRIT_SECTION();
1984  }
1985 
1986  /*
1987  * The second pass over the heap can also set visibility map bits, using
1988  * the same approach. This is important when the table frequently has a
1989  * few old LP_DEAD items on each page by the time we get to it (typically
1990  * because past opportunistic pruning operations freed some non-HOT
1991  * tuples).
1992  *
1993  * VACUUM will call heap_page_is_all_visible() during the second pass over
1994  * the heap to determine all_visible and all_frozen for the page -- this
1995  * is a specialized version of the logic from this function. Now that
1996  * we've finished pruning and freezing, make sure that we're in total
1997  * agreement with heap_page_is_all_visible() using an assertion.
1998  */
1999 #ifdef USE_ASSERT_CHECKING
2000  /* Note that all_frozen value does not matter when !all_visible */
2001  if (prunestate->all_visible)
2002  {
2003  TransactionId cutoff;
2004  bool all_frozen;
2005 
2006  if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
2007  Assert(false);
2008 
2009  Assert(lpdead_items == 0);
2010  Assert(prunestate->all_frozen == all_frozen);
2011 
2012  /*
2013  * It's possible that we froze tuples and made the page's XID cutoff
2014  * (for recovery conflict purposes) FrozenTransactionId. This is okay
2015  * because visibility_cutoff_xid will be logged by our caller in a
2016  * moment.
2017  */
2018  Assert(cutoff == FrozenTransactionId ||
2019  cutoff == prunestate->visibility_cutoff_xid);
2020  }
2021 #endif
2022 
2023  /*
2024  * Now save details of the LP_DEAD items from the page in vacrel
2025  */
2026  if (lpdead_items > 0)
2027  {
2028  LVDeadItems *dead_items = vacrel->dead_items;
2029  ItemPointerData tmp;
2030 
2031  Assert(!prunestate->all_visible);
2032  Assert(prunestate->has_lpdead_items);
2033 
2034  vacrel->lpdead_item_pages++;
2035 
2036  ItemPointerSetBlockNumber(&tmp, blkno);
2037 
2038  for (int i = 0; i < lpdead_items; i++)
2039  {
2040  ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
2041  dead_items->items[dead_items->num_items++] = tmp;
2042  }
2043 
2044  Assert(dead_items->num_items <= dead_items->max_items);
2046  dead_items->num_items);
2047  }
2048 
2049  /* Finally, add page-local counts to whole-VACUUM counts */
2050  vacrel->tuples_deleted += tuples_deleted;
2051  vacrel->lpdead_items += lpdead_items;
2052  vacrel->new_dead_tuples += new_dead_tuples;
2053  vacrel->num_tuples += num_tuples;
2054  vacrel->live_tuples += live_tuples;
2055 }
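/*
 * Editor's note: an illustrative, standalone sketch (not part of
 * vacuumlazy.c) of how lazy_scan_prune() appends a page's LP_DEAD offsets to
 * the shared TID array. All types and names below are simplified stand-ins,
 * not PostgreSQL definitions; the fixed capacity exists only for the sketch.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_MAX_DEAD_ITEMS 1024	/* stand-in for dead_items->max_items */

typedef struct DemoTid
{
	uint32_t	block;
	uint16_t	offset;
} DemoTid;

typedef struct DemoDeadItems
{
	int			num_items;
	DemoTid		items[DEMO_MAX_DEAD_ITEMS];
} DemoDeadItems;

static void
demo_save_dead_offsets(DemoDeadItems *dead, uint32_t blkno,
					   const uint16_t *deadoffsets, int ndead)
{
	assert(dead->num_items + ndead <= DEMO_MAX_DEAD_ITEMS);

	for (int i = 0; i < ndead; i++)
	{
		/* every saved entry is block number plus LP_DEAD offset, i.e. a TID */
		dead->items[dead->num_items].block = blkno;
		dead->items[dead->num_items].offset = deadoffsets[i];
		dead->num_items++;
	}
}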
2056 
2057 /*
2058  * Remove the collected garbage tuples from the table and its indexes.
2059  *
2060  * We may choose to bypass index vacuuming at this point, though only when the
2061  * ongoing VACUUM operation will definitely only have one index scan/round of
2062  * index vacuuming.
2063  */
2064 static void
2065  lazy_vacuum(LVRelState *vacrel)
2066 {
2067  bool bypass;
2068 
2069  /* Should not end up here with no indexes */
2070  Assert(vacrel->nindexes > 0);
2072  Assert(vacrel->lpdead_item_pages > 0);
2073 
2074  if (!vacrel->do_index_vacuuming)
2075  {
2076  Assert(!vacrel->do_index_cleanup);
2077  vacrel->dead_items->num_items = 0;
2078  return;
2079  }
2080 
2081  /*
2082  * Consider bypassing index vacuuming (and heap vacuuming) entirely.
2083  *
2084  * We currently only do this in cases where the number of LP_DEAD items
2085  * for the entire VACUUM operation is close to zero. This avoids sharp
2086  * discontinuities in the duration and overhead of successive VACUUM
2087  * operations that run against the same table with a fixed workload.
2088  * Ideally, successive VACUUM operations will behave as if there are
2089  * exactly zero LP_DEAD items in cases where there are close to zero.
2090  *
2091  * This is likely to be helpful with a table that is continually affected
2092  * by UPDATEs that can mostly apply the HOT optimization, but occasionally
2093  * have small aberrations that lead to just a few heap pages retaining
2094  * only one or two LP_DEAD items. This is pretty common; even when the
2095  * DBA goes out of their way to make UPDATEs use HOT, it is practically
2096  * impossible to predict whether HOT will be applied in 100% of cases.
2097  * It's far easier to ensure that 99%+ of all UPDATEs against a table use
2098  * HOT through careful tuning.
2099  */
2100  bypass = false;
2101  if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0)
2102  {
2103  BlockNumber threshold;
2104 
2105  Assert(vacrel->num_index_scans == 0);
2106  Assert(vacrel->lpdead_items == vacrel->dead_items->num_items);
2107  Assert(vacrel->do_index_vacuuming);
2108  Assert(vacrel->do_index_cleanup);
2109 
2110  /*
2111  * This crossover point at which we'll start to do index vacuuming is
2112  * expressed as a percentage of the total number of heap pages in the
2113  * table that are known to have at least one LP_DEAD item. This is
2114  * much more important than the total number of LP_DEAD items, since
2115  * it's a proxy for the number of heap pages whose visibility map bits
2116  * cannot be set on account of bypassing index and heap vacuuming.
2117  *
2118  * We apply one further precautionary test: the space currently used
2119  * to store the TIDs (TIDs that now all point to LP_DEAD items) must
2120  * not exceed 32MB. This limits the risk that we will bypass index
2121  * vacuuming again and again until eventually there is a VACUUM whose
2122  * dead_items space is not CPU cache resident.
2123  *
2124  * We don't take any special steps to remember the LP_DEAD items (such
2125  * as counting them in new_dead_tuples report to the stats collector)
2126  * when the optimization is applied. Though the accounting used in
2127  * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD
2128  * items as dead rows in its own stats collector report, that's okay.
2129  * The discrepancy should be negligible. If this optimization is ever
2130  * expanded to cover more cases then this may need to be reconsidered.
2131  */
2132  threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
2133  bypass = (vacrel->lpdead_item_pages < threshold &&
2134  vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L));
2135  }
2136 
2137  if (bypass)
2138  {
2139  /*
2140  * There are almost zero TIDs. Behave as if there were precisely
2141  * zero: bypass index vacuuming, but do index cleanup.
2142  *
2143  * We expect that the ongoing VACUUM operation will finish very
2144  * quickly, so there is no point in considering speeding up as a
2145  * failsafe against wraparound failure. (Index cleanup is expected to
2146  * finish very quickly in cases where there were no ambulkdelete()
2147  * calls.)
2148  */
2149  vacrel->do_index_vacuuming = false;
2150  ereport(elevel,
2151  (errmsg("table \"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers",
2152  vacrel->relname, vacrel->lpdead_item_pages,
2153  100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages,
2154  (long long) vacrel->lpdead_items)));
2155  }
2156  else if (lazy_vacuum_all_indexes(vacrel))
2157  {
2158  /*
2159  * We successfully completed a round of index vacuuming. Do related
2160  * heap vacuuming now.
2161  */
2162  lazy_vacuum_heap_rel(vacrel);
2163  }
2164  else
2165  {
2166  /*
2167  * Failsafe case.
2168  *
2169  * We attempted index vacuuming, but didn't finish a full round/full
2170  * index scan. This happens when relfrozenxid or relminmxid is too
2171  * far in the past.
2172  *
2173  * From this point on the VACUUM operation will do no further index
2174  * vacuuming or heap vacuuming. This VACUUM operation won't end up
2175  * back here again.
2176  */
2177  Assert(vacrel->failsafe_active);
2178  }
2179 
2180  /*
2181  * Forget the LP_DEAD items that we just vacuumed (or just decided to not
2182  * vacuum)
2183  */
2184  vacrel->dead_items->num_items = 0;
2185 }
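/*
 * Editor's note: a standalone restatement of the bypass test above. The 2%
 * page threshold and the roughly-6-bytes-per-TID figure behind
 * MAXDEADITEMS(32MB) are assumptions stated here for illustration; the real
 * code uses BYPASS_THRESHOLD_PAGES and MAXDEADITEMS(), whose definitions are
 * not shown in this excerpt.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
demo_bypass_index_vacuuming(uint32_t rel_pages, uint32_t lpdead_item_pages,
							int64_t lpdead_items)
{
	/* assumed to mirror BYPASS_THRESHOLD_PAGES (2% of heap pages) */
	uint32_t	threshold = (uint32_t) ((double) rel_pages * 0.02);

	/* assumed to approximate MAXDEADITEMS(32MB), with 6-byte ItemPointers */
	int64_t		tid_cap = (32LL * 1024 * 1024) / 6;

	return lpdead_item_pages < threshold && lpdead_items < tid_cap;
}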
2186 
2187 /*
2188  * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
2189  *
2190  * Returns true in the common case when all indexes were successfully
2191  * vacuumed. Returns false in rare cases where we determined that the ongoing
2192  * VACUUM operation is at risk of taking too long to finish, leading to
2193  * wraparound failure.
2194  */
2195 static bool
2196  lazy_vacuum_all_indexes(LVRelState *vacrel)
2197 {
2198  bool allindexes = true;
2199 
2201  Assert(vacrel->nindexes > 0);
2202  Assert(vacrel->do_index_vacuuming);
2203  Assert(vacrel->do_index_cleanup);
2206 
2207  /* Precheck for XID wraparound emergencies */
2208  if (lazy_check_wraparound_failsafe(vacrel))
2209  {
2210  /* Wraparound emergency -- don't even start an index scan */
2211  return false;
2212  }
2213 
2214  /* Report that we are now vacuuming indexes */
2217 
2218  if (!ParallelVacuumIsActive(vacrel))
2219  {
2220  for (int idx = 0; idx < vacrel->nindexes; idx++)
2221  {
2222  Relation indrel = vacrel->indrels[idx];
2223  IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2224 
2225  vacrel->indstats[idx] =
2226  lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
2227  vacrel);
2228 
2229  if (lazy_check_wraparound_failsafe(vacrel))
2230  {
2231  /* Wraparound emergency -- end current index scan */
2232  allindexes = false;
2233  break;
2234  }
2235  }
2236  }
2237  else
2238  {
2239  /* Outsource everything to parallel variant */
2240  do_parallel_lazy_vacuum_all_indexes(vacrel);
2241 
2242  /*
2243  * Do a postcheck to consider applying wraparound failsafe now. Note
2244  * that parallel VACUUM only gets the precheck and this postcheck.
2245  */
2246  if (lazy_check_wraparound_failsafe(vacrel))
2247  allindexes = false;
2248  }
2249 
2250  /*
2251  * We delete all LP_DEAD items from the first heap pass in all indexes on
2252  * each call here (except calls where we choose to do the failsafe). This
2253  * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2254  * of the failsafe triggering, which prevents the next call from taking
2255  * place).
2256  */
2257  Assert(vacrel->num_index_scans > 0 ||
2258  vacrel->dead_items->num_items == vacrel->lpdead_items);
2259  Assert(allindexes || vacrel->failsafe_active);
2260 
2261  /*
2262  * Increase and report the number of index scans.
2263  *
2264  * We deliberately include the case where we started a round of bulk
2265  * deletes that we weren't able to finish due to the failsafe triggering.
2266  */
2267  vacrel->num_index_scans++;
2269  vacrel->num_index_scans);
2270 
2271  return allindexes;
2272 }
2273 
2274 /*
2275  * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2276  *
2277  * This routine marks LP_DEAD items in vacrel->dead_items array as LP_UNUSED.
2278  * Pages that never had lazy_scan_prune record LP_DEAD items are not visited
2279  * at all.
2280  *
2281  * We may also be able to truncate the line pointer array of the heap pages we
2282  * visit. If there is a contiguous group of LP_UNUSED items at the end of the
2283  * array, it can be reclaimed as free space. These LP_UNUSED items usually
2284  * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2285  * each page to LP_UNUSED, and then consider if it's possible to truncate the
2286  * page's line pointer array).
2287  *
2288  * Note: the reason for doing this as a second pass is we cannot remove the
2289  * tuples until we've removed their index entries, and we want to process
2290  * index entry removal in batches as large as possible.
2291  */
2292 static void
2293  lazy_vacuum_heap_rel(LVRelState *vacrel)
2294 {
2295  int index;
2296  BlockNumber vacuumed_pages;
2297  PGRUsage ru0;
2298  Buffer vmbuffer = InvalidBuffer;
2299  LVSavedErrInfo saved_err_info;
2300 
2301  Assert(vacrel->do_index_vacuuming);
2302  Assert(vacrel->do_index_cleanup);
2303  Assert(vacrel->num_index_scans > 0);
2304 
2305  /* Report that we are now vacuuming the heap */
2308 
2309  /* Update error traceback information */
2310  update_vacuum_error_info(vacrel, &saved_err_info,
2313 
2314  pg_rusage_init(&ru0);
2315  vacuumed_pages = 0;
2316 
2317  index = 0;
2318  while (index < vacrel->dead_items->num_items)
2319  {
2320  BlockNumber tblk;
2321  Buffer buf;
2322  Page page;
2323  Size freespace;
2324 
2326 
2327  tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]);
2328  vacrel->blkno = tblk;
2329  buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
2330  vacrel->bstrategy);
2332  index = lazy_vacuum_heap_page(vacrel, tblk, buf, index, &vmbuffer);
2333 
2334  /* Now that we've vacuumed the page, record its available space */
2335  page = BufferGetPage(buf);
2336  freespace = PageGetHeapFreeSpace(page);
2337 
2339  RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
2340  vacuumed_pages++;
2341  }
2342 
2343  /* Clear the block number information */
2344  vacrel->blkno = InvalidBlockNumber;
2345 
2346  if (BufferIsValid(vmbuffer))
2347  {
2348  ReleaseBuffer(vmbuffer);
2349  vmbuffer = InvalidBuffer;
2350  }
2351 
2352  /*
2353  * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2354  * the second heap pass. No more, no less.
2355  */
2356  Assert(index > 0);
2357  Assert(vacrel->num_index_scans > 1 ||
2358  (index == vacrel->lpdead_items &&
2359  vacuumed_pages == vacrel->lpdead_item_pages));
2360 
2361  ereport(elevel,
2362  (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
2363  vacrel->relname, (long long) index, vacuumed_pages),
2364  errdetail_internal("%s", pg_rusage_show(&ru0))));
2365 
2366  /* Revert to the previous phase information for error traceback */
2367  restore_vacuum_error_info(vacrel, &saved_err_info);
2368 }
2369 
2370 /*
2371  * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2372  * vacrel->dead_items array.
2373  *
2374  * Caller must have an exclusive buffer lock on the buffer (though a full
2375  * cleanup lock is also acceptable).
2376  *
2377  * index is an offset into the vacrel->dead_items array for the first listed
2378  * LP_DEAD item on the page. The return value is the first index immediately
2379  * after all LP_DEAD items for the same page in the array.
2380  *
2381  * Prior to PostgreSQL 14 there were rare cases where this routine had to set
2382  * tuples with storage to unused. These days it is strictly responsible for
2383  * marking LP_DEAD stub line pointers as unused. This only happens for those
2384  * LP_DEAD items on the page that were determined to be LP_DEAD items back
2385  * when the same page was visited by lazy_scan_prune() (i.e. those whose TID
2386  * was recorded in the dead_items array at the time).
2387  */
2388 static int
2389  lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2390  int index, Buffer *vmbuffer)
2391 {
2392  LVDeadItems *dead_items = vacrel->dead_items;
2393  Page page = BufferGetPage(buffer);
2394  OffsetNumber unused[MaxHeapTuplesPerPage];
2395  int uncnt = 0;
2396  TransactionId visibility_cutoff_xid;
2397  bool all_frozen;
2398  LVSavedErrInfo saved_err_info;
2399 
2400  Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);
2401 
2403 
2404  /* Update error traceback information */
2405  update_vacuum_error_info(vacrel, &saved_err_info,
2408 
2410 
2411  for (; index < dead_items->num_items; index++)
2412  {
2413  BlockNumber tblk;
2414  OffsetNumber toff;
2415  ItemId itemid;
2416 
2417  tblk = ItemPointerGetBlockNumber(&dead_items->items[index]);
2418  if (tblk != blkno)
2419  break; /* past end of tuples for this block */
2420  toff = ItemPointerGetOffsetNumber(&dead_items->items[index]);
2421  itemid = PageGetItemId(page, toff);
2422 
2423  Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2424  ItemIdSetUnused(itemid);
2425  unused[uncnt++] = toff;
2426  }
2427 
2428  Assert(uncnt > 0);
2429 
2430  /* Attempt to truncate line pointer array now */
2432 
2433  /*
2434  * Mark buffer dirty before we write WAL.
2435  */
2436  MarkBufferDirty(buffer);
2437 
2438  /* XLOG stuff */
2439  if (RelationNeedsWAL(vacrel->rel))
2440  {
2441  xl_heap_vacuum xlrec;
2442  XLogRecPtr recptr;
2443 
2444  xlrec.nunused = uncnt;
2445 
2446  XLogBeginInsert();
2447  XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);
2448 
2449  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2450  XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));
2451 
2452  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);
2453 
2454  PageSetLSN(page, recptr);
2455  }
2456 
2457  /*
2458  * End critical section, so we safely can do visibility tests (which
2459  * possibly need to perform IO and allocate memory!). If we crash now the
2460  * page (including the corresponding vm bit) might not be marked all
2461  * visible, but that's fine. A later vacuum will fix that.
2462  */
2463  END_CRIT_SECTION();
2464 
2465  /*
2466  * Now that we have removed the LP_DEAD items from the page, once again
2467  * check if the page has become all-visible. The page is already marked
2468  * dirty, exclusively locked, and, if needed, a full page image has been
2469  * emitted.
2470  */
2471  if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2472  &all_frozen))
2473  PageSetAllVisible(page);
2474 
2475  /*
2476  * All the changes to the heap page have been done. If the all-visible
2477  * flag is now set, also set the VM all-visible bit (and, if possible, the
2478  * all-frozen bit) unless this has already been done previously.
2479  */
2480  if (PageIsAllVisible(page))
2481  {
2482  uint8 flags = 0;
2483  uint8 vm_status = visibilitymap_get_status(vacrel->rel,
2484  blkno, vmbuffer);
2485 
2486  /* Work out which VM bits to set: all-visible, plus all-frozen if possible */
2487  if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2488  flags |= VISIBILITYMAP_ALL_VISIBLE;
2489  if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2490  flags |= VISIBILITYMAP_ALL_FROZEN;
2491 
2492  Assert(BufferIsValid(*vmbuffer));
2493  if (flags != 0)
2494  visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2495  *vmbuffer, visibility_cutoff_xid, flags);
2496  }
2497 
2498  /* Revert to the previous phase information for error traceback */
2499  restore_vacuum_error_info(vacrel, &saved_err_info);
2500  return index;
2501 }
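/*
 * Editor's note: an illustrative, standalone sketch (not part of
 * vacuumlazy.c) of how lazy_vacuum_heap_page()'s caller advances through the
 * block-ordered TID array one page-slice at a time. Types and names are
 * simplified stand-ins, not PostgreSQL definitions.
 */
#include <stdint.h>

typedef struct DemoDeadTid
{
	uint32_t	block;
	uint16_t	offset;
} DemoDeadTid;

static int
demo_consume_page_slice(const DemoDeadTid *items, int num_items,
						int index, uint32_t blkno)
{
	while (index < num_items && items[index].block == blkno)
	{
		/* the real code marks the LP_DEAD item at items[index].offset unused */
		index++;
	}

	return index;				/* first entry belonging to a later block */
}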
2502 
2503 /*
2504  * lazy_check_needs_freeze() -- scan page to see if any tuples
2505  * need to be cleaned to avoid wraparound
2506  *
2507  * Returns true if the page needs to be vacuumed using cleanup lock.
2508  * Also returns a flag indicating whether page contains any tuples at all.
2509  */
2510 static bool
2511  lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2512 {
2513  Page page = BufferGetPage(buf);
2514  OffsetNumber offnum,
2515  maxoff;
2516  HeapTupleHeader tupleheader;
2517 
2518  *hastup = false;
2519 
2520  /*
2521  * New and empty pages, obviously, don't contain tuples. We could make
2522  * sure that the page is registered in the FSM, but it doesn't seem worth
2523  * waiting for a cleanup lock just for that, especially because it's
2524  * likely that the pin holder will do so.
2525  */
2526  if (PageIsNew(page) || PageIsEmpty(page))
2527  return false;
2528 
2529  maxoff = PageGetMaxOffsetNumber(page);
2530  for (offnum = FirstOffsetNumber;
2531  offnum <= maxoff;
2532  offnum = OffsetNumberNext(offnum))
2533  {
2534  ItemId itemid;
2535 
2536  /*
2537  * Set the offset number so that we can display it along with any
2538  * error that occurred while processing this tuple.
2539  */
2540  vacrel->offnum = offnum;
2541  itemid = PageGetItemId(page, offnum);
2542 
2543  /* this should match hastup test in count_nondeletable_pages() */
2544  if (ItemIdIsUsed(itemid))
2545  *hastup = true;
2546 
2547  /* dead and redirect items never need freezing */
2548  if (!ItemIdIsNormal(itemid))
2549  continue;
2550 
2551  tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2552 
2553  if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2554  vacrel->MultiXactCutoff, buf))
2555  break;
2556  } /* scan along page */
2557 
2558  /* Clear the offset information once we have processed the given page. */
2559  vacrel->offnum = InvalidOffsetNumber;
2560 
2561  return (offnum <= maxoff);
2562 }
2563 
2564 /*
2565  * Trigger the failsafe to avoid wraparound failure when vacrel's table has
2566  * a relfrozenxid and/or relminmxid that is dangerously far in the past.
2567  * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2568  * vacuuming and heap vacuuming. Truncating the heap is also bypassed.
2569  *
2570  * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2571  * up when the failsafe triggers. VACUUM stops applying any cost-based delay
2572  * that it started out with.
2573  *
2574  * Returns true when failsafe has been triggered.
2575  */
2576 static bool
2577  lazy_check_wraparound_failsafe(LVRelState *vacrel)
2578 {
2579  /* Don't warn more than once per VACUUM */
2580  if (vacrel->failsafe_active)
2581  return true;
2582 
2583  if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid,
2584  vacrel->relminmxid)))
2585  {
2586  vacrel->failsafe_active = true;
2587 
2588  /* Disable index vacuuming, index cleanup, and heap rel truncation */
2589  vacrel->do_index_vacuuming = false;
2590  vacrel->do_index_cleanup = false;
2591  vacrel->do_rel_truncate = false;
2592 
2593  ereport(WARNING,
2594  (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2595  get_database_name(MyDatabaseId),
2596  vacrel->relnamespace,
2597  vacrel->relname,
2598  vacrel->num_index_scans),
2599  errdetail("The table's relfrozenxid or relminmxid is too far in the past."),
2600  errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2601  "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2602 
2603  /* Stop applying cost limits from this point on */
2604  VacuumCostActive = false;
2605  VacuumCostBalance = 0;
2606 
2607  return true;
2608  }
2609 
2610  return false;
2611 }
2612 
2613 /*
2614  * Perform lazy_vacuum_all_indexes() steps in parallel
2615  */
2616 static void
2617  do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2618 {
2619  /* Tell parallel workers to do index vacuuming */
2620  vacrel->lps->lvshared->for_cleanup = false;
2621  vacrel->lps->lvshared->first_time = false;
2622 
2623  /*
2624  * We can only provide an approximate value of num_heap_tuples, at least
2625  * for now. Matches serial VACUUM case.
2626  */
2627  vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2628  vacrel->lps->lvshared->estimated_count = true;
2629 
2630  do_parallel_vacuum_or_cleanup(vacrel,
2631  vacrel->lps->nindexes_parallel_bulkdel);
2632 }
2633 
2634 /*
2635  * Perform lazy_cleanup_all_indexes() steps in parallel
2636  */
2637 static void
2638  do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2639 {
2640  int nworkers;
2641 
2642  /*
2643  * If parallel vacuum is active we perform index cleanup with parallel
2644  * workers.
2645  *
2646  * Tell parallel workers to do index cleanup.
2647  */
2648  vacrel->lps->lvshared->for_cleanup = true;
2649  vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2650 
2651  /*
2652  * Now we can provide a better estimate of total number of surviving
2653  * tuples (we assume indexes are more interested in that than in the
2654  * number of nominally live tuples).
2655  */
2656  vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2657  vacrel->lps->lvshared->estimated_count =
2658  (vacrel->tupcount_pages < vacrel->rel_pages);
2659 
2660  /* Determine the number of parallel workers to launch */
2661  if (vacrel->lps->lvshared->first_time)
2662  nworkers = vacrel->lps->nindexes_parallel_cleanup +
2663  vacrel->lps->nindexes_parallel_condcleanup;
2664  else
2665  nworkers = vacrel->lps->nindexes_parallel_cleanup;
2666 
2667  do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2668 }
2669 
2670 /*
2671  * Perform index vacuum or index cleanup with parallel workers. This function
2672  * must be used by the parallel vacuum leader process. The caller must set
2673  * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2674  * cleanup.
2675  */
2676 static void
2677  do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2678 {
2679  LVParallelState *lps = vacrel->lps;
2680 
2682  Assert(ParallelVacuumIsActive(vacrel));
2683  Assert(vacrel->nindexes > 0);
2684 
2685  /* The leader process will participate */
2686  nworkers--;
2687 
2688  /*
2689  * It is possible that parallel context is initialized with fewer workers
2690  * than the number of indexes that need a separate worker in the current
2691  * phase, so we need to consider it. See compute_parallel_vacuum_workers.
2692  */
2693  nworkers = Min(nworkers, lps->pcxt->nworkers);
2694 
2695  /* Setup the shared cost-based vacuum delay and launch workers */
2696  if (nworkers > 0)
2697  {
2698  if (vacrel->num_index_scans > 0)
2699  {
2700  /* Reset the parallel index processing counter */
2701  pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2702 
2703  /* Reinitialize the parallel context to relaunch parallel workers */
2705  }
2706 
2707  /*
2708  * Set up shared cost balance and the number of active workers for
2709  * vacuum delay. We need to do this before launching workers as
2710  * otherwise, they might not see the updated values for these
2711  * parameters.
2712  */
2715 
2716  /*
2717  * The number of workers can vary between bulkdelete and cleanup
2718  * phase.
2719  */
2720  ReinitializeParallelWorkers(lps->pcxt, nworkers);
2721 
2723 
2724  if (lps->pcxt->nworkers_launched > 0)
2725  {
2726  /*
2727  * Reset the local cost values for leader backend as we have
2728  * already accumulated the remaining balance of heap.
2729  */
2730  VacuumCostBalance = 0;
2732 
2733  /* Enable shared cost balance for leader backend */
2736  }
2737 
2738  if (lps->lvshared->for_cleanup)
2739  ereport(elevel,
2740  (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2741  "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2742  lps->pcxt->nworkers_launched),
2743  lps->pcxt->nworkers_launched, nworkers)));
2744  else
2745  ereport(elevel,
2746  (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2747  "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2748  lps->pcxt->nworkers_launched),
2749  lps->pcxt->nworkers_launched, nworkers)));
2750  }
2751 
2752  /* Process the indexes that can be processed by only leader process */
2754 
2755  /*
2756  * Join as a parallel worker. The leader process alone processes all the
2757  * indexes in the case where no workers are launched.
2758  */
2759  do_parallel_processing(vacrel, lps->lvshared);
2760 
2761  /*
2762  * Next, accumulate buffer and WAL usage. (This must wait for the workers
2763  * to finish, or we might get incomplete data.)
2764  */
2765  if (nworkers > 0)
2766  {
2767  /* Wait for all vacuum workers to finish */
2769 
2770  for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2772  }
2773 
2774  /*
2775  * Carry the shared balance value to heap scan and disable shared costing
2776  */
2778  {
2780  VacuumSharedCostBalance = NULL;
2781  VacuumActiveNWorkers = NULL;
2782  }
2783 }
2784 
2785 /*
2786  * Index vacuum/cleanup routine used by the leader process and parallel
2787  * vacuum worker processes to process the indexes in parallel.
2788  */
2789 static void
2790  do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2791 {
2792  /*
2793  * Increment the active worker count if we are able to launch any worker.
2794  */
2797 
2798  /* Loop until all indexes are vacuumed */
2799  for (;;)
2800  {
2801  int idx;
2802  LVSharedIndStats *shared_istat;
2803  Relation indrel;
2804  IndexBulkDeleteResult *istat;
2805 
2806  /* Get an index number to process */
2807  idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2808 
2809  /* Done for all indexes? */
2810  if (idx >= vacrel->nindexes)
2811  break;
2812 
2813  /* Get the index statistics space from DSM, if any */
2814  shared_istat = parallel_stats_for_idx(lvshared, idx);
2815 
2816  /* Skip indexes not participating in parallelism */
2817  if (shared_istat == NULL)
2818  continue;
2819 
2820  indrel = vacrel->indrels[idx];
2821 
2822  /*
2823  * Skip processing indexes that are unsafe for workers (these are
2824  * processed in do_serial_processing_for_unsafe_indexes() by leader)
2825  */
2826  if (!parallel_processing_is_safe(indrel, lvshared))
2827  continue;
2828 
2829  /* Do vacuum or cleanup of the index */
2830  istat = vacrel->indstats[idx];
2831  vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2832  lvshared,
2833  shared_istat,
2834  vacrel);
2835  }
2836 
2837  /*
2838  * We have completed the index vacuum so decrement the active worker
2839  * count.
2840  */
2843 }
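/*
 * Editor's note: the "claim the next index with an atomic fetch-add" pattern
 * used above via pg_atomic_fetch_add_u32, restated with C11 atomics so it
 * stands alone. Names are illustrative; this is a sketch of the pattern, not
 * PostgreSQL code.
 */
#include <stdatomic.h>

typedef struct DemoShared
{
	atomic_uint	next_idx;		/* analogous to lvshared->idx */
} DemoShared;

/* Each worker (and the leader) keeps calling this until it returns -1. */
static int
demo_claim_next_index(DemoShared *shared, int nindexes)
{
	unsigned int idx = atomic_fetch_add(&shared->next_idx, 1);

	return (idx < (unsigned int) nindexes) ? (int) idx : -1;
}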
2844 
2845 /*
2846  * Perform parallel processing of indexes in leader process.
2847  *
2848  * Handles index vacuuming (or index cleanup) for indexes that are not
2849  * parallel safe. It's possible that this will vary for a given index, based
2850  * on details like whether we're performing for_cleanup processing right now.
2851  *
2852  * Also performs processing of smaller indexes that fell under the size cutoff
2853  * enforced by compute_parallel_vacuum_workers(). These indexes never get a
2854  * slot for statistics in DSM.
2855  */
2856 static void
2857  do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2858 {
2860 
2861  /*
2862  * Increment the active worker count if we are able to launch any worker.
2863  */
2866 
2867  for (int idx = 0; idx < vacrel->nindexes; idx++)
2868  {
2869  LVSharedIndStats *shared_istat;
2870  Relation indrel;
2871  IndexBulkDeleteResult *istat;
2872 
2873  shared_istat = parallel_stats_for_idx(lvshared, idx);
2874  indrel = vacrel->indrels[idx];
2875 
2876  /*
2877  * We're only here for the indexes that parallel workers won't
2878  * process. Note that the shared_istat test ensures that we process
2879  * indexes that fell under initial size cutoff.
2880  */
2881  if (shared_istat != NULL &&
2882  parallel_processing_is_safe(indrel, lvshared))
2883  continue;
2884 
2885  /* Do vacuum or cleanup of the index */
2886  istat = vacrel->indstats[idx];
2887  vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2888  lvshared,
2889  shared_istat,
2890  vacrel);
2891  }
2892 
2893  /*
2894  * We have completed the index vacuum so decrement the active worker
2895  * count.
2896  */
2899 }
2900 
2901 /*
2902  * Vacuum or cleanup index either by leader process or by one of the worker
2903  * process. After processing the index this function copies the index
2904  * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2905  * segment.
2906  */
2907 static IndexBulkDeleteResult *
2908  parallel_process_one_index(Relation indrel,
2909  IndexBulkDeleteResult *istat,
2910  LVShared *lvshared,
2911  LVSharedIndStats *shared_istat,
2912  LVRelState *vacrel)
2913 {
2914  IndexBulkDeleteResult *istat_res;
2915 
2916  /*
2917  * Update the pointer to the corresponding bulk-deletion result if someone
2918  * has already updated it
2919  */
2920  if (shared_istat && shared_istat->updated && istat == NULL)
2921  istat = &shared_istat->istat;
2922 
2923  /* Do vacuum or cleanup of the index */
2924  if (lvshared->for_cleanup)
2925  istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2926  lvshared->estimated_count, vacrel);
2927  else
2928  istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2929  vacrel);
2930 
2931  /*
2932  * Copy the index bulk-deletion result returned from ambulkdelete and
2933  * amvacuumcleanup to the DSM segment if it's the first cycle because they
2934  * allocate locally and it's possible that an index will be vacuumed by a
2935  * different vacuum process the next cycle. Copying the result normally
2936  * happens only the first time an index is vacuumed. For any additional
2937  * vacuum pass, we directly point to the result on the DSM segment and
2938  * pass it to vacuum index APIs so that workers can update it directly.
2939  *
2940  * Since all vacuum workers write the bulk-deletion result at different
2941  * slots we can write them without locking.
2942  */
2943  if (shared_istat && !shared_istat->updated && istat_res != NULL)
2944  {
2945  memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2946  shared_istat->updated = true;
2947 
2948  /* Free the locally-allocated bulk-deletion result */
2949  pfree(istat_res);
2950 
2951  /* return the pointer to the result from shared memory */
2952  return &shared_istat->istat;
2953  }
2954 
2955  return istat_res;
2956 }
2957 
2958 /*
2959  * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2960  */
2961 static void
2962  lazy_cleanup_all_indexes(LVRelState *vacrel)
2963 {
2965  Assert(vacrel->nindexes > 0);
2966 
2967  /* Report that we are now cleaning up indexes */
2970 
2971  if (!ParallelVacuumIsActive(vacrel))
2972  {
2973  double reltuples = vacrel->new_rel_tuples;
2974  bool estimated_count =
2975  vacrel->tupcount_pages < vacrel->rel_pages;
2976 
2977  for (int idx = 0; idx < vacrel->nindexes; idx++)
2978  {
2979  Relation indrel = vacrel->indrels[idx];
2980  IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2981 
2982  vacrel->indstats[idx] =
2983  lazy_cleanup_one_index(indrel, istat, reltuples,
2984  estimated_count, vacrel);
2985  }
2986  }
2987  else
2988  {
2989  /* Outsource everything to parallel variant */
2990  do_parallel_lazy_cleanup_all_indexes(vacrel);
2991 
2992 }
2993 
2994 /*
2995  * lazy_vacuum_one_index() -- vacuum index relation.
2996  *
2997  * Delete all the index tuples containing a TID collected in
2998  * vacrel->dead_items array. Also update running statistics.
2999  * Exact details depend on index AM's ambulkdelete routine.
3000  *
3001  * reltuples is the number of heap tuples to be passed to the
3002  * bulkdelete callback. It's always assumed to be estimated.
3003  * See indexam.sgml for more info.
3004  *
3005  * Returns bulk delete stats derived from input stats
3006  */
3007 static IndexBulkDeleteResult *
3008  lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3009  double reltuples, LVRelState *vacrel)
3010 {
3011  IndexVacuumInfo ivinfo;
3012  PGRUsage ru0;
3013  LVSavedErrInfo saved_err_info;
3014 
3015  pg_rusage_init(&ru0);
3016 
3017  ivinfo.index = indrel;
3018  ivinfo.analyze_only = false;
3019  ivinfo.report_progress = false;
3020  ivinfo.estimated_count = true;
3021  ivinfo.message_level = elevel;
3022  ivinfo.num_heap_tuples = reltuples;
3023  ivinfo.strategy = vacrel->bstrategy;
3024 
3025  /*
3026  * Update error traceback information.
3027  *
3028  * The index name is saved during this phase and restored immediately
3029  * after this phase. See vacuum_error_callback.
3030  */
3031  Assert(vacrel->indname == NULL);
3032  vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3033  update_vacuum_error_info(vacrel, &saved_err_info,
3036 
3037  /* Do bulk deletion */
3038  istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
3039  (void *) vacrel->dead_items);
3040 
3041  ereport(elevel,
3042  (errmsg("scanned index \"%s\" to remove %d row versions",
3043  vacrel->indname, vacrel->dead_items->num_items),
3044  errdetail_internal("%s", pg_rusage_show(&ru0))));
3045 
3046  /* Revert to the previous phase information for error traceback */
3047  restore_vacuum_error_info(vacrel, &saved_err_info);
3048  pfree(vacrel->indname);
3049  vacrel->indname = NULL;
3050 
3051  return istat;
3052 }
3053 
3054 /*
3055  * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
3056  *
3057  * Calls index AM's amvacuumcleanup routine. reltuples is the number
3058  * of heap tuples and estimated_count is true if reltuples is an
3059  * estimated value. See indexam.sgml for more info.
3060  *
3061  * Returns bulk delete stats derived from input stats
3062  */
3063 static IndexBulkDeleteResult *
3064  lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3065  double reltuples, bool estimated_count,
3066  LVRelState *vacrel)
3067 {
3068  IndexVacuumInfo ivinfo;
3069  PGRUsage ru0;
3070  LVSavedErrInfo saved_err_info;
3071 
3072  pg_rusage_init(&ru0);
3073 
3074  ivinfo.index = indrel;
3075  ivinfo.analyze_only = false;
3076  ivinfo.report_progress = false;
3077  ivinfo.estimated_count = estimated_count;
3078  ivinfo.message_level = elevel;
3079 
3080  ivinfo.num_heap_tuples = reltuples;
3081  ivinfo.strategy = vacrel->bstrategy;
3082 
3083  /*
3084  * Update error traceback information.
3085  *
3086  * The index name is saved during this phase and restored immediately
3087  * after this phase. See vacuum_error_callback.
3088  */
3089  Assert(vacrel->indname == NULL);
3090  vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3091  update_vacuum_error_info(vacrel, &saved_err_info,
3094 
3095  istat = index_vacuum_cleanup(&ivinfo, istat);
3096 
3097  if (istat)
3098  {
3099  ereport(elevel,
3100  (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3101  RelationGetRelationName(indrel),
3102  istat->num_index_tuples,
3103  istat->num_pages),
3104  errdetail("%.0f index row versions were removed.\n"
3105  "%u index pages were newly deleted.\n"
3106  "%u index pages are currently deleted, of which %u are currently reusable.\n"
3107  "%s.",
3108  istat->tuples_removed,
3109  istat->pages_newly_deleted,
3110  istat->pages_deleted, istat->pages_free,
3111  pg_rusage_show(&ru0))));
3112  }
3113 
3114  /* Revert to the previous phase information for error traceback */
3115  restore_vacuum_error_info(vacrel, &saved_err_info);
3116  pfree(vacrel->indname);
3117  vacrel->indname = NULL;
3118 
3119  return istat;
3120 }
3121 
3122 /*
3123  * should_attempt_truncation - should we attempt to truncate the heap?
3124  *
3125  * Don't even think about it unless we have a shot at releasing a goodly
3126  * number of pages. Otherwise, the time taken isn't worth it.
3127  *
3128  * Also don't attempt it if the wraparound failsafe is in effect. It's hard
3129  * to
3129  * predict how long lazy_truncate_heap will take. Don't take any chances.
3130  * There is very little chance of truncation working out when the failsafe is
3131  * in effect in any case. lazy_scan_prune makes the optimistic assumption
3132  * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
3133  * we're called.
3134  *
3135  * Also don't attempt it if we are doing early pruning/vacuuming, because a
3136  * scan which cannot find a truncated heap page cannot determine that the
3137  * snapshot is too old to read that page.
3138  *
3139  * This is split out so that we can test whether truncation is going to be
3140  * called for before we actually do it. If you change the logic here, be
3141  * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
3142  */
3143 static bool
3144  should_attempt_truncation(LVRelState *vacrel)
3145 {
3146  BlockNumber possibly_freeable;
3147 
3148  if (!vacrel->do_rel_truncate || vacrel->failsafe_active)
3149  return false;
3150 
3151  possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
3152  if (possibly_freeable > 0 &&
3153  (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
3154  possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
3156  return true;
3157  else
3158  return false;
3159 }
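/*
 * Editor's note: a standalone restatement of the "goodly number of pages"
 * test above, plugging in the REL_TRUNCATE_MINIMUM (1000) and
 * REL_TRUNCATE_FRACTION (16) values defined near the top of this file. For a
 * 64,000-page table at least 1000 potentially-freeable trailing pages are
 * required; for an 8,000-page table, 8000 / 16 = 500 suffice. The failsafe
 * and early-pruning guards in the real function are omitted here.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
demo_should_attempt_truncation(uint32_t rel_pages, uint32_t nonempty_pages)
{
	uint32_t	possibly_freeable = rel_pages - nonempty_pages;

	return possibly_freeable > 0 &&
		(possibly_freeable >= 1000 ||			/* REL_TRUNCATE_MINIMUM */
		 possibly_freeable >= rel_pages / 16);	/* REL_TRUNCATE_FRACTION */
}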
3160 
3161 /*
3162  * lazy_truncate_heap - try to truncate off any empty pages at the end
3163  */
3164 static void
3165  lazy_truncate_heap(LVRelState *vacrel)
3166 {
3167  BlockNumber orig_rel_pages = vacrel->rel_pages;
3168  BlockNumber new_rel_pages;
3169  bool lock_waiter_detected;
3170  int lock_retry;
3171 
3172  /* Report that we are now truncating */
3175 
3176  /*
3177  * Loop until no more truncating can be done.
3178  */
3179  do
3180  {
3181  PGRUsage ru0;
3182 
3183  pg_rusage_init(&ru0);
3184 
3185  /*
3186  * We need full exclusive lock on the relation in order to do
3187  * truncation. If we can't get it, give up rather than waiting --- we
3188  * don't want to block other backends, and we don't want to deadlock
3189  * (which is quite possible considering we already hold a lower-grade
3190  * lock).
3191  */
3192  lock_waiter_detected = false;
3193  lock_retry = 0;
3194  while (true)
3195  {
3197  break;
3198 
3199  /*
3200  * Check for interrupts while trying to (re-)acquire the exclusive
3201  * lock.
3202  */
3204 
3205  if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
3207  {
3208  /*
3209  * We failed to establish the lock in the specified number of
3210  * retries. This means we give up truncating.
3211  */
3212  ereport(elevel,
3213  (errmsg("\"%s\": stopping truncate due to conflicting lock request",
3214  vacrel->relname)));
3215  return;
3216  }
3217 
3218  (void) WaitLatch(MyLatch,
3223  }
3224 
3225  /*
3226  * Now that we have exclusive lock, look to see if the rel has grown
3227  * whilst we were vacuuming with non-exclusive lock. If so, give up;
3228  * the newly added pages presumably contain non-deletable tuples.
3229  */
3230  new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
3231  if (new_rel_pages != orig_rel_pages)
3232  {
3233  /*
3234  * Note: we intentionally don't update vacrel->rel_pages with the
3235  * new rel size here. If we did, it would amount to assuming that
3236  * the new pages are empty, which is unlikely. Leaving the numbers
3237  * alone amounts to assuming that the new pages have the same
3238  * tuple density as existing ones, which is less unlikely.
3239  */
3241  return;
3242  }
3243 
3244  /*
3245  * Scan backwards from the end to verify that the end pages actually
3246  * contain no tuples. This is *necessary*, not optional, because
3247  * other backends could have added tuples to these pages whilst we
3248  * were vacuuming.
3249  */
3250  new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected);
3251  vacrel->blkno = new_rel_pages;
3252 
3253  if (new_rel_pages >= orig_rel_pages)
3254  {
3255  /* can't do anything after all */
3257  return;
3258  }
3259 
3260  /*
3261  * Okay to truncate.
3262  */
3263  RelationTruncate(vacrel->rel, new_rel_pages);
3264 
3265  /*
3266  * We can release the exclusive lock as soon as we have truncated.
3267  * Other backends can't safely access the relation until they have
3268  * processed the smgr invalidation that smgrtruncate sent out ... but
3269  * that should happen as part of standard invalidation processing once
3270  * they acquire lock on the relation.
3271  */
3273 
3274  /*
3275  * Update statistics. Here, it *is* correct to adjust rel_pages
3276  * without also touching reltuples, since the tuple count wasn't
3277  * changed by the truncation.
3278  */
3279  vacrel->pages_removed += orig_rel_pages - new_rel_pages;
3280  vacrel->rel_pages = new_rel_pages;
3281 
3282  ereport(elevel,
3283  (errmsg("table \"%s\": truncated %u to %u pages",
3284  vacrel->relname,
3285  orig_rel_pages, new_rel_pages),
3286  errdetail_internal("%s",
3287  pg_rusage_show(&ru0))));
3288  orig_rel_pages = new_rel_pages;
3289  } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected);
3290 }
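/*
 * Editor's note: with the timing parameters defined near the top of this
 * file (a 50 ms wait interval and a 5000 ms timeout), the lock-retry loop
 * above gives up after at most 5000 / 50 = 100 attempts to acquire the
 * exclusive lock. A standalone restatement of that arithmetic:
 */
enum
{
	DEMO_TRUNCATE_LOCK_WAIT_INTERVAL = 50,	/* ms slept per retry */
	DEMO_TRUNCATE_LOCK_TIMEOUT = 5000,		/* ms of total retry budget */
	DEMO_TRUNCATE_LOCK_MAX_RETRIES =
		DEMO_TRUNCATE_LOCK_TIMEOUT / DEMO_TRUNCATE_LOCK_WAIT_INTERVAL	/* 100 */
};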
3291 
3292 /*
3293  * Rescan end pages to verify that they are (still) empty of tuples.
3294  *
3295  * Returns number of nondeletable pages (last nonempty page + 1).
3296  */
3297 static BlockNumber
3298 count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
3299 {
3300  BlockNumber blkno;
3301  BlockNumber prefetchedUntil;
3302  instr_time starttime;
3303 
3304  /* Initialize the starttime if we check for conflicting lock requests */
3305  INSTR_TIME_SET_CURRENT(starttime);
3306 
3307  /*
3308  * Start checking blocks at what we believe relation end to be and move
3309  * backwards. (Strange coding of loop control is needed because blkno is
3310  * unsigned.) To make the scan faster, we prefetch a few blocks at a time
3311  * in forward direction, so that OS-level readahead can kick in.
3312  */
3313  blkno = vacrel->rel_pages;
3315  "prefetch size must be power of 2");
3316  prefetchedUntil = InvalidBlockNumber;
3317  while (blkno > vacrel->nonempty_pages)
3318  {
3319  Buffer buf;
3320  Page page;
3321  OffsetNumber offnum,
3322  maxoff;
3323  bool hastup;
3324 
3325  /*
3326  * Check if another process requests a lock on our relation. We are
3327  * holding an AccessExclusiveLock here, so they will be waiting. We
3328  * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
3329  * only check if that interval has elapsed once every 32 blocks to
3330  * keep the number of system calls and actual shared lock table
3331  * lookups to a minimum.
3332  */
3333  if ((blkno % 32) == 0)
3334  {
3335  instr_time currenttime;
3336  instr_time elapsed;
3337 
3338  INSTR_TIME_SET_CURRENT(currenttime);
3339  elapsed = currenttime;
3340  INSTR_TIME_SUBTRACT(elapsed, starttime);
3341  if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
3343  {
3345  {
3346  ereport(elevel,
3347  (errmsg("table \"%s\": suspending truncate due to conflicting lock request",
3348  vacrel->relname)));
3349 
3350  *lock_waiter_detected = true;
3351  return blkno;
3352  }
3353  starttime = currenttime;
3354  }
3355  }
3356 
3357  /*
3358  * We don't insert a vacuum delay point here, because we have an
3359  * exclusive lock on the table which we want to hold for as short a
3360  * time as possible. We still need to check for interrupts however.
3361  */
3363 
3364  blkno--;
3365 
3366  /* If we haven't prefetched this lot yet, do so now. */
3367  if (prefetchedUntil > blkno)
3368  {
3369  BlockNumber prefetchStart;
3370  BlockNumber pblkno;
3371 
3372  prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
3373  for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
3374  {
3375  PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
3377  }
3378  prefetchedUntil = prefetchStart;
3379  }
3380 
3381  buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
3382  vacrel->bstrategy);
3383 
3384  /* In this phase we only need shared access to the buffer */
 3385  LockBuffer(buf, BUFFER_LOCK_SHARE);
 3386 
3387  page = BufferGetPage(buf);
3388 
3389  if (PageIsNew(page) || PageIsEmpty(page))
3390  {
 3391  UnlockReleaseBuffer(buf);
 3392  continue;
3393  }
3394 
3395  hastup = false;
3396  maxoff = PageGetMaxOffsetNumber(page);
3397  for (offnum = FirstOffsetNumber;
3398  offnum <= maxoff;
3399  offnum = OffsetNumberNext(offnum))
3400  {
3401  ItemId itemid;
3402 
3403  itemid = PageGetItemId(page, offnum);
3404 
3405  /*
3406  * Note: any non-unused item should be taken as a reason to keep
3407  * this page. Even an LP_DEAD item makes truncation unsafe, since
3408  * we must not have cleaned out its index entries.
3409  */
3410  if (ItemIdIsUsed(itemid))
3411  {
3412  hastup = true;
3413  break; /* can stop scanning */
3414  }
3415  } /* scan along page */
3416 
 3417  UnlockReleaseBuffer(buf);
 3418 
3419  /* Done scanning if we found a tuple here */
3420  if (hastup)
3421  return blkno + 1;
3422  }
3423 
3424  /*
3425  * If we fall out of the loop, all the previously-thought-to-be-empty
3426  * pages still are; we need not bother to look at the last known-nonempty
3427  * page.
3428  */
3429  return vacrel->nonempty_pages;
3430 }
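/*
 * Illustrative sketch, not part of vacuumlazy.c: it mimics the backwards scan
 * in count_nondeletable_pages() with a hypothetical 8-block prefetch chunk and
 * prints which forward ranges would be prefetched.  SKETCH_PREFETCH_SIZE
 * stands in for PREFETCH_SIZE and must be a power of 2 for the mask trick.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PREFETCH_SIZE 8

int
main(void)
{
	uint32_t	rel_pages = 20;
	uint32_t	nonempty_pages = 3;
	uint32_t	prefetched_until = UINT32_MAX;	/* nothing prefetched yet */
	uint32_t	blkno = rel_pages;

	while (blkno > nonempty_pages)
	{
		blkno--;

		if (prefetched_until > blkno)
		{
			/* round down to the chunk boundary, then read forward */
			uint32_t	start = blkno & ~(uint32_t) (SKETCH_PREFETCH_SIZE - 1);

			printf("prefetch blocks %u..%u, then inspect %u downwards\n",
				   start, blkno, blkno);
			prefetched_until = start;
		}
	}
	return 0;
}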
3431 
3432 /*
3433  * Returns the number of dead TIDs that VACUUM should allocate space to
3434  * store, given a heap rel of size vacrel->rel_pages, and given current
3435  * maintenance_work_mem setting (or current autovacuum_work_mem setting,
3436  * when applicable).
3437  *
3438  * See the comments at the head of this file for rationale.
3439  */
3440 static int
 3441 dead_items_max_items(LVRelState *vacrel)
 3442 {
3443  int64 max_items;
3444  int vac_work_mem = IsAutoVacuumWorkerProcess() &&
3445  autovacuum_work_mem != -1 ?
 3446  autovacuum_work_mem : maintenance_work_mem;
 3447 
 3448  Assert(!IsParallelWorker());
 3449 
3450  if (vacrel->nindexes > 0)
3451  {
3452  BlockNumber rel_pages = vacrel->rel_pages;
3453 
3454  max_items = MAXDEADITEMS(vac_work_mem * 1024L);
3455  max_items = Min(max_items, INT_MAX);
3456  max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize));
3457 
3458  /* curious coding here to ensure the multiplication can't overflow */
3459  if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages)
3460  max_items = rel_pages * MaxHeapTuplesPerPage;
3461 
3462  /* stay sane if small maintenance_work_mem */
3463  max_items = Max(max_items, MaxHeapTuplesPerPage);
3464  }
3465  else
3466  {
3467  /* One-pass case only stores a single heap page's TIDs at a time */
3468  max_items = MaxHeapTuplesPerPage;
3469  }
3470 
3471  return (int) max_items;
3472 }
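/*
 * Illustrative sketch, not part of vacuumlazy.c: the same sizing arithmetic
 * as above, reduced to a standalone program with hypothetical constants.  It
 * ignores the small struct header and the INT_MAX/MaxAllocSize clamps so the
 * two interesting bounds (memory budget and table size) are easy to see.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_ITEMPTR_SIZE 6			/* typical sizeof(ItemPointerData) */
#define SKETCH_MAX_TUPLES_PER_PAGE 291	/* typical MaxHeapTuplesPerPage, 8kB pages */

/* How many dead TIDs fit in work_mem kilobytes, capped by table size? */
static int64_t
sketch_max_dead_items(int64_t work_mem_kb, int64_t rel_pages)
{
	int64_t		max_items = (work_mem_kb * 1024) / SKETCH_ITEMPTR_SIZE;

	/* never track more TIDs than the heap could possibly hold */
	if (max_items / SKETCH_MAX_TUPLES_PER_PAGE > rel_pages)
		max_items = rel_pages * SKETCH_MAX_TUPLES_PER_PAGE;

	/* stay sane with a tiny memory setting: always fit one full page */
	if (max_items < SKETCH_MAX_TUPLES_PER_PAGE)
		max_items = SKETCH_MAX_TUPLES_PER_PAGE;

	return max_items;
}

int
main(void)
{
	/* e.g. a 64MB memory budget and a 10,000-page table */
	printf("%lld\n", (long long) sketch_max_dead_items(64 * 1024, 10000));
	return 0;
}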
3473 
3474 /*
3475  * Returns the total required space for VACUUM's dead_items array given a
3476  * max_items value returned by dead_items_max_items
3477  */
3478 static inline Size
 3479 max_items_to_alloc_size(int max_items)
 3480 {
3481  Assert(max_items >= MaxHeapTuplesPerPage);
3482  Assert(max_items <= MAXDEADITEMS(MaxAllocSize));
3483 
3484  return offsetof(LVDeadItems, items) + sizeof(ItemPointerData) * max_items;
3485 }
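/*
 * Illustrative sketch, not part of vacuumlazy.c: the offsetof()-based sizing
 * used above, applied to a stand-in struct with a flexible array member and
 * allocated with malloc() instead of palloc().
 */
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

typedef struct SketchTidPointer
{
	unsigned int block;
	unsigned short offset;
} SketchTidPointer;

typedef struct SketchDeadItems
{
	int			max_items;
	int			num_items;
	SketchTidPointer items[];	/* flexible array member */
} SketchDeadItems;

int
main(void)
{
	int			max_items = 1000;
	size_t		sz = offsetof(SketchDeadItems, items) +
		sizeof(SketchTidPointer) * max_items;
	SketchDeadItems *dead = malloc(sz);

	if (dead == NULL)
		return 1;
	dead->max_items = max_items;
	dead->num_items = 0;
	printf("allocated %zu bytes for %d TIDs\n", sz, max_items);
	free(dead);
	return 0;
}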
3486 
3487 /*
3488  * Allocate dead_items (either using palloc, or in dynamic shared memory).
3489  * Sets dead_items in vacrel for caller.
3490  *
3491  * Also handles parallel initialization as part of allocating dead_items in
3492  * DSM when required.
3493  */
3494 static void
3495 dead_items_alloc(LVRelState *vacrel, int nworkers)
3496 {
3497  LVDeadItems *dead_items;
3498  int max_items;
3499 
3500  /*
3501  * Initialize state for a parallel vacuum. As of now, only one worker can
3502  * be used for an index, so we invoke parallelism only if there are at
3503  * least two indexes on a table.
3504  */
3505  if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
3506  {
3507  /*
3508  * Since parallel workers cannot access data in temporary tables, we
3509  * can't perform parallel vacuum on them.
3510  */
3511  if (RelationUsesLocalBuffers(vacrel->rel))
3512  {
3513  /*
3514  * Give warning only if the user explicitly tries to perform a
3515  * parallel vacuum on the temporary table.
3516  */
3517  if (nworkers > 0)
3518  ereport(WARNING,
3519  (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3520  vacrel->relname)));
3521  }
3522  else
3523  begin_parallel_vacuum(vacrel, nworkers);
3524 
3525  /* If parallel mode started, vacrel->dead_items allocated in DSM */
3526  if (ParallelVacuumIsActive(vacrel))
3527  return;
3528  }
3529 
3530  /* Serial VACUUM case */
3531  max_items = dead_items_max_items(vacrel);
3532  dead_items = (LVDeadItems *) palloc(max_items_to_alloc_size(max_items));
3533  dead_items->max_items = max_items;
3534  dead_items->num_items = 0;
3535 
3536  vacrel->dead_items = dead_items;
3537 }
3538 
3539 /*
3540  * Perform cleanup for resources allocated in dead_items_alloc
3541  */
3542 static void
 3543 dead_items_cleanup(LVRelState *vacrel)
 3544 {
3545  if (!ParallelVacuumIsActive(vacrel))
3546  {
3547  /* Don't bother with pfree here */
3548  return;
3549  }
3550 
3551  /*
3552  * End parallel mode before updating index statistics as we cannot write
3553  * during parallel mode.
3554  */
3555  end_parallel_vacuum(vacrel);
3556 }
3557 
3558 /*
3559  * lazy_tid_reaped() -- is a particular tid deletable?
3560  *
3561  * This has the right signature to be an IndexBulkDeleteCallback.
3562  *
3563  * Assumes dead_items array is sorted (in ascending TID order).
3564  */
3565 static bool
 3566 lazy_tid_reaped(ItemPointer itemptr, void *state)
 3567 {
3568  LVDeadItems *dead_items = (LVDeadItems *) state;
3569  int64 litem,
3570  ritem,
3571  item;
3572  ItemPointer res;
3573 
3574  litem = itemptr_encode(&dead_items->items[0]);
3575  ritem = itemptr_encode(&dead_items->items[dead_items->num_items - 1]);
3576  item = itemptr_encode(itemptr);
3577 
3578  /*
3579  * Doing a simple bound check before bsearch() is useful to avoid the
3580  * extra cost of bsearch(), especially if dead items on the heap are
3581  * concentrated in a certain range. Since this function is called for
3582  * every index tuple, it pays to be really fast.
3583  */
3584  if (item < litem || item > ritem)
3585  return false;
3586 
3587  res = (ItemPointer) bsearch((void *) itemptr,
3588  (void *) dead_items->items,
3589  dead_items->num_items,
3590  sizeof(ItemPointerData),
3591  vac_cmp_itemptr);
3592 
3593  return (res != NULL);
3594 }
3595 
3596 /*
3597  * Comparator routines for use with qsort() and bsearch().
3598  */
3599 static int
3600 vac_cmp_itemptr(const void *left, const void *right)
3601 {
3602  BlockNumber lblk,
3603  rblk;
3604  OffsetNumber loff,
3605  roff;
3606 
3607  lblk = ItemPointerGetBlockNumber((ItemPointer) left);
3608  rblk = ItemPointerGetBlockNumber((ItemPointer) right);
3609 
3610  if (lblk < rblk)
3611  return -1;
3612  if (lblk > rblk)
3613  return 1;
3614 
3615  loff = ItemPointerGetOffsetNumber((ItemPointer) left);
3616  roff = ItemPointerGetOffsetNumber((ItemPointer) right);
3617 
3618  if (loff < roff)
3619  return -1;
3620  if (loff > roff)
3621  return 1;
3622 
3623  return 0;
3624 }
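/*
 * Illustrative sketch, not part of vacuumlazy.c: a simplified TID type sorted
 * with a comparator of the same (block, offset) shape as vac_cmp_itemptr(),
 * then probed the way lazy_tid_reaped() does -- a cheap range check first,
 * bsearch() only when the probe falls inside the range.  All names and values
 * are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct SketchTid
{
	uint32_t	block;
	uint16_t	offset;
} SketchTid;

static int
sketch_cmp_tid(const void *left, const void *right)
{
	const SketchTid *l = left;
	const SketchTid *r = right;

	if (l->block != r->block)
		return (l->block < r->block) ? -1 : 1;
	if (l->offset != r->offset)
		return (l->offset < r->offset) ? -1 : 1;
	return 0;
}

static int
sketch_tid_reaped(const SketchTid *probe, const SketchTid *dead, size_t n)
{
	/* cheap bound check before paying for the binary search */
	if (sketch_cmp_tid(probe, &dead[0]) < 0 ||
		sketch_cmp_tid(probe, &dead[n - 1]) > 0)
		return 0;
	return bsearch(probe, dead, n, sizeof(SketchTid), sketch_cmp_tid) != NULL;
}

int
main(void)
{
	SketchTid	dead[] = {{7, 2}, {3, 5}, {7, 1}, {3, 9}};
	SketchTid	hit = {7, 1};
	SketchTid	miss = {9, 4};

	qsort(dead, 4, sizeof(SketchTid), sketch_cmp_tid);
	printf("hit=%d miss=%d\n",
		   sketch_tid_reaped(&hit, dead, 4),
		   sketch_tid_reaped(&miss, dead, 4));
	return 0;
}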
3625 
3626 /*
3627  * Check if every tuple in the given page is visible to all current and future
3628  * transactions. Also return the visibility_cutoff_xid which is the highest
3629  * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
3630  * on this page is frozen.
3631  *
3632  * This is a stripped down version of lazy_scan_prune(). If you change
3633  * anything here, make sure that everything stays in sync. Note that an
3634  * assertion calls us to verify that everybody still agrees. Be sure to avoid
3635  * introducing new side-effects here.
3636  */
3637 static bool
 3638 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 3639  TransactionId *visibility_cutoff_xid,
3640  bool *all_frozen)
3641 {
3642  Page page = BufferGetPage(buf);
 3643  BlockNumber blockno = BufferGetBlockNumber(buf);
 3644  OffsetNumber offnum,
3645  maxoff;
3646  bool all_visible = true;
3647 
3648  *visibility_cutoff_xid = InvalidTransactionId;
3649  *all_frozen = true;
3650 
3651  maxoff = PageGetMaxOffsetNumber(page);
3652  for (offnum = FirstOffsetNumber;
3653  offnum <= maxoff && all_visible;
3654  offnum = OffsetNumberNext(offnum))
3655  {
3656  ItemId itemid;
3657  HeapTupleData tuple;
3658 
3659  /*
3660  * Set the offset number so that we can display it along with any
3661  * error that occurred while processing this tuple.
3662  */
3663  vacrel->offnum = offnum;
3664  itemid = PageGetItemId(page, offnum);
3665 
3666  /* Unused or redirect line pointers are of no interest */
3667  if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3668  continue;
3669 
3670  ItemPointerSet(&(tuple.t_self), blockno, offnum);
3671 
3672  /*
3673  * Dead line pointers can have index pointers pointing to them. So
3674  * they can't be treated as visible
3675  */
3676  if (ItemIdIsDead(itemid))
3677  {
3678  all_visible = false;
3679  *all_frozen = false;
3680  break;
3681  }
3682 
3683  Assert(ItemIdIsNormal(itemid));
3684 
3685  tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3686  tuple.t_len = ItemIdGetLength(itemid);
3687  tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3688 
3689  switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
3690  {
3691  case HEAPTUPLE_LIVE:
3692  {
3693  TransactionId xmin;
3694 
3695  /* Check comments in lazy_scan_prune. */
 3696  if (!HeapTupleHeaderXminCommitted(tuple.t_data))
 3697  {
3698  all_visible = false;
3699  *all_frozen = false;
3700  break;
3701  }
3702 
3703  /*
3704  * The inserter definitely committed. But is it old enough
3705  * that everyone sees it as committed?
3706  */
3707  xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3708  if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
3709  {
3710  all_visible = false;
3711  *all_frozen = false;
3712  break;
3713  }
3714 
3715  /* Track newest xmin on page. */
3716  if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3717  *visibility_cutoff_xid = xmin;
3718 
3719  /* Check whether this tuple is already frozen or not */
3720  if (all_visible && *all_frozen &&
 3721  heap_tuple_needs_eventual_freeze(tuple.t_data))
 3722  *all_frozen = false;
3723  }
3724  break;
3725 
3726  case HEAPTUPLE_DEAD:
 3727  case HEAPTUPLE_RECENTLY_DEAD:
 3728  case HEAPTUPLE_DELETE_IN_PROGRESS:
 3729  case HEAPTUPLE_INSERT_IN_PROGRESS:
 3730  {
3731  all_visible = false;
3732  *all_frozen = false;
3733  break;
3734  }
3735  default:
3736  elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3737  break;
3738  }
3739  } /* scan along page */
3740 
3741  /* Clear the offset information once we have processed the given page. */
3742  vacrel->offnum = InvalidOffsetNumber;
3743 
3744  return all_visible;
3745 }
3746 
3747 /*
3748  * Compute the number of parallel worker processes to request. Both index
3749  * vacuum and index cleanup can be executed with parallel workers. The index
3750  * is eligible for parallel vacuum iff its size is greater than
3751  * min_parallel_index_scan_size as invoking workers for very small indexes
3752  * can hurt performance.
3753  *
3754  * nrequested is the number of parallel workers that user requested. If
3755  * nrequested is 0, we compute the parallel degree based on nindexes, that is
3756  * the number of indexes that support parallel vacuum. This function also
3757  * sets will_parallel_vacuum to remember indexes that participate in parallel
3758  * vacuum.
3759  */
3760 static int
 3761 compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
 3762  bool *will_parallel_vacuum)
3763 {
3764  int nindexes_parallel = 0;
3765  int nindexes_parallel_bulkdel = 0;
3766  int nindexes_parallel_cleanup = 0;
3767  int parallel_workers;
3768 
3769  /*
3770  * We don't allow performing parallel operation in standalone backend or
3771  * when parallelism is disabled.
3772  */
 3773  if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
 3774  return 0;
3775 
3776  /*
3777  * Compute the number of indexes that can participate in parallel vacuum.
3778  */
3779  for (int idx = 0; idx < vacrel->nindexes; idx++)
3780  {
3781  Relation indrel = vacrel->indrels[idx];
3782  uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3783 
3784  if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
 3785  RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
 3786  continue;
3787 
3788  will_parallel_vacuum[idx] = true;
3789 
3790  if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3791  nindexes_parallel_bulkdel++;
3792  if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
3793  ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3794  nindexes_parallel_cleanup++;
3795  }
3796 
3797  nindexes_parallel = Max(nindexes_parallel_bulkdel,
3798  nindexes_parallel_cleanup);
3799 
3800  /* The leader process takes one index */
3801  nindexes_parallel--;
3802 
3803  /* No index supports parallel vacuum */
3804  if (nindexes_parallel <= 0)
3805  return 0;
3806 
3807  /* Compute the parallel degree */
3808  parallel_workers = (nrequested > 0) ?
3809  Min(nrequested, nindexes_parallel) : nindexes_parallel;
3810 
3811  /* Cap by max_parallel_maintenance_workers */
3812  parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);
3813 
3814  return parallel_workers;
3815 }
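/*
 * Illustrative sketch, not part of vacuumlazy.c: the same arithmetic as the
 * function above, reduced to plain integers.  The candidate degree is the
 * larger of the bulk-delete and cleanup counts, minus the one index the
 * leader handles itself, then capped by the user request (if any) and by a
 * stand-in for max_parallel_maintenance_workers.
 */
#include <stdio.h>

static int
sketch_parallel_degree(int nindexes_bulkdel, int nindexes_cleanup,
					   int nrequested, int worker_cap)
{
	int			nindexes_parallel =
		(nindexes_bulkdel > nindexes_cleanup) ? nindexes_bulkdel : nindexes_cleanup;

	nindexes_parallel--;		/* the leader process takes one index */
	if (nindexes_parallel <= 0)
		return 0;

	if (nrequested > 0 && nrequested < nindexes_parallel)
		nindexes_parallel = nrequested;
	if (nindexes_parallel > worker_cap)
		nindexes_parallel = worker_cap;
	return nindexes_parallel;
}

int
main(void)
{
	/* e.g. 4 indexes support bulk delete, 3 support cleanup, no explicit request */
	printf("%d\n", sketch_parallel_degree(4, 3, 0, 2));	/* capped to 2 */
	return 0;
}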
3816 
3817 /*
3818  * Update index statistics in pg_class if the statistics are accurate.
3819  */
3820 static void
 3821 update_index_statistics(LVRelState *vacrel)
 3822 {
3823  Relation *indrels = vacrel->indrels;
3824  int nindexes = vacrel->nindexes;
3825  IndexBulkDeleteResult **indstats = vacrel->indstats;
3826 
 3827  Assert(!IsInParallelMode());
 3828 
3829  for (int idx = 0; idx < nindexes; idx++)
3830  {
3831  Relation indrel = indrels[idx];
3832  IndexBulkDeleteResult *istat = indstats[idx];
3833 
3834  if (istat == NULL || istat->estimated_count)
3835  continue;
3836 
3837  /* Update index statistics */
3838  vac_update_relstats(indrel,
3839  istat->num_pages,
3840  istat->num_index_tuples,
3841  0,
3842  false,
 3843  InvalidTransactionId,
 3844  InvalidMultiXactId,
 3845  false);
3846  }
3847 }
3848 
3849 /*
3850  * Try to enter parallel mode and create a parallel context. Then initialize
3851  * shared memory state.
3852  *
3853  * On success (when we can launch one or more workers), will set dead_items and
3854  * lps in vacrel for caller. A set lps in vacrel state indicates that parallel
3855  * VACUUM is currently active.
3856  */
3857 static void
3858 begin_parallel_vacuum(LVRelState *vacrel, int nrequested)
3859 {
3860  LVParallelState *lps;
3861  Relation *indrels = vacrel->indrels;
3862  int nindexes = vacrel->nindexes;
3863  ParallelContext *pcxt;
3864  LVShared *shared;
3865  LVDeadItems *dead_items;
3866  BufferUsage *buffer_usage;
3867  WalUsage *wal_usage;
3868  bool *will_parallel_vacuum;
3869  int max_items;
3870  Size est_shared_len;
3871  Size est_dead_items_len;
3872  int nindexes_mwm = 0;
3873  int parallel_workers = 0;
3874  int querylen;
3875 
3876  /*
3877  * A parallel vacuum must be requested and there must be indexes on the
3878  * relation
3879  */
3880  Assert(nrequested >= 0);
3881  Assert(nindexes > 0);
3882 
3883  /*
3884  * Compute the number of parallel vacuum workers to launch
3885  */
3886  will_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
3887  parallel_workers = compute_parallel_vacuum_workers(vacrel,
3888  nrequested,
3889  will_parallel_vacuum);
3890  if (parallel_workers <= 0)
3891  {
3892  /* Can't perform vacuum in parallel -- lps not set in vacrel */
3893  pfree(will_parallel_vacuum);
3894  return;
3895  }
3896 
3897  lps = (LVParallelState *) palloc0(sizeof(LVParallelState));
3898 
 3899  EnterParallelMode();
 3900  pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
3901  parallel_workers);
3902  Assert(pcxt->nworkers > 0);
3903  lps->pcxt = pcxt;
3904 
3905  /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
3906  est_shared_len = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3907  for (int idx = 0; idx < nindexes; idx++)
3908  {
3909  Relation indrel = indrels[idx];
3910  uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3911 
3912  /*
3913  * Cleanup option should be either disabled, always performing in
3914  * parallel or conditionally performing in parallel.
3915  */
3916  Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
3917  ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
3918  Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);
3919 
3920  /* Skip indexes that don't participate in parallel vacuum */
3921  if (!will_parallel_vacuum[idx])
3922  continue;
3923 
3924  if (indrel->rd_indam->amusemaintenanceworkmem)
3925  nindexes_mwm++;
3926 
3927  est_shared_len = add_size(est_shared_len, sizeof(LVSharedIndStats));
3928 
3929  /*
3930  * Remember the number of indexes that support parallel operation for
3931  * each phase.
3932  */
3933  if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
 3934  lps->nindexes_parallel_bulkdel++;
 3935  if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
 3936  lps->nindexes_parallel_cleanup++;
 3937  if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
 3938  lps->nindexes_parallel_condcleanup++;
 3939  }
3940  shm_toc_estimate_chunk(&pcxt->estimator, est_shared_len);
3941  shm_toc_estimate_keys(&pcxt->estimator, 1);
3942 
3943  /* Estimate size for dead_items -- PARALLEL_VACUUM_KEY_DEAD_ITEMS */
3944  max_items = dead_items_max_items(vacrel);
3945  est_dead_items_len = MAXALIGN(max_items_to_alloc_size(max_items));
3946  shm_toc_estimate_chunk(&pcxt->estimator, est_dead_items_len);
3947  shm_toc_estimate_keys(&pcxt->estimator, 1);
3948 
3949  /*
3950  * Estimate space for BufferUsage and WalUsage --
3951  * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
3952  *
3953  * If there are no extensions loaded that care, we could skip this. We
3954  * have no way of knowing whether anyone's looking at pgBufferUsage or
3955  * pgWalUsage, so do it unconditionally.
3956  */
 3957  shm_toc_estimate_chunk(&pcxt->estimator,
 3958  mul_size(sizeof(BufferUsage), pcxt->nworkers));
3959  shm_toc_estimate_keys(&pcxt->estimator, 1);
 3960  shm_toc_estimate_chunk(&pcxt->estimator,
 3961  mul_size(sizeof(WalUsage), pcxt->nworkers));
3962  shm_toc_estimate_keys(&pcxt->estimator, 1);
3963 
3964  /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
3965  if (debug_query_string)
3966  {
3967  querylen = strlen(debug_query_string);
3968  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
3969  shm_toc_estimate_keys(&pcxt->estimator, 1);
3970  }
3971  else
3972  querylen = 0; /* keep compiler quiet */
3973 
3974  InitializeParallelDSM(pcxt);
3975 
3976  /* Prepare shared information */
3977  shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared_len);
3978  MemSet(shared, 0, est_shared_len);
3979  shared->relid = RelationGetRelid(vacrel->rel);
3980  shared->elevel = elevel;
3981  shared->maintenance_work_mem_worker =
3982  (nindexes_mwm > 0) ?
3983  maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
 3984  maintenance_work_mem;
 3985 
3986  pg_atomic_init_u32(&(shared->cost_balance), 0);
3987  pg_atomic_init_u32(&(shared->active_nworkers), 0);
3988  pg_atomic_init_u32(&(shared->idx), 0);
3989  shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3990 
3991  /*
3992  * Initialize variables for shared index statistics, set NULL bitmap and
3993  * the size of stats for each index.
3994  */
3995  memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
3996  for (int idx = 0; idx < nindexes; idx++)
3997  {
3998  if (!will_parallel_vacuum[idx])
3999  continue;
4000 
4001  /* Set NOT NULL as this index does support parallelism */
4002  shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
4003  }
4004 
 4005  shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
 4006  lps->lvshared = shared;
4007 
4008  /* Prepare the dead_items space */
4009  dead_items = (LVDeadItems *) shm_toc_allocate(pcxt->toc,
4010  est_dead_items_len);
4011  dead_items->max_items = max_items;
4012  dead_items->num_items = 0;
4013  MemSet(dead_items->items, 0, sizeof(ItemPointerData) * max_items);
 4014  shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_ITEMS, dead_items);
 4015 
4016  /*
4017  * Allocate space for each worker's BufferUsage and WalUsage; no need to
4018  * initialize
4019  */
4020  buffer_usage = shm_toc_allocate(pcxt->toc,
4021  mul_size(sizeof(BufferUsage), pcxt->nworkers));
4022  shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
4023  lps->buffer_usage = buffer_usage;
4024  wal_usage = shm_toc_allocate(pcxt->toc,
4025  mul_size(sizeof(WalUsage), pcxt->nworkers));
 4026  shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
 4027  lps->wal_usage = wal_usage;
4028 
4029  /* Store query string for workers */
4030  if (debug_query_string)
4031  {
4032  char *sharedquery;
4033 
4034  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
4035  memcpy(sharedquery, debug_query_string, querylen + 1);
4036  sharedquery[querylen] = '\0';
4037  shm_toc_insert(pcxt->toc,
4038  PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
4039  }
4040 
4041  pfree(will_parallel_vacuum);
4042 
4043  /* Success -- set dead_items and lps in leader's vacrel state */
4044  vacrel->dead_items = dead_items;
4045  vacrel->lps = lps;
4046 }
4047 
4048 /*
4049  * Destroy the parallel context, and end parallel mode.
4050  *
4051  * Since writes are not allowed during parallel mode, copy the
4052  * updated index statistics from DSM into local memory and then later use that
4053  * to update the index statistics. One might think that we can exit from
4054  * parallel mode, update the index statistics and then destroy parallel
4055  * context, but that won't be safe (see ExitParallelMode).
4056  */
4057 static void
 4058 end_parallel_vacuum(LVRelState *vacrel)
 4059 {
4060  IndexBulkDeleteResult **indstats = vacrel->indstats;
4061  LVParallelState *lps = vacrel->lps;
4062  int nindexes = vacrel->nindexes;
4063 
 4064  Assert(!IsParallelWorker());
 4065 
4066  /* Copy the updated statistics */
4067  for (int idx = 0; idx < nindexes; idx++)
4068  {
4069  LVSharedIndStats *shared_istat;
4070 
4071  shared_istat = parallel_stats_for_idx(lps->lvshared, idx);
4072 
4073  /*
4074  * Skip index -- it must have been processed by the leader, from
4075  * inside do_serial_processing_for_unsafe_indexes()
4076  */
4077  if (shared_istat == NULL)
4078  continue;
4079 
4080  if (shared_istat->updated)
4081  {
4082  indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
4083  memcpy(indstats[idx], &shared_istat->istat, sizeof(IndexBulkDeleteResult));
4084  }
4085  else
4086  indstats[idx] = NULL;
4087  }
4088 
 4089  DestroyParallelContext(lps->pcxt);
 4090  ExitParallelMode();
4091 
4092  /* Deactivate parallel vacuum */
4093  pfree(lps);
4094  vacrel->lps = NULL;
4095 }
4096 
4097 /*
4098  * Return shared memory statistics for index at offset 'getidx', if any
4099  *
4100  * Returning NULL indicates that compute_parallel_vacuum_workers() determined
4101  * that the index is a totally unsuitable target for all parallel processing
4102  * up front. For example, the index could be < min_parallel_index_scan_size
4103  * cutoff.
4104  */
4105 static LVSharedIndStats *
4106 parallel_stats_for_idx(LVShared *lvshared, int getidx)
4107 {
4108  char *p;
4109 
4110  if (IndStatsIsNull(lvshared, getidx))
4111  return NULL;
4112 
4113  p = (char *) GetSharedIndStats(lvshared);
4114  for (int idx = 0; idx < getidx; idx++)
4115  {
4116  if (IndStatsIsNull(lvshared, idx))
4117  continue;
4118 
4119  p += sizeof(LVSharedIndStats);
4120  }
4121 
4122  return (LVSharedIndStats *) p;
4123 }
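/*
 * Illustrative sketch, not part of vacuumlazy.c: the addressing scheme used
 * above, with hypothetical types.  A bitmap marks which indexes have a slot,
 * and the slots are packed back to back, so finding slot 'getidx' means
 * skipping one fixed-size entry per earlier bit that is set.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_NINDEXES 5

typedef struct SketchIndStats
{
	int			pages_deleted;	/* stand-in payload */
} SketchIndStats;

static SketchIndStats *
sketch_stats_for_idx(const uint8_t *bitmap, SketchIndStats *packed, int getidx)
{
	int			slot = 0;

	if ((bitmap[getidx >> 3] & (1 << (getidx & 0x07))) == 0)
		return NULL;			/* this index has no shared slot */

	for (int idx = 0; idx < getidx; idx++)
		if (bitmap[idx >> 3] & (1 << (idx & 0x07)))
			slot++;				/* one packed entry per earlier set bit */

	return &packed[slot];
}

int
main(void)
{
	uint8_t		bitmap[1] = {0};
	SketchIndStats packed[SKETCH_NINDEXES] = {{10}, {20}, {30}, {40}, {50}};
	SketchIndStats *s;

	/* only indexes 0, 2 and 4 participate */
	bitmap[0] |= 1 << 0;
	bitmap[0] |= 1 << 2;
	bitmap[0] |= 1 << 4;

	s = sketch_stats_for_idx(bitmap, packed, 4);
	printf("index 4 uses packed slot with payload %d\n", s ? s->pages_deleted : -1);
	printf("index 1 has %s slot\n",
		   sketch_stats_for_idx(bitmap, packed, 1) ? "a" : "no");
	return 0;
}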
4124 
4125 /*
4126  * Returns false, if the given index can't participate in parallel index
4127  * vacuum or parallel index cleanup
4128  */
4129 static bool
 4130 parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
 4131 {
4132  uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
4133 
4134  /* first_time must be true only if for_cleanup is true */
4135  Assert(lvshared->for_cleanup || !lvshared->first_time);
4136 
4137  if (lvshared->for_cleanup)
4138  {
4139  /* Skip, if the index does not support parallel cleanup */
4140  if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
4141  ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
4142  return false;
4143 
4144  /*
4145  * Skip, if the index supports parallel cleanup conditionally, but we
4146  * have already processed the index (for bulkdelete). See the
4147  * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
4148  * when indexes support parallel cleanup conditionally.
4149  */
4150  if (!lvshared->first_time &&
4151  ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
4152  return false;
4153  }
4154  else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
4155  {
4156  /* Skip if the index does not support parallel bulk deletion */
4157  return false;
4158  }
4159 
4160  return true;
4161 }
4162 
4163 /*
4164  * Perform work within a launched parallel process.
4165  *
4166  * Since parallel vacuum workers perform only index vacuum or index cleanup,
4167  * we don't need to report progress information.
4168  */
4169 void
 4170 parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
 4171 {
4172  Relation rel;
4173  Relation *indrels;
4174  LVShared *lvshared;
4175  LVDeadItems *dead_items;
4176  BufferUsage *buffer_usage;
4177  WalUsage *wal_usage;
4178  int nindexes;
4179  char *sharedquery;
4180  LVRelState vacrel;
4181  ErrorContextCallback errcallback;
4182 
4183  /*
4184  * A parallel vacuum worker must have only PROC_IN_VACUUM flag since we
4185  * don't support parallel vacuum for autovacuum as of now.
4186  */
 4187  Assert(MyProc->statusFlags == PROC_IN_VACUUM);
 4188 
 4189  lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
 4190  false);
4191  elevel = lvshared->elevel;
4192 
4193  if (lvshared->for_cleanup)
4194  elog(DEBUG1, "starting parallel vacuum worker for cleanup");
4195  else
4196  elog(DEBUG1, "starting parallel vacuum worker for bulk delete");
4197 
4198  /* Set debug_query_string for individual workers */
4199  sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
4200  debug_query_string = sharedquery;
 4201  pgstat_report_activity(STATE_RUNNING, debug_query_string);
 4202 
4203  /*
4204  * Open table. The lock mode is the same as the leader process. It's
4205  * okay because the lock mode does not conflict among the parallel
4206  * workers.
4207  */
4208  rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);
4209 
4210  /*
4211  * Open all indexes. indrels are sorted in order by OID, which should be
4212  * matched to the leader's one.
4213  */
4214  vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
4215  Assert(nindexes > 0);
4216 
4217  /* Set dead_items space (set as worker's vacrel dead_items below) */
4218  dead_items = (LVDeadItems *) shm_toc_lookup(toc,
 4219  PARALLEL_VACUUM_KEY_DEAD_ITEMS,
 4220  false);
4221 
4222  /* Set cost-based vacuum delay */
 4223  VacuumCostActive = (VacuumCostDelay > 0);
 4224  VacuumCostBalance = 0;
4225  VacuumPageHit = 0;
4226  VacuumPageMiss = 0;
4227  VacuumPageDirty = 0;
 4228  VacuumCostBalanceLocal = 0;
 4229  VacuumSharedCostBalance = &(lvshared->cost_balance);
4230  VacuumActiveNWorkers = &(lvshared->active_nworkers);
4231 
4232  vacrel.rel = rel;
4233  vacrel.indrels = indrels;
4234  vacrel.nindexes = nindexes;
4235  /* Each parallel VACUUM worker gets its own access strategy */
 4236  vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM);
 4237  vacrel.indstats = (IndexBulkDeleteResult **)
4238  palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
4239 
4240  if (lvshared->maintenance_work_mem_worker > 0)
 4241  maintenance_work_mem = lvshared->maintenance_work_mem_worker;
 4242 
4243  /*
4244  * Initialize vacrel for use as error callback arg by parallel worker.
4245  */
 4246  vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
 4247  vacrel.relname = pstrdup(RelationGetRelationName(rel));
4248  vacrel.indname = NULL;
4249  vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */
4250  vacrel.dead_items = dead_items;
4251 
4252  /* Setup error traceback support for ereport() */
4253  errcallback.callback = vacuum_error_callback;
4254  errcallback.arg = &vacrel;
4255  errcallback.previous = error_context_stack;
4256  error_context_stack = &errcallback;
4257 
4258  /* Prepare to track buffer usage during parallel execution */
 4259  InstrStartParallelQuery();
 4260 
4261  /* Process indexes to perform vacuum/cleanup */
4262  do_parallel_processing(&vacrel, lvshared);
4263 
4264  /* Report buffer/WAL usage during parallel execution */
4265  buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
4266  wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
 4267  InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
 4268  &wal_usage[ParallelWorkerNumber]);
4269 
4270  /* Pop the error context stack */
4271  error_context_stack = errcallback.previous;
4272 
4273  vac_close_indexes(nindexes, indrels, RowExclusiveLock);
 4274  table_close(rel, ShareUpdateExclusiveLock);
 4275  FreeAccessStrategy(vacrel.bstrategy);
4276  pfree(vacrel.indstats);
4277 }
4278 
4279 /*
4280  * Error context callback for errors occurring during vacuum.
4281  */
4282 static void
 4283 vacuum_error_callback(void *arg)
 4284 {
4285  LVRelState *errinfo = arg;
4286 
4287  switch (errinfo->phase)
4288  {
 4289  case VACUUM_ERRCB_PHASE_SCAN_HEAP:
 4290  if (BlockNumberIsValid(errinfo->blkno))
4291  {
4292  if (OffsetNumberIsValid(errinfo->offnum))
4293  errcontext("while scanning block %u offset %u of relation \"%s.%s\"",
4294  errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4295  else
4296  errcontext("while scanning block %u of relation \"%s.%s\"",
4297  errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4298  }
4299  else
4300  errcontext("while scanning relation \"%s.%s\"",
4301  errinfo->relnamespace, errinfo->relname);
4302  break;
4303 
 4304  case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
 4305  if (BlockNumberIsValid(errinfo->blkno))
4306  {
4307  if (OffsetNumberIsValid(errinfo->offnum))
4308  errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"",
4309  errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4310  else
4311  errcontext("while vacuuming block %u of relation \"%s.%s\"",
4312  errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4313  }
4314  else
4315  errcontext("while vacuuming relation \"%s.%s\"",
4316  errinfo->relnamespace, errinfo->relname);
4317  break;
4318 
 4319  case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
 4320  errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
4321  errinfo->indname, errinfo->relnamespace, errinfo->relname);
4322  break;
4323 
 4324  case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
 4325  errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
4326  errinfo->indname, errinfo->relnamespace, errinfo->relname);
4327  break;
4328 
 4329  case VACUUM_ERRCB_PHASE_TRUNCATE:
 4330  if (BlockNumberIsValid(errinfo->blkno))
4331  errcontext("while truncating relation \"%s.%s\" to %u blocks",
4332  errinfo->relnamespace, errinfo->relname, errinfo->blkno);
4333  break;
4334 
 4335  case VACUUM_ERRCB_PHASE_UNKNOWN:
 4336  default:
4337  return; /* do nothing; the errinfo may not be
4338  * initialized */
4339  }
4340 }
4341 
4342 /*
4343  * Updates the information required for vacuum error callback. This also saves
4344  * the current information which can be later restored via restore_vacuum_error_info.
4345  */
4346 static void
 4347 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
 4348  int phase, BlockNumber blkno, OffsetNumber offnum)
4349 {
4350  if (saved_vacrel)
4351  {
4352  saved_vacrel->offnum = vacrel->offnum;
4353  saved_vacrel->blkno = vacrel->blkno;
4354  saved_vacrel->phase = vacrel->phase;
4355  }
4356 
4357  vacrel->blkno = blkno;
4358  vacrel->offnum = offnum;
4359  vacrel->phase = phase;
4360 }
4361 
4362 /*
4363  * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
4364  */
4365 static void
 4366 restore_vacuum_error_info(LVRelState *vacrel,
 4367  const LVSavedErrInfo *saved_vacrel)
4368 {
4369  vacrel->blkno = saved_vacrel->blkno;
4370  vacrel->offnum = saved_vacrel->offnum;
4371  vacrel->phase = saved_vacrel->phase;
4372 }
Datum idx(PG_FUNCTION_ARGS)
Definition: _int_op.c:259
int min_parallel_index_scan_size
Definition: allpaths.c:65
static uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:401
static void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:223
static uint32 pg_atomic_fetch_add_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:328
static uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
Definition: atomics.h:386
static void pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:258
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
int autovacuum_work_mem
Definition: autovacuum.c:116
bool IsAutoVacuumWorkerProcess(void)
Definition: autovacuum.c:3411
int ParallelWorkerNumber
Definition: parallel.c:112
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:202
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:762
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:539
void ReinitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:475
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:916
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:164
void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch)
Definition: parallel.c:525
void TimestampDifference(TimestampTz start_time, TimestampTz stop_time, long *secs, int *microsecs)
Definition: timestamp.c:1656
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1711
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1580
void pgstat_progress_start_command(ProgressCommandType cmdtype, Oid relid)
void pgstat_progress_update_param(int index, int64 val)
void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val)
void pgstat_progress_end_command(void)
@ PROGRESS_COMMAND_VACUUM
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
bool track_io_timing
Definition: bufmgr.c:135
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:587
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3757
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3780
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1565
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:4053
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3996
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:741
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:4230
@ BAS_VACUUM
Definition: bufmgr.h:33
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:96
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:97
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:212
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:98
@ RBM_NORMAL
Definition: bufmgr.h:39
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
Size PageGetHeapFreeSpace(Page page)
Definition: bufpage.c:984
void PageTruncateLinePointerArray(Page page)
Definition: bufpage.c:828
Pointer Page
Definition: bufpage.h:78
#define PageIsAllVisible(page)
Definition: bufpage.h:384
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageIsEmpty(page)
Definition: bufpage.h:221
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
#define SizeOfPageHeaderData
Definition: bufpage.h:215
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
#define PageIsNew(page)
Definition: bufpage.h:228
#define PageClearAllVisible(page)
Definition: bufpage.h:388
#define PageSetAllVisible(page)
Definition: bufpage.h:386
#define PageGetLSN(page)
Definition: bufpage.h:365
unsigned int uint32
Definition: c.h:441
#define Min(x, y)
Definition: c.h:986
#define MAXALIGN(LEN)
Definition: c.h:757
#define offsetof(type, field)
Definition: c.h:727
#define ngettext(s, p, n)
Definition: c.h:1179
#define Max(x, y)
Definition: c.h:980
TransactionId MultiXactId
Definition: c.h:597
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:350
uint8 bits8
Definition: c.h:448
#define unlikely(x)
Definition: c.h:273
unsigned char uint8
Definition: c.h:439
#define MemSet(start, val, len)
Definition: c.h:1008
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:918
uint32 TransactionId
Definition: c.h:587
size_t Size
Definition: c.h:540
int64 TimestampTz
Definition: timestamp.h:39
char * get_database_name(Oid dbid)
Definition: dbcommands.c:2113
int errmsg_internal(const char *fmt,...)
Definition: elog.c:996
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1069
int errdetail(const char *fmt,...)
Definition: elog.c:1042
ErrorContextCallback * error_context_stack
Definition: elog.c:93
int errhint(const char *fmt,...)
Definition: elog.c:1156
int errmsg(const char *fmt,...)
Definition: elog.c:909
#define _(x)
Definition: elog.c:89
#define LOG
Definition: elog.h:25
#define errcontext
Definition: elog.h:190
#define WARNING
Definition: elog.h:30
#define DEBUG2
Definition: elog.h:23
#define DEBUG1
Definition: elog.h:24
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
#define INFO
Definition: elog.h:28
#define ereport(elevel,...)
Definition: elog.h:143
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:542
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:597
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition: freespace.c:352
Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
Definition: freespace.c:230
void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:181
int64 VacuumPageHit
Definition: globals.c:147
int max_parallel_maintenance_workers
Definition: globals.c:127
int64 VacuumPageMiss
Definition: globals.c:148
bool VacuumCostActive
Definition: globals.c:152
bool IsUnderPostmaster
Definition: globals.c:112
int64 VacuumPageDirty
Definition: globals.c:149
int VacuumCostBalance
Definition: globals.c:151
int maintenance_work_mem
Definition: globals.c:126
struct Latch * MyLatch
Definition: globals.c:57
double VacuumCostDelay
Definition: globals.c:145
Oid MyDatabaseId
Definition: globals.c:88
void heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
Definition: heapam.c:6612
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
Definition: heapam.c:7028
bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, MultiXactId cutoff_multi, Buffer buf)
Definition: heapam.c:7081
bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId cutoff_xid, TransactionId cutoff_multi, xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
Definition: heapam.c:6383
XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples, int ntuples)
Definition: heapam.c:7983
HTSV_Result
Definition: heapam.h:94
@ HEAPTUPLE_RECENTLY_DEAD
Definition: heapam.h:97
@ HEAPTUPLE_INSERT_IN_PROGRESS
Definition: heapam.h:98
@ HEAPTUPLE_LIVE
Definition: heapam.h:96
@ HEAPTUPLE_DELETE_IN_PROGRESS
Definition: heapam.h:99
@ HEAPTUPLE_DEAD
Definition: heapam.h:95
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
#define XLOG_HEAP2_VACUUM
Definition: heapam_xlog.h:55
#define SizeOfHeapVacuum
Definition: heapam_xlog.h:265
HeapTupleHeaderData * HeapTupleHeader
Definition: htup.h:23
#define HeapTupleHeaderGetXmin(tup)
Definition: htup_details.h:308
#define BITMAPLEN(NATTS)
Definition: htup_details.h:541
#define HeapTupleHeaderXminCommitted(tup)
Definition: htup_details.h:319
#define MaxHeapTuplesPerPage
Definition: htup_details.h:568
#define IsParallelWorker()
Definition: parallel.h:61
static int64 itemptr_encode(ItemPointer itemptr)
Definition: index.h:185
IndexBulkDeleteResult * index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *istat)
Definition: indexam.c:712
IndexBulkDeleteResult * index_bulk_delete(IndexVacuumInfo *info, IndexBulkDeleteResult *istat, IndexBulkDeleteCallback callback, void *callback_state)
Definition: indexam.c:691
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:156
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:170
struct timeval instr_time
Definition: instr_time.h:150
#define INSTR_TIME_GET_MICROSEC(t)
Definition: instr_time.h:205
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
WalUsage pgWalUsage
Definition: instrument.c:22
void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
Definition: instrument.c:274
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int i
Definition: isn.c:73
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdIsNormal(itemId)
Definition: itemid.h:99
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define ItemIdIsUsed(itemId)
Definition: itemid.h:92
#define ItemIdSetUnused(itemId)
Definition: itemid.h:128
#define ItemIdIsRedirected(itemId)
Definition: itemid.h:106
#define ItemIdHasStorage(itemId)
Definition: itemid.h:120
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define ItemPointerSet(pointer, blockNumber, offNum)
Definition: itemptr.h:127
#define ItemPointerSetOffsetNumber(pointer, offsetNumber)
Definition: itemptr.h:148
ItemPointerData * ItemPointer
Definition: itemptr.h:49
struct ItemPointerData ItemPointerData
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define ItemPointerSetBlockNumber(pointer, blockNumber)
Definition: itemptr.h:138
void ResetLatch(Latch *latch)
Definition: latch.c:660
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:452
#define WL_TIMEOUT
Definition: latch.h:128
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define WL_LATCH_SET
Definition: latch.h:125
Assert(fmt[strlen(fmt) - 1] !='\n')
void UnlockRelation(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:283
bool ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:248
bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:346
#define NoLock
Definition: lockdefs.h:34
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define RowExclusiveLock
Definition: lockdefs.h:38
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3316
char * pstrdup(const char *in)
Definition: mcxt.c:1299
void pfree(void *pointer)
Definition: mcxt.c:1169
void * palloc0(Size size)
Definition: mcxt.c:1093
void * palloc(Size size)
Definition: mcxt.c:1062
#define MaxAllocSize
Definition: memutils.h:40
#define START_CRIT_SECTION()
Definition: miscadmin.h:147
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
#define END_CRIT_SECTION()
Definition: miscadmin.h:149
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3173
#define MultiXactIdIsValid(multi)
Definition: multixact.h:28
#define InvalidMultiXactId
Definition: multixact.h:24
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
void * arg
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:70
PgStat_Counter pgStatBlockReadTime
Definition: pgstat.c:243
PgStat_Counter pgStatBlockWriteTime
Definition: pgstat.c:244
void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples)
Definition: pgstat.c:1651
int64 PgStat_Counter
Definition: pgstat.h:95
const char * debug_query_string
Definition: postgres.c:90
unsigned int Oid
Definition: postgres_ext.h:31
#define PROC_IN_VACUUM
Definition: proc.h:55
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition: procarray.c:4042
#define PROGRESS_VACUUM_PHASE_FINAL_CLEANUP
Definition: progress.h:35
#define PROGRESS_VACUUM_PHASE_SCAN_HEAP
Definition: progress.h:30
#define PROGRESS_VACUUM_TOTAL_HEAP_BLKS
Definition: progress.h:22
#define PROGRESS_VACUUM_PHASE
Definition: progress.h:21
#define PROGRESS_VACUUM_NUM_DEAD_TUPLES
Definition: progress.h:27
#define PROGRESS_VACUUM_NUM_INDEX_VACUUMS
Definition: progress.h:25
#define PROGRESS_VACUUM_PHASE_VACUUM_HEAP
Definition: progress.h:32
#define PROGRESS_VACUUM_HEAP_BLKS_SCANNED
Definition: progress.h:23
#define PROGRESS_VACUUM_PHASE_INDEX_CLEANUP
Definition: progress.h:33
#define PROGRESS_VACUUM_PHASE_VACUUM_INDEX
Definition: progress.h:31
#define PROGRESS_VACUUM_MAX_DEAD_TUPLES
Definition: progress.h:26
#define PROGRESS_VACUUM_HEAP_BLKS_VACUUMED
Definition: progress.h:24
#define PROGRESS_VACUUM_PHASE_TRUNCATE
Definition: progress.h:34
int heap_page_prune(Relation relation, Buffer buffer, GlobalVisState *vistest, TransactionId old_snap_xmin, TimestampTz old_snap_ts, int *nnewlpdead, OffsetNumber *off_loc)
Definition: pruneheap.c:243
#define RelationGetRelid(relation)
Definition: rel.h:478
#define RelationGetRelationName(relation)
Definition: rel.h:512
#define RelationNeedsWAL(relation)
Definition: rel.h:602
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:611
#define RelationGetNamespace(relation)
Definition: rel.h:519
@ MAIN_FORKNUM
Definition: relpath.h:43
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519
int old_snapshot_threshold
Definition: snapmgr.c:78
PGPROC * MyProc
Definition: proc.c:68
void RelationTruncate(Relation rel, BlockNumber nblocks)
Definition: storage.c:277
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
struct ErrorContextCallback * previous
Definition: elog.h:232
void(* callback)(void *arg)
Definition: elog.h:233
ItemPointerData t_self
Definition: htup.h:65
uint32 t_len
Definition: htup.h:64
HeapTupleHeader t_data
Definition: htup.h:68
Oid t_tableOid
Definition: htup.h:66
bool amusemaintenanceworkmem
Definition: amapi.h:246
uint8 amparallelvacuumoptions
Definition: amapi.h:250
bool estimated_count
Definition: genam.h:77
BlockNumber pages_deleted
Definition: genam.h:81
BlockNumber pages_newly_deleted
Definition: genam.h:80
BlockNumber pages_free
Definition: genam.h:82
BlockNumber num_pages
Definition: genam.h:76
double tuples_removed
Definition: genam.h:79
double num_index_tuples
Definition: genam.h:78
Relation index
Definition: genam.h:46
double num_heap_tuples
Definition: genam.h:51
bool analyze_only
Definition: genam.h:47
BufferAccessStrategy strategy
Definition: genam.h:52
bool report_progress
Definition: genam.h:48
int message_level
Definition: genam.h:50
bool estimated_count
Definition: genam.h:49
ItemPointerData items[FLEXIBLE_ARRAY_MEMBER]
Definition: vacuumlazy.c:165
TransactionId visibility_cutoff_xid
Definition: vacuumlazy.c:364
WalUsage * wal_usage
Definition: vacuumlazy.c:269
ParallelContext * pcxt
Definition: vacuumlazy.c:260
int nindexes_parallel_cleanup
Definition: vacuumlazy.c:276
int nindexes_parallel_bulkdel
Definition: vacuumlazy.c:275
LVShared * lvshared
Definition: vacuumlazy.c:263
BufferUsage * buffer_usage
Definition: vacuumlazy.c:266
int nindexes_parallel_condcleanup
Definition: vacuumlazy.c:277
int nindexes
Definition: vacuumlazy.c:285
MultiXactId relminmxid
Definition: vacuumlazy.c:303
TransactionId OldestXmin
Definition: vacuumlazy.c:307
BlockNumber tupcount_pages
Definition: vacuumlazy.c:328
OffsetNumber offnum
Definition: vacuumlazy.c:317
int64 tuples_deleted
Definition: vacuumlazy.c:341
MultiXactId MultiXactCutoff
Definition: vacuumlazy.c:310
BlockNumber nonempty_pages
Definition: vacuumlazy.c:331
double old_live_tuples
Definition: vacuumlazy.c:304
bool do_rel_truncate
Definition: vacuumlazy.c:295
BlockNumber scanned_pages
Definition: vacuumlazy.c:325
bool failsafe_active
Definition: vacuumlazy.c:288
int num_index_scans
Definition: vacuumlazy.c:340
IndexBulkDeleteResult ** indstats
Definition: vacuumlazy.c:337
double new_live_tuples
Definition: vacuumlazy.c:335
double new_rel_tuples
Definition: vacuumlazy.c:334
int64 new_dead_tuples
Definition: vacuumlazy.c:343
Relation rel
Definition: vacuumlazy.c:283
bool consider_bypass_optimization
Definition: vacuumlazy.c:290
BlockNumber rel_pages
Definition: vacuumlazy.c:324
TransactionId FreezeLimit
Definition: vacuumlazy.c:309
BlockNumber pinskipped_pages
Definition: vacuumlazy.c:326
BlockNumber pages_removed
Definition: vacuumlazy.c:329
char * relnamespace
Definition: vacuumlazy.c:313
int64 live_tuples
Definition: vacuumlazy.c:346
BlockNumber frozenskipped_pages
Definition: vacuumlazy.c:327
int64 lpdead_items
Definition: vacuumlazy.c:342
BufferAccessStrategy bstrategy
Definition: vacuumlazy.c:298
BlockNumber lpdead_item_pages
Definition: vacuumlazy.c:330
Relation * indrels
Definition: vacuumlazy.c:284
bool do_index_cleanup
Definition: vacuumlazy.c:294
BlockNumber blkno
Definition: vacuumlazy.c:316
LVParallelState * lps
Definition: vacuumlazy.c:299
TransactionId relfrozenxid
Definition: vacuumlazy.c:302
int64 num_tuples
Definition: vacuumlazy.c:345
LVDeadItems * dead_items
Definition: vacuumlazy.c:323
char * relname
Definition: vacuumlazy.c:314
VacErrPhase phase
Definition: vacuumlazy.c:318
char * indname
Definition: vacuumlazy.c:315
bool do_index_vacuuming
Definition: vacuumlazy.c:293
BlockNumber blkno
Definition: vacuumlazy.c:370
VacErrPhase phase
Definition: vacuumlazy.c:372
OffsetNumber offnum
Definition: vacuumlazy.c:371
IndexBulkDeleteResult istat
Definition: vacuumlazy.c:254
pg_atomic_uint32 idx
Definition: vacuumlazy.c:234
bool first_time
Definition: vacuumlazy.c:190
Oid relid
Definition: vacuumlazy.c:181
pg_atomic_uint32 active_nworkers
Definition: vacuumlazy.c:227
double reltuples
Definition: vacuumlazy.c:202
int elevel
Definition: vacuumlazy.c:182
bool for_cleanup
Definition: vacuumlazy.c:189
bool estimated_count
Definition: vacuumlazy.c:203
bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]
Definition: vacuumlazy.c:236
uint32 offset
Definition: vacuumlazy.c:235
pg_atomic_uint32 cost_balance
Definition: vacuumlazy.c:220
int maintenance_work_mem_worker
Definition: vacuumlazy.c:213
uint8 statusFlags
Definition: proc.h:192
shm_toc_estimator estimator
Definition: parallel.h:42
shm_toc * toc
Definition: parallel.h:45
int nworkers_launched
Definition: parallel.h:38
struct IndexAmRoutine * rd_indam
Definition: rel.h:202
Form_pg_class rd_rel
Definition: rel.h:109
int nworkers
Definition: vacuum.h:230
int freeze_table_age
Definition: vacuum.h:213
VacOptValue truncate
Definition: vacuum.h:223
bits32 options
Definition: vacuum.h:211
int freeze_min_age
Definition: vacuum.h:212
bool is_wraparound
Definition: vacuum.h:218
int multixact_freeze_min_age
Definition: vacuum.h:214
int multixact_freeze_table_age
Definition: vacuum.h:216
int log_min_duration
Definition: vacuum.h:219
VacOptValue index_cleanup
Definition: vacuum.h:222
uint64 wal_bytes
Definition: instrument.h:51
int64 wal_fpi
Definition: instrument.h:50
int64 wal_records
Definition: instrument.h:49
Definition: type.h:90
Definition: regguts.h:318
OffsetNumber offset
Definition: heapam_xlog.h:327
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:167
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:39
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:319
bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition: transam.c:334
#define FrozenTransactionId
Definition: transam.h:33
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
void vac_update_relstats(Relation relation, BlockNumber num_pages, double num_tuples, BlockNumber num_all_visible_pages, bool hasindex, TransactionId frozenxid, MultiXactId minmulti, bool in_outer_xact)
Definition: vacuum.c:1306
pg_atomic_uint32 * VacuumActiveNWorkers
Definition: vacuum.c:79
void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel)
Definition: vacuum.c:2085
int VacuumCostBalanceLocal
Definition: vacuum.c:80
bool vacuum_xid_failsafe_check(TransactionId relfrozenxid, MultiXactId relminmxid)
Definition: vacuum.c:1163
void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
Definition: vacuum.c:2128
void vacuum_delay_point(void)
Definition: vacuum.c:2149
pg_atomic_uint32 * VacuumSharedCostBalance
Definition: vacuum.c:78
void vacuum_set_xid_limits(Relation rel, int freeze_min_age, int freeze_table_age, int multixact_freeze_min_age, int multixact_freeze_table_age, TransactionId *oldestXmin, TransactionId *freezeLimit, TransactionId *xidFullScanLimit, MultiXactId *multiXactCutoff, MultiXactId *mxactFullScanLimit)
Definition: vacuum.c:957
double vac_estimate_reltuples(Relation relation, BlockNumber total_pages, BlockNumber scanned_pages, double scanned_tuples)
Definition: vacuum.c:1223
#define VACOPT_VERBOSE
Definition: vacuum.h:180
#define VACUUM_OPTION_PARALLEL_CLEANUP
Definition: vacuum.h:60
#define VACUUM_OPTION_NO_PARALLEL
Definition: vacuum.h:39
@ VACOPTVALUE_AUTO
Definition: vacuum.h:198
@ VACOPTVALUE_ENABLED
Definition: vacuum.h:200
@ VACOPTVALUE_UNSPECIFIED
Definition: vacuum.h:197
@ VACOPTVALUE_DISABLED
Definition: vacuum.h:199
#define VACOPT_DISABLE_PAGE_SKIPPING
Definition: vacuum.h:185
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:45
#define VACUUM_OPTION_MAX_VALID_VALUE
Definition: vacuum.h:63
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition: vacuum.h:52
static void dead_items_cleanup(LVRelState *vacrel)
Definition: vacuumlazy.c:3543
struct LVParallelState LVParallelState
static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen)
Definition: vacuumlazy.c:3638
#define FORCE_CHECK_PAGE()
static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
Definition: vacuumlazy.c:2677
static Size max_items_to_alloc_size(int max_items)
Definition: vacuumlazy.c:3479
static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
Definition: vacuumlazy.c:2638
#define SizeOfLVShared
Definition: vacuumlazy.c:241
static void update_index_statistics(LVRelState *vacrel)
Definition: vacuumlazy.c:3821
static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, GlobalVisState *vistest, LVPagePruneState *prunestate)
Definition: vacuumlazy.c:1682
static bool lazy_tid_reaped(ItemPointer itemptr, void *state)
Definition: vacuumlazy.c:3566
static IndexBulkDeleteResult * parallel_process_one_index(Relation indrel, IndexBulkDeleteResult *istat, LVShared *lvshared, LVSharedIndStats *shared_indstats, LVRelState *vacrel)
Definition: vacuumlazy.c:2908
struct LVPagePruneState LVPagePruneState
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL
Definition: vacuumlazy.c:87
static void vacuum_error_callback(void *arg)
Definition: vacuumlazy.c:4283
static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
Definition: vacuumlazy.c:2857
static void lazy_truncate_heap(LVRelState *vacrel)
Definition: vacuumlazy.c:3165
static void lazy_vacuum(LVRelState *vacrel)
Definition: vacuumlazy.c:2065
static void lazy_cleanup_all_indexes(LVRelState *vacrel)
Definition: vacuumlazy.c:2962
static void end_parallel_vacuum(LVRelState *vacrel)
Definition: vacuumlazy.c:4058
#define MAXDEADITEMS(avail_mem)
Definition: vacuumlazy.c:168
#define REL_TRUNCATE_MINIMUM
Definition: vacuumlazy.c:76
#define PARALLEL_VACUUM_KEY_QUERY_TEXT
Definition: vacuumlazy.c:130
static bool should_attempt_truncation(LVRelState *vacrel)
Definition: vacuumlazy.c:3144
VacErrPhase
Definition: vacuumlazy.c:142
@ VACUUM_ERRCB_PHASE_SCAN_HEAP
Definition: vacuumlazy.c:144
@ VACUUM_ERRCB_PHASE_VACUUM_INDEX
Definition: vacuumlazy.c:145
@ VACUUM_ERRCB_PHASE_TRUNCATE
Definition: vacuumlazy.c:148
@ VACUUM_ERRCB_PHASE_INDEX_CLEANUP
Definition: vacuumlazy.c:147
@ VACUUM_ERRCB_PHASE_VACUUM_HEAP
Definition: vacuumlazy.c:146
@ VACUUM_ERRCB_PHASE_UNKNOWN
Definition: vacuumlazy.c:143
#define PARALLEL_VACUUM_KEY_BUFFER_USAGE
Definition: vacuumlazy.c:131
#define ParallelVacuumIsActive(vacrel)
Definition: vacuumlazy.c:138
static void restore_vacuum_error_info(LVRelState *vacrel, const LVSavedErrInfo *saved_vacrel)
Definition: vacuumlazy.c:4366
#define IndStatsIsNull(s, i)
Definition: vacuumlazy.c:244
static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
Definition: vacuumlazy.c:2617
static LVSharedIndStats * parallel_stats_for_idx(LVShared *lvshared, int getidx)
Definition: vacuumlazy.c:4106
struct LVSharedIndStats LVSharedIndStats
void heap_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy)
Definition: vacuumlazy.c:456
static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, int index, Buffer *vmbuffer)
Definition: vacuumlazy.c:2389
static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
Definition: vacuumlazy.c:4130
#define PARALLEL_VACUUM_KEY_SHARED
Definition: vacuumlazy.c:128
static void do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
Definition: vacuumlazy.c:2790
static IndexBulkDeleteResult * lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, double reltuples, LVRelState *vacrel)
Definition: vacuumlazy.c:3008
static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
Definition: vacuumlazy.c:891
#define PARALLEL_VACUUM_KEY_WAL_USAGE
Definition: vacuumlazy.c:132
struct LVDeadItems LVDeadItems
#define REL_TRUNCATE_FRACTION
Definition: vacuumlazy.c:77
static bool lazy_check_wraparound_failsafe(LVRelState *vacrel)
Definition: vacuumlazy.c:2577
static int dead_items_max_items(LVRelState *vacrel)
Definition: vacuumlazy.c:3441
struct LVSavedErrInfo LVSavedErrInfo
static IndexBulkDeleteResult * lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, double reltuples, bool estimated_count, LVRelState *vacrel)
Definition: vacuumlazy.c:3064
#define PREFETCH_SIZE
Definition: vacuumlazy.c:121
struct LVRelState LVRelState
static void begin_parallel_vacuum(LVRelState *vacrel, int nrequested)
Definition: vacuumlazy.c:3858
void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
Definition: vacuumlazy.c:4170
#define BYPASS_THRESHOLD_PAGES
Definition: vacuumlazy.c:94
static void dead_items_alloc(LVRelState *vacrel, int nworkers)
Definition: vacuumlazy.c:3495
#define VACUUM_TRUNCATE_LOCK_TIMEOUT
Definition: vacuumlazy.c:88
static bool lazy_vacuum_all_indexes(LVRelState *vacrel)
Definition: vacuumlazy.c:2196
static void update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel, int phase, BlockNumber blkno, OffsetNumber offnum)
Definition: vacuumlazy.c:4347
static int compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested, bool *will_parallel_vacuum)
Definition: vacuumlazy.c:3761
#define PARALLEL_VACUUM_KEY_DEAD_ITEMS
Definition: vacuumlazy.c:129
static BlockNumber count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
Definition: vacuumlazy.c:3298
#define SKIP_PAGES_THRESHOLD
Definition: vacuumlazy.c:115
static int vac_cmp_itemptr(const void *left, const void *right)
Definition: vacuumlazy.c:3600
#define FAILSAFE_EVERY_PAGES
Definition: vacuumlazy.c:99
#define GetSharedIndStats(s)
Definition: vacuumlazy.c:242
struct LVShared LVShared
#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL
Definition: vacuumlazy.c:86
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
Definition: vacuumlazy.c:2511
static int elevel
Definition: vacuumlazy.c:376
static void lazy_vacuum_heap_rel(LVRelState *vacrel)
Definition: vacuumlazy.c:2293
#define VACUUM_FSM_EVERY_PAGES
Definition: vacuumlazy.c:108
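
The reaping callback and comparator listed above (lazy_tid_reaped, vac_cmp_itemptr) depend on the dead-item array being kept sorted in heap order, so that each membership probe can be a binary search keyed on block number first and line-pointer offset second. The following is a minimal, self-contained sketch of that ordering in plain C; the simple_tid type and the tid_is_dead helper are simplified stand-ins, not PostgreSQL's ItemPointerData or the functions defined in this file.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for an item pointer: heap block plus line-pointer offset. */
typedef struct simple_tid
{
    unsigned int   block;    /* heap block number */
    unsigned short offset;   /* line-pointer offset within the block */
} simple_tid;

/* Order TIDs by block number first, then by offset within the block. */
static int
simple_tid_cmp(const void *left, const void *right)
{
    const simple_tid *l = (const simple_tid *) left;
    const simple_tid *r = (const simple_tid *) right;

    if (l->block != r->block)
        return (l->block < r->block) ? -1 : 1;
    if (l->offset != r->offset)
        return (l->offset < r->offset) ? -1 : 1;
    return 0;
}

/* Membership test over a presorted dead-TID array, analogous in shape to lazy_tid_reaped. */
static bool
tid_is_dead(const simple_tid *tid, const simple_tid *dead, size_t ndead)
{
    return bsearch(tid, dead, ndead, sizeof(simple_tid), simple_tid_cmp) != NULL;
}

int
main(void)
{
    /* The dead-TID array must already be sorted in (block, offset) order. */
    simple_tid dead[] = {{1, 3}, {1, 7}, {4, 2}};
    simple_tid probe = {1, 7};

    printf("dead? %d\n", tid_is_dead(&probe, dead, 3));
    return 0;
}

During an index bulk-delete pass the callback is invoked once per index tuple, which is why a presorted array plus binary search is preferable to re-sorting or linearly scanning on every probe.
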
bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf, uint8 flags)
void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, uint8 flags)
uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
#define VM_ALL_VISIBLE(r, b, v)
Definition: visibilitymap.h:24
#define VM_ALL_FROZEN(r, b, v)
Definition: visibilitymap.h:26
#define VISIBILITYMAP_VALID_BITS
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_ALL_VISIBLE
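
The visibility map routines above are what the heap pass consults when deciding whether a page can be skipped or recorded as all-frozen. The fragment below is only a sketch of how a caller might pin the relevant map page and test both status bits; vm_page_status_example is a hypothetical helper, not a function from this file, and real callers normally keep the map-page pin across many heap blocks rather than releasing it immediately.

#include "postgres.h"

#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "utils/relcache.h"

/*
 * Sketch: report whether a heap block is marked all-visible and/or all-frozen
 * in the visibility map.  The map buffer is pinned on demand and released
 * before returning.
 */
static void
vm_page_status_example(Relation rel, BlockNumber blkno)
{
    Buffer      vmbuffer = InvalidBuffer;
    uint8       status;

    visibilitymap_pin(rel, blkno, &vmbuffer);
    status = visibilitymap_get_status(rel, blkno, &vmbuffer);

    if (status & VISIBILITYMAP_ALL_VISIBLE)
        elog(DEBUG1, "block %u is all-visible", blkno);
    if (status & VISIBILITYMAP_ALL_FROZEN)
        elog(DEBUG1, "block %u is all-frozen", blkno);

    if (BufferIsValid(vmbuffer))
        ReleaseBuffer(vmbuffer);
}
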
@ WAIT_EVENT_VACUUM_TRUNCATE
Definition: wait_event.h:148
void ExitParallelMode(void)
Definition: xact.c:1044
void EnterParallelMode(void)
Definition: xact.c:1031
bool IsInParallelMode(void)
Definition: xact.c:1064
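
EnterParallelMode and ExitParallelMode bracket any code that may launch parallel workers, and IsInParallelMode lets callers assert which state they are in. Under that assumption the shape looks roughly like the sketch below; the middle is deliberately elided, since creating the parallel context, launching the workers, and waiting for them is where the actual parallel index processing happens.

#include "postgres.h"

#include "access/xact.h"

/*
 * Sketch of the bracketing only: real code creates a ParallelContext,
 * launches workers, and waits for them between these two calls.
 */
static void
parallel_bracket_example(void)
{
    Assert(!IsInParallelMode());

    EnterParallelMode();

    /* ... set up shared state, launch workers, wait for completion ... */

    ExitParallelMode();
}
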
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:429
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1144
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:375
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:229
void XLogBeginInsert(void)
Definition: xloginsert.c:136
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:337
#define REGBUF_STANDARD
Definition: xloginsert.h:34
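
The xloginsert.c entries above make up the generic WAL record-construction API: the page change happens inside a critical section, the modified buffer is registered with the record, and the LSN returned by XLogInsert is stamped onto the page. The sketch below shows that usual shape for a hypothetical page update; wal_logged_change_example, its payload argument, and the caller-supplied rmgr id and info byte are placeholders rather than any record this file actually emits, and the requirement that the caller already hold an exclusive lock on the buffer is left implicit.

#include "postgres.h"

#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Sketch of the usual WAL-logged page update pattern.  The payload, rmgr id,
 * and info byte are hypothetical placeholders supplied by the caller.
 */
static void
wal_logged_change_example(Relation rel, Buffer buf, char *payload, int payload_len,
                          RmgrId rmid, uint8 info)
{
    Page        page = BufferGetPage(buf);

    START_CRIT_SECTION();

    /* ... modify the page contents here ... */

    MarkBufferDirty(buf);

    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();
        XLogRegisterData(payload, payload_len);
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);

        recptr = XLogInsert(rmid, info);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}
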