PostgreSQL Source Code  git master
pruneheap.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * pruneheap.c
4  * heap page pruning and HOT-chain management code
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/pruneheap.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/heapam.h"
18 #include "access/heapam_xlog.h"
19 #include "access/htup_details.h"
20 #include "access/transam.h"
21 #include "access/xlog.h"
22 #include "catalog/catalog.h"
23 #include "miscadmin.h"
24 #include "pgstat.h"
25 #include "storage/bufmgr.h"
26 #include "utils/snapmgr.h"
27 #include "utils/rel.h"
28 #include "utils/snapmgr.h"
29 
30 /* Working data for heap_page_prune and subroutines */
31 typedef struct
32 {
34 
35  /* tuple visibility test, initialized for the relation */
37 
38  /*
39  * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
40  * been computed (done on demand, and only if
41  * OldSnapshotThresholdActive()). The first time a tuple is about to be
42  * removed based on the limited horizon, old_snap_used is set to true, and
43  * SetOldSnapshotThresholdTimestamp() is called. See
44  * heap_prune_satisfies_vacuum().
45  */
49 
50  TransactionId new_prune_xid; /* new prune hint value for page */
51  TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
52  int nredirected; /* numbers of entries in arrays below */
53  int ndead;
54  int nunused;
55  /* arrays that accumulate indexes of items to be changed */
59  /* marked[i] is true if item i is entered in one of the above arrays */
60  bool marked[MaxHeapTuplesPerPage + 1];
61 } PruneState;
62 
63 /* Local functions */
64 static int heap_prune_chain(Buffer buffer,
65  OffsetNumber rootoffnum,
66  PruneState *prstate);
67 static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
68 static void heap_prune_record_redirect(PruneState *prstate,
69  OffsetNumber offnum, OffsetNumber rdoffnum);
70 static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
71 static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
72 
73 
74 /*
75  * Optionally prune and repair fragmentation in the specified page.
76  *
77  * This is an opportunistic function. It will perform housekeeping
78  * only if the page heuristically looks like a candidate for pruning and we
79  * can acquire buffer cleanup lock without blocking.
80  *
81  * Note: this is called quite often. It's important that it fall out quickly
82  * if there's not any use in pruning.
83  *
84  * Caller must have pin on the buffer, and must *not* have a lock on it.
85  */
86 void
88 {
89  Page page = BufferGetPage(buffer);
90  TransactionId prune_xid;
91  GlobalVisState *vistest;
92  TransactionId limited_xmin = InvalidTransactionId;
93  TimestampTz limited_ts = 0;
94  Size minfree;
95 
96  /*
97  * We can't write WAL in recovery mode, so there's no point trying to
98  * clean the page. The primary will likely issue a cleaning WAL record soon
99  * anyway, so this is no particular loss.
100  */
101  if (RecoveryInProgress())
102  return;
103 
104  /*
105  * XXX: Magic to keep old_snapshot_threshold tests appear "working". They
106  * currently are broken, and discussion of what to do about them is
107  * ongoing. See
108  * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
109  */
110  if (old_snapshot_threshold == 0)
112 
113  /*
114  * First check whether there's any chance there's something to prune,
115  * determining the appropriate horizon is a waste if there's no prune_xid
116  * (i.e. no updates/deletes left potentially dead tuples around).
117  */
118  prune_xid = ((PageHeader) page)->pd_prune_xid;
119  if (!TransactionIdIsValid(prune_xid))
120  return;
121 
122  /*
123  * Check whether prune_xid indicates that there may be dead rows that can
124  * be cleaned up.
125  *
126  * It is OK to check the old snapshot limit before acquiring the cleanup
127  * lock because the worst that can happen is that we are not quite as
128  * aggressive about the cleanup (by however many transaction IDs are
129  * consumed between this point and acquiring the lock). This allows us to
130  * save significant overhead in the case where the page is found not to be
131  * prunable.
132  *
133  * Even if old_snapshot_threshold is set, we first check whether the page
134  * can be pruned without. Both because
135  * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
136  * unnecessarily relying on old_snapshot_threshold avoids causing
137  * conflicts.
138  */
139  vistest = GlobalVisTestFor(relation);
140 
141  if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
142  {
144  return;
145 
147  relation,
148  &limited_xmin, &limited_ts))
149  return;
150 
151  if (!TransactionIdPrecedes(prune_xid, limited_xmin))
152  return;
153  }
154 
155  /*
156  * We prune when a previous UPDATE failed to find enough space on the page
157  * for a new tuple version, or when free space falls below the relation's
158  * fill-factor target (but not less than 10%).
159  *
160  * Checking free space here is questionable since we aren't holding any
161  * lock on the buffer; in the worst case we could get a bogus answer. It's
162  * unlikely to be *seriously* wrong, though, since reading either pd_lower
163  * or pd_upper is probably atomic. Avoiding taking a lock seems more
164  * important than sometimes getting a wrong answer in what is after all
165  * just a heuristic estimate.
166  */
167  minfree = RelationGetTargetPageFreeSpace(relation,
169  minfree = Max(minfree, BLCKSZ / 10);
170 
171  if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
172  {
173  /* OK, try to get exclusive buffer lock */
174  if (!ConditionalLockBufferForCleanup(buffer))
175  return;
176 
177  /*
178  * Now that we have buffer lock, get accurate information about the
179  * page's free space, and recheck the heuristic about whether to
180  * prune. (We needn't recheck PageIsPrunable, since no one else could
181  * have pruned while we hold pin.)
182  */
183  if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
184  {
185  TransactionId ignore = InvalidTransactionId; /* return value not
186  * needed */
187 
188  /* OK to prune */
189  (void) heap_page_prune(relation, buffer, vistest,
190  limited_xmin, limited_ts,
191  true, &ignore, NULL);
192  }
193 
194  /* And release buffer lock */
196  }
197 }
198 
199 
200 /*
201  * Prune and repair fragmentation in the specified page.
202  *
203  * Caller must have pin and buffer cleanup lock on the page.
204  *
205  * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
206  * (see heap_prune_satisfies_vacuum and
207  * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to
208  * either have been set by TransactionIdLimitedForOldSnapshots, or
209  * InvalidTransactionId/0 respectively.
210  *
211  * If report_stats is true then we send the number of reclaimed heap-only
212  * tuples to pgstats. (This must be false during vacuum, since vacuum will
213  * send its own new total to pgstats, and we don't want this delta applied
214  * on top of that.)
215  *
216  * off_loc is the offset location required by the caller to use in error
217  * callback.
218  *
219  * Returns the number of tuples deleted from the page and sets
220  * latestRemovedXid.
221  */
222 int
223 heap_page_prune(Relation relation, Buffer buffer,
224  GlobalVisState *vistest,
225  TransactionId old_snap_xmin,
226  TimestampTz old_snap_ts,
227  bool report_stats, TransactionId *latestRemovedXid,
228  OffsetNumber *off_loc)
229 {
230  int ndeleted = 0;
231  Page page = BufferGetPage(buffer);
232  OffsetNumber offnum,
233  maxoff;
234  PruneState prstate;
235 
236  /*
237  * Our strategy is to scan the page and make lists of items to change,
238  * then apply the changes within a critical section. This keeps as much
239  * logic as possible out of the critical section, and also ensures that
240  * WAL replay will work the same as the normal case.
241  *
242  * First, initialize the new pd_prune_xid value to zero (indicating no
243  * prunable tuples). If we find any tuples which may soon become
244  * prunable, we will save the lowest relevant XID in new_prune_xid. Also
245  * initialize the rest of our working state.
246  */
248  prstate.rel = relation;
249  prstate.vistest = vistest;
250  prstate.old_snap_xmin = old_snap_xmin;
251  prstate.old_snap_ts = old_snap_ts;
252  prstate.old_snap_used = false;
253  prstate.latestRemovedXid = *latestRemovedXid;
254  prstate.nredirected = prstate.ndead = prstate.nunused = 0;
255  memset(prstate.marked, 0, sizeof(prstate.marked));
256 
257  /* Scan the page */
258  maxoff = PageGetMaxOffsetNumber(page);
259  for (offnum = FirstOffsetNumber;
260  offnum <= maxoff;
261  offnum = OffsetNumberNext(offnum))
262  {
263  ItemId itemid;
264 
265  /* Ignore items already processed as part of an earlier chain */
266  if (prstate.marked[offnum])
267  continue;
268 
269  /*
270  * Set the offset number so that we can display it along with any
271  * error that occurred while processing this tuple.
272  */
273  if (off_loc)
274  *off_loc = offnum;
275 
276  /* Nothing to do if slot is empty or already dead */
277  itemid = PageGetItemId(page, offnum);
278  if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
279  continue;
280 
281  /* Process this item or chain of items */
282  ndeleted += heap_prune_chain(buffer, offnum, &prstate);
283  }
284 
285  /* Clear the offset information once we have processed the given page. */
286  if (off_loc)
287  *off_loc = InvalidOffsetNumber;
288 
289  /* Any error while applying the changes is critical */
291 
292  /* Have we found any prunable items? */
293  if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
294  {
295  /*
296  * Apply the planned item changes, then repair page fragmentation, and
297  * update the page's hint bit about whether it has free line pointers.
298  */
300  prstate.redirected, prstate.nredirected,
301  prstate.nowdead, prstate.ndead,
302  prstate.nowunused, prstate.nunused);
303 
304  /*
305  * Update the page's pd_prune_xid field to either zero, or the lowest
306  * XID of any soon-prunable tuple.
307  */
308  ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
309 
310  /*
311  * Also clear the "page is full" flag, since there's no point in
312  * repeating the prune/defrag process until something else happens to
313  * the page.
314  */
315  PageClearFull(page);
316 
317  MarkBufferDirty(buffer);
318 
319  /*
320  * Emit a WAL XLOG_HEAP2_CLEAN record showing what we did
321  */
322  if (RelationNeedsWAL(relation))
323  {
324  XLogRecPtr recptr;
325 
326  recptr = log_heap_clean(relation, buffer,
327  prstate.redirected, prstate.nredirected,
328  prstate.nowdead, prstate.ndead,
329  prstate.nowunused, prstate.nunused,
330  prstate.latestRemovedXid);
331 
332  PageSetLSN(BufferGetPage(buffer), recptr);
333  }
334  }
335  else
336  {
337  /*
338  * If we didn't prune anything, but have found a new value for the
339  * pd_prune_xid field, update it and mark the buffer dirty. This is
340  * treated as a non-WAL-logged hint.
341  *
342  * Also clear the "page is full" flag if it is set, since there's no
343  * point in repeating the prune/defrag process until something else
344  * happens to the page.
345  */
346  if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
347  PageIsFull(page))
348  {
349  ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
350  PageClearFull(page);
351  MarkBufferDirtyHint(buffer, true);
352  }
353  }
354 
356 
357  /*
358  * If requested, report the number of tuples reclaimed to pgstats. This is
359  * ndeleted minus ndead, because we don't want to count a now-DEAD root
360  * item as a deletion for this purpose.
361  */
362  if (report_stats && ndeleted > prstate.ndead)
363  pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);
364 
365  *latestRemovedXid = prstate.latestRemovedXid;
366 
367  /*
368  * XXX Should we update the FSM information of this page ?
369  *
370  * There are two schools of thought here. We may not want to update FSM
371  * information so that the page is not used for unrelated UPDATEs/INSERTs
372  * and any free space in this page will remain available for further
373  * UPDATEs in *this* page, thus improving chances for doing HOT updates.
374  *
375  * But for a large table and where a page does not receive further UPDATEs
376  * for a long time, we might waste this space by not updating the FSM
377  * information. The relation may get extended and fragmented further.
378  *
379  * One possibility is to leave "fillfactor" worth of space in this page
380  * and update FSM with the remaining space.
381  */
382 
383  return ndeleted;
384 }
385 
386 
387 /*
388  * Perform visiblity checks for heap pruning.
389  *
390  * This is more complicated than just using GlobalVisTestIsRemovableXid()
391  * because of old_snapshot_threshold. We only want to increase the threshold
392  * that triggers errors for old snapshots when we actually decide to remove a
393  * row based on the limited horizon.
394  *
395  * Due to its cost we also only want to call
396  * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
397  * done so in heap_hot_prune_opt() if pd_prune_xid was old enough. But we
398  * still want to be able to remove rows that are too new to be removed
399  * according to prstate->vistest, but that can be removed based on
400  * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
401  * demand in here, if appropriate.
402  */
403 static HTSV_Result
405 {
406  HTSV_Result res;
407  TransactionId dead_after;
408 
409  res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
410 
411  if (res != HEAPTUPLE_RECENTLY_DEAD)
412  return res;
413 
414  /*
415  * If we are already relying on the limited xmin, there is no need to
416  * delay doing so anymore.
417  */
418  if (prstate->old_snap_used)
419  {
421 
422  if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
423  res = HEAPTUPLE_DEAD;
424  return res;
425  }
426 
427  /*
428  * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
429  * row dead. If not, and old_snapshot_threshold is enabled, try to use the
430  * lowered horizon.
431  */
432  if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
433  res = HEAPTUPLE_DEAD;
434  else if (OldSnapshotThresholdActive())
435  {
436  /* haven't determined limited horizon yet, requests */
437  if (!TransactionIdIsValid(prstate->old_snap_xmin))
438  {
439  TransactionId horizon =
441 
442  TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
443  &prstate->old_snap_xmin,
444  &prstate->old_snap_ts);
445  }
446 
447  if (TransactionIdIsValid(prstate->old_snap_xmin) &&
448  TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
449  {
450  /*
451  * About to remove row based on snapshot_too_old. Need to raise
452  * the threshold so problematic accesses would error.
453  */
454  Assert(!prstate->old_snap_used);
456  prstate->old_snap_xmin);
457  prstate->old_snap_used = true;
458  res = HEAPTUPLE_DEAD;
459  }
460  }
461 
462  return res;
463 }
464 
465 
466 /*
467  * Prune specified line pointer or a HOT chain originating at line pointer.
468  *
469  * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
470  * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
471  * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
472  * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
473  * DEAD, the OldestXmin test is just too coarse to detect it.
474  *
475  * The root line pointer is redirected to the tuple immediately after the
476  * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
477  * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
478  * tuple, which we treat as a chain of length 1.)
479  *
480  * OldestXmin is the cutoff XID used to identify dead tuples.
481  *
482  * We don't actually change the page here, except perhaps for hint-bit updates
483  * caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in
484  * prstate showing the changes to be made. Items to be redirected are added
485  * to the redirected[] array (two entries per redirection); items to be set to
486  * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
487  * state are added to nowunused[].
488  *
489  * Returns the number of tuples (to be) deleted from the page.
490  */
491 static int
492 heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
493 {
494  int ndeleted = 0;
495  Page dp = (Page) BufferGetPage(buffer);
497  ItemId rootlp;
498  HeapTupleHeader htup;
499  OffsetNumber latestdead = InvalidOffsetNumber,
500  maxoff = PageGetMaxOffsetNumber(dp),
501  offnum;
503  int nchain = 0,
504  i;
505  HeapTupleData tup;
506 
507  tup.t_tableOid = RelationGetRelid(prstate->rel);
508 
509  rootlp = PageGetItemId(dp, rootoffnum);
510 
511  /*
512  * If it's a heap-only tuple, then it is not the start of a HOT chain.
513  */
514  if (ItemIdIsNormal(rootlp))
515  {
516  htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
517 
518  tup.t_data = htup;
519  tup.t_len = ItemIdGetLength(rootlp);
520  ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);
521 
522  if (HeapTupleHeaderIsHeapOnly(htup))
523  {
524  /*
525  * If the tuple is DEAD and doesn't chain to anything else, mark
526  * it unused immediately. (If it does chain, we can only remove
527  * it as part of pruning its chain.)
528  *
529  * We need this primarily to handle aborted HOT updates, that is,
530  * XMIN_INVALID heap-only tuples. Those might not be linked to by
531  * any chain, since the parent tuple might be re-updated before
532  * any pruning occurs. So we have to be able to reap them
533  * separately from chain-pruning. (Note that
534  * HeapTupleHeaderIsHotUpdated will never return true for an
535  * XMIN_INVALID tuple, so this code will work even when there were
536  * sequential updates within the aborted transaction.)
537  *
538  * Note that we might first arrive at a dead heap-only tuple
539  * either here or while following a chain below. Whichever path
540  * gets there first will mark the tuple unused.
541  */
542  if (heap_prune_satisfies_vacuum(prstate, &tup, buffer)
544  {
545  heap_prune_record_unused(prstate, rootoffnum);
547  &prstate->latestRemovedXid);
548  ndeleted++;
549  }
550 
551  /* Nothing more to do */
552  return ndeleted;
553  }
554  }
555 
556  /* Start from the root tuple */
557  offnum = rootoffnum;
558 
559  /* while not end of the chain */
560  for (;;)
561  {
562  ItemId lp;
563  bool tupdead,
564  recent_dead;
565 
566  /* Some sanity checks */
567  if (offnum < FirstOffsetNumber || offnum > maxoff)
568  break;
569 
570  /* If item is already processed, stop --- it must not be same chain */
571  if (prstate->marked[offnum])
572  break;
573 
574  lp = PageGetItemId(dp, offnum);
575 
576  /* Unused item obviously isn't part of the chain */
577  if (!ItemIdIsUsed(lp))
578  break;
579 
580  /*
581  * If we are looking at the redirected root line pointer, jump to the
582  * first normal tuple in the chain. If we find a redirect somewhere
583  * else, stop --- it must not be same chain.
584  */
585  if (ItemIdIsRedirected(lp))
586  {
587  if (nchain > 0)
588  break; /* not at start of chain */
589  chainitems[nchain++] = offnum;
590  offnum = ItemIdGetRedirect(rootlp);
591  continue;
592  }
593 
594  /*
595  * Likewise, a dead line pointer can't be part of the chain. (We
596  * already eliminated the case of dead root tuple outside this
597  * function.)
598  */
599  if (ItemIdIsDead(lp))
600  break;
601 
602  Assert(ItemIdIsNormal(lp));
603  htup = (HeapTupleHeader) PageGetItem(dp, lp);
604 
605  tup.t_data = htup;
606  tup.t_len = ItemIdGetLength(lp);
607  ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);
608 
609  /*
610  * Check the tuple XMIN against prior XMAX, if any
611  */
612  if (TransactionIdIsValid(priorXmax) &&
613  !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
614  break;
615 
616  /*
617  * OK, this tuple is indeed a member of the chain.
618  */
619  chainitems[nchain++] = offnum;
620 
621  /*
622  * Check tuple's visibility status.
623  */
624  tupdead = recent_dead = false;
625 
626  switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer))
627  {
628  case HEAPTUPLE_DEAD:
629  tupdead = true;
630  break;
631 
633  recent_dead = true;
634 
635  /*
636  * This tuple may soon become DEAD. Update the hint field so
637  * that the page is reconsidered for pruning in future.
638  */
641  break;
642 
644 
645  /*
646  * This tuple may soon become DEAD. Update the hint field so
647  * that the page is reconsidered for pruning in future.
648  */
651  break;
652 
653  case HEAPTUPLE_LIVE:
655 
656  /*
657  * If we wanted to optimize for aborts, we might consider
658  * marking the page prunable when we see INSERT_IN_PROGRESS.
659  * But we don't. See related decisions about when to mark the
660  * page prunable in heapam.c.
661  */
662  break;
663 
664  default:
665  elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
666  break;
667  }
668 
669  /*
670  * Remember the last DEAD tuple seen. We will advance past
671  * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
672  * but we can't advance past anything else. (XXX is it really worth
673  * continuing to scan beyond RECENTLY_DEAD? The case where we will
674  * find another DEAD tuple is a fairly unusual corner case.)
675  */
676  if (tupdead)
677  {
678  latestdead = offnum;
680  &prstate->latestRemovedXid);
681  }
682  else if (!recent_dead)
683  break;
684 
685  /*
686  * If the tuple is not HOT-updated, then we are at the end of this
687  * HOT-update chain.
688  */
689  if (!HeapTupleHeaderIsHotUpdated(htup))
690  break;
691 
692  /* HOT implies it can't have moved to different partition */
694 
695  /*
696  * Advance to next chain member.
697  */
699  BufferGetBlockNumber(buffer));
700  offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
701  priorXmax = HeapTupleHeaderGetUpdateXid(htup);
702  }
703 
704  /*
705  * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
706  * the DEAD tuples at the start of the chain are removed and the root line
707  * pointer is appropriately redirected.
708  */
709  if (OffsetNumberIsValid(latestdead))
710  {
711  /*
712  * Mark as unused each intermediate item that we are able to remove
713  * from the chain.
714  *
715  * When the previous item is the last dead tuple seen, we are at the
716  * right candidate for redirection.
717  */
718  for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
719  {
720  heap_prune_record_unused(prstate, chainitems[i]);
721  ndeleted++;
722  }
723 
724  /*
725  * If the root entry had been a normal tuple, we are deleting it, so
726  * count it in the result. But changing a redirect (even to DEAD
727  * state) doesn't count.
728  */
729  if (ItemIdIsNormal(rootlp))
730  ndeleted++;
731 
732  /*
733  * If the DEAD tuple is at the end of the chain, the entire chain is
734  * dead and the root line pointer can be marked dead. Otherwise just
735  * redirect the root to the correct chain member.
736  */
737  if (i >= nchain)
738  heap_prune_record_dead(prstate, rootoffnum);
739  else
740  heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
741  }
742  else if (nchain < 2 && ItemIdIsRedirected(rootlp))
743  {
744  /*
745  * We found a redirect item that doesn't point to a valid follow-on
746  * item. This can happen if the loop in heap_page_prune caused us to
747  * visit the dead successor of a redirect item before visiting the
748  * redirect item. We can clean up by setting the redirect item to
749  * DEAD state.
750  */
751  heap_prune_record_dead(prstate, rootoffnum);
752  }
753 
754  return ndeleted;
755 }
756 
757 /* Record lowest soon-prunable XID */
758 static void
760 {
761  /*
762  * This should exactly match the PageSetPrunable macro. We can't store
763  * directly into the page header yet, so we update working state.
764  */
766  if (!TransactionIdIsValid(prstate->new_prune_xid) ||
767  TransactionIdPrecedes(xid, prstate->new_prune_xid))
768  prstate->new_prune_xid = xid;
769 }
770 
771 /* Record line pointer to be redirected */
772 static void
774  OffsetNumber offnum, OffsetNumber rdoffnum)
775 {
777  prstate->redirected[prstate->nredirected * 2] = offnum;
778  prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
779  prstate->nredirected++;
780  Assert(!prstate->marked[offnum]);
781  prstate->marked[offnum] = true;
782  Assert(!prstate->marked[rdoffnum]);
783  prstate->marked[rdoffnum] = true;
784 }
785 
786 /* Record line pointer to be marked dead */
787 static void
789 {
790  Assert(prstate->ndead < MaxHeapTuplesPerPage);
791  prstate->nowdead[prstate->ndead] = offnum;
792  prstate->ndead++;
793  Assert(!prstate->marked[offnum]);
794  prstate->marked[offnum] = true;
795 }
796 
797 /* Record line pointer to be marked unused */
798 static void
800 {
801  Assert(prstate->nunused < MaxHeapTuplesPerPage);
802  prstate->nowunused[prstate->nunused] = offnum;
803  prstate->nunused++;
804  Assert(!prstate->marked[offnum]);
805  prstate->marked[offnum] = true;
806 }
807 
808 
809 /*
810  * Perform the actual page changes needed by heap_page_prune.
811  * It is expected that the caller has suitable pin and lock on the
812  * buffer, and is inside a critical section.
813  *
814  * This is split out because it is also used by heap_xlog_clean()
815  * to replay the WAL record when needed after a crash. Note that the
816  * arguments are identical to those of log_heap_clean().
817  */
818 void
820  OffsetNumber *redirected, int nredirected,
821  OffsetNumber *nowdead, int ndead,
822  OffsetNumber *nowunused, int nunused)
823 {
824  Page page = (Page) BufferGetPage(buffer);
825  OffsetNumber *offnum;
826  int i;
827 
828  /* Update all redirected line pointers */
829  offnum = redirected;
830  for (i = 0; i < nredirected; i++)
831  {
832  OffsetNumber fromoff = *offnum++;
833  OffsetNumber tooff = *offnum++;
834  ItemId fromlp = PageGetItemId(page, fromoff);
835 
836  ItemIdSetRedirect(fromlp, tooff);
837  }
838 
839  /* Update all now-dead line pointers */
840  offnum = nowdead;
841  for (i = 0; i < ndead; i++)
842  {
843  OffsetNumber off = *offnum++;
844  ItemId lp = PageGetItemId(page, off);
845 
846  ItemIdSetDead(lp);
847  }
848 
849  /* Update all now-unused line pointers */
850  offnum = nowunused;
851  for (i = 0; i < nunused; i++)
852  {
853  OffsetNumber off = *offnum++;
854  ItemId lp = PageGetItemId(page, off);
855 
856  ItemIdSetUnused(lp);
857  }
858 
859  /*
860  * Finally, repair any fragmentation, and update the page's hint bit about
861  * whether it has free pointers.
862  */
864 }
865 
866 
867 /*
868  * For all items in this page, find their respective root line pointers.
869  * If item k is part of a HOT-chain with root at item j, then we set
870  * root_offsets[k - 1] = j.
871  *
872  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
873  * Unused entries are filled with InvalidOffsetNumber (zero).
874  *
875  * The function must be called with at least share lock on the buffer, to
876  * prevent concurrent prune operations.
877  *
878  * Note: The information collected here is valid only as long as the caller
879  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
880  * and reused by a completely unrelated tuple.
881  */
882 void
884 {
885  OffsetNumber offnum,
886  maxoff;
887 
888  MemSet(root_offsets, InvalidOffsetNumber,
890 
891  maxoff = PageGetMaxOffsetNumber(page);
892  for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
893  {
894  ItemId lp = PageGetItemId(page, offnum);
895  HeapTupleHeader htup;
896  OffsetNumber nextoffnum;
897  TransactionId priorXmax;
898 
899  /* skip unused and dead items */
900  if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
901  continue;
902 
903  if (ItemIdIsNormal(lp))
904  {
905  htup = (HeapTupleHeader) PageGetItem(page, lp);
906 
907  /*
908  * Check if this tuple is part of a HOT-chain rooted at some other
909  * tuple. If so, skip it for now; we'll process it when we find
910  * its root.
911  */
912  if (HeapTupleHeaderIsHeapOnly(htup))
913  continue;
914 
915  /*
916  * This is either a plain tuple or the root of a HOT-chain.
917  * Remember it in the mapping.
918  */
919  root_offsets[offnum - 1] = offnum;
920 
921  /* If it's not the start of a HOT-chain, we're done with it */
922  if (!HeapTupleHeaderIsHotUpdated(htup))
923  continue;
924 
925  /* Set up to scan the HOT-chain */
926  nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
927  priorXmax = HeapTupleHeaderGetUpdateXid(htup);
928  }
929  else
930  {
931  /* Must be a redirect item. We do not set its root_offsets entry */
933  /* Set up to scan the HOT-chain */
934  nextoffnum = ItemIdGetRedirect(lp);
935  priorXmax = InvalidTransactionId;
936  }
937 
938  /*
939  * Now follow the HOT-chain and collect other tuples in the chain.
940  *
941  * Note: Even though this is a nested loop, the complexity of the
942  * function is O(N) because a tuple in the page should be visited not
943  * more than twice, once in the outer loop and once in HOT-chain
944  * chases.
945  */
946  for (;;)
947  {
948  lp = PageGetItemId(page, nextoffnum);
949 
950  /* Check for broken chains */
951  if (!ItemIdIsNormal(lp))
952  break;
953 
954  htup = (HeapTupleHeader) PageGetItem(page, lp);
955 
956  if (TransactionIdIsValid(priorXmax) &&
957  !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
958  break;
959 
960  /* Remember the root line pointer for this item */
961  root_offsets[nextoffnum - 1] = offnum;
962 
963  /* Advance to next chain member, if any */
964  if (!HeapTupleHeaderIsHotUpdated(htup))
965  break;
966 
967  /* HOT implies it can't have moved to different partition */
969 
970  nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
971  priorXmax = HeapTupleHeaderGetUpdateXid(htup);
972  }
973  }
974 }
#define HeapTupleHeaderGetUpdateXid(tup)
Definition: htup_details.h:365
void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid)
Definition: heapam.c:6903
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:96
int heap_page_prune(Relation relation, Buffer buffer, GlobalVisState *vistest, TransactionId old_snap_xmin, TimestampTz old_snap_ts, bool report_stats, TransactionId *latestRemovedXid, OffsetNumber *off_loc)
Definition: pruneheap.c:223
int nredirected
Definition: pruneheap.c:52
static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
Definition: pruneheap.c:492
#define ItemIdIsRedirected(itemId)
Definition: itemid.h:106
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
void pgstat_update_heap_dead_tuples(Relation rel, int delta)
Definition: pgstat.c:2241
uint32 TransactionId
Definition: c.h:521
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3583
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1471
int64 TimestampTz
Definition: timestamp.h:39
HeapTupleHeaderData * HeapTupleHeader
Definition: htup.h:23
bool old_snap_used
Definition: pruneheap.c:48
static bool OldSnapshotThresholdActive(void)
Definition: snapmgr.h:102
#define ItemIdGetRedirect(itemId)
Definition: itemid.h:78
Relation rel
Definition: pruneheap.c:33
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define ItemIdIsUsed(itemId)
Definition: itemid.h:92
#define MaxHeapTuplesPerPage
Definition: htup_details.h:574
TimestampTz old_snap_ts
Definition: pruneheap.c:46
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
Definition: pruneheap.c:788
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
#define MemSet(start, val, len)
Definition: c.h:950
#define HeapTupleHeaderIndicatesMovedPartitions(tup)
Definition: htup_details.h:445
OffsetNumber nowdead[MaxHeapTuplesPerPage]
Definition: pruneheap.c:57
bool TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, Relation relation, TransactionId *limit_xid, TimestampTz *limit_ts)
Definition: snapmgr.c:1751
bool RecoveryInProgress(void)
Definition: xlog.c:8076
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
bool marked[MaxHeapTuplesPerPage+1]
Definition: pruneheap.c:60
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
TransactionId new_prune_xid
Definition: pruneheap.c:50
bool GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid)
Definition: procarray.c:4086
void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit)
Definition: snapmgr.c:1672
#define PageIsFull(page)
Definition: bufpage.h:378
int nunused
Definition: pruneheap.c:54
uint16 OffsetNumber
Definition: off.h:24
HeapTupleHeader t_data
Definition: htup.h:68
#define HeapTupleHeaderIsHeapOnly(tup)
Definition: htup_details.h:501
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition: procarray.c:3918
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:3946
#define ERROR
Definition: elog.h:43
Size PageGetHeapFreeSpace(Page page)
Definition: bufpage.c:874
ItemPointerData t_ctid
Definition: htup_details.h:160
ItemPointerData t_self
Definition: htup.h:65
uint32 t_len
Definition: htup.h:64
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
Definition: pruneheap.c:759
TransactionId latestRemovedXid
Definition: pruneheap.c:51
#define ItemIdSetRedirect(itemId, link)
Definition: itemid.h:152
#define FirstOffsetNumber
Definition: off.h:27
void SnapshotTooOldMagicForTest(void)
Definition: snapmgr.c:1689
#define InvalidTransactionId
Definition: transam.h:31
Oid t_tableOid
Definition: htup.h:66
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
void heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
Definition: pruneheap.c:883
HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after)
#define PageClearFull(page)
Definition: bufpage.h:382
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
#define RelationGetTargetPageFreeSpace(relation, defaultff)
Definition: rel.h:340
#define HeapTupleHeaderIsHotUpdated(tup)
Definition: htup_details.h:484
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3752
TransactionId old_snap_xmin
Definition: pruneheap.c:47
static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
Definition: pruneheap.c:404
#define InvalidOffsetNumber
Definition: off.h:26
GlobalVisState * vistest
Definition: pruneheap.c:36
void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused)
Definition: pruneheap.c:819
#define Max(x, y)
Definition: c.h:922
PageHeaderData * PageHeader
Definition: bufpage.h:166
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:746
static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum)
Definition: pruneheap.c:773
#define ItemIdIsNormal(itemId)
Definition: itemid.h:99
#define HeapTupleHeaderGetXmin(tup)
Definition: htup_details.h:313
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:474
OffsetNumber nowunused[MaxHeapTuplesPerPage]
Definition: pruneheap.c:58
XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid)
Definition: heapam.c:7173
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define RelationNeedsWAL(relation)
Definition: rel.h:562
OffsetNumber redirected[MaxHeapTuplesPerPage *2]
Definition: pruneheap.c:56
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2663
int ndead
Definition: pruneheap.c:53
void PageRepairFragmentation(Page page)
Definition: bufpage.c:682
#define elog(elevel,...)
Definition: elog.h:214
int old_snapshot_threshold
Definition: snapmgr.c:78
int i
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
HTSV_Result
Definition: heapam.h:87
#define ItemIdSetDead(itemId)
Definition: itemid.h:164
void heap_page_prune_opt(Relation relation, Buffer buffer)
Definition: pruneheap.c:87
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define HEAP_DEFAULT_FILLFACTOR
Definition: rel.h:311
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define ItemIdSetUnused(itemId)
Definition: itemid.h:128
TransactionId GlobalVisTestNonRemovableHorizon(GlobalVisState *state)
Definition: procarray.c:4124
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define RelationGetRelid(relation)
Definition: rel.h:456
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define ItemPointerSet(pointer, blockNumber, offNum)
Definition: itemptr.h:127
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
Definition: pruneheap.c:799