PostgreSQL Source Code  git master
pruneheap.c
/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *	  heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Working data for heap_page_prune and subroutines */
typedef struct
{
	Relation	rel;

	/* tuple visibility test, initialized for the relation */
	GlobalVisState *vistest;

	/*
	 * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
	 * been computed (done on demand, and only if
	 * OldSnapshotThresholdActive()). The first time a tuple is about to be
	 * removed based on the limited horizon, old_snap_used is set to true,
	 * and SetOldSnapshotThresholdTimestamp() is called. See
	 * heap_prune_satisfies_vacuum().
	 */
	TimestampTz old_snap_ts;
	TransactionId old_snap_xmin;
	bool		old_snap_used;

	TransactionId new_prune_xid;	/* new prune hint value for page */
	TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;
	/* arrays that accumulate indexes of items to be changed */
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];
	/* marked[i] is true if item i is entered in one of the above arrays */
	bool		marked[MaxHeapTuplesPerPage + 1];
} PruneState;
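
/*
 * Editor's sketch (hypothetical helper, not in PostgreSQL): how the
 * accumulator arrays above are laid out.  redirected[] holds pairs, so
 * redirection i occupies slots 2*i (the "from" offset) and 2*i + 1 (the
 * "to" offset), while nowdead[] and nowunused[] are plain lists.
 */
#ifdef NOT_USED
static void
debug_print_prune_plan(const PruneState *prstate)
{
	int			i;

	for (i = 0; i < prstate->nredirected; i++)
		elog(DEBUG2, "redirect %u -> %u",
			 (unsigned) prstate->redirected[i * 2],
			 (unsigned) prstate->redirected[i * 2 + 1]);
	for (i = 0; i < prstate->ndead; i++)
		elog(DEBUG2, "dead %u", (unsigned) prstate->nowdead[i]);
	for (i = 0; i < prstate->nunused; i++)
		elog(DEBUG2, "unused %u", (unsigned) prstate->nowunused[i]);
}
#endif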

/* Local functions */
static int	heap_prune_chain(Buffer buffer,
							 OffsetNumber rootoffnum,
							 PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
									   OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);


/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function.  It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often.  It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	TransactionId prune_xid;
	GlobalVisState *vistest;
	TransactionId limited_xmin = InvalidTransactionId;
	TimestampTz limited_ts = 0;
	Size		minfree;

	/*
	 * We can't write WAL in recovery mode, so there's no point trying to
	 * clean the page.  The primary will likely issue a cleaning WAL record
	 * soon anyway, so this is no particular loss.
	 */
	if (RecoveryInProgress())
		return;

	/*
	 * XXX: Magic to keep the old_snapshot_threshold tests appearing to
	 * "work".  They currently are broken, and discussion of what to do
	 * about them is ongoing.  See
	 * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
	 */
	if (old_snapshot_threshold == 0)
		SnapshotTooOldMagicForTest();

	/*
	 * First check whether there's any chance there's something to prune;
	 * determining the appropriate horizon is a waste if there's no prune_xid
	 * (i.e. no updates/deletes left potentially dead tuples around).
	 */
	prune_xid = ((PageHeader) page)->pd_prune_xid;
	if (!TransactionIdIsValid(prune_xid))
		return;

	/*
	 * Check whether prune_xid indicates that there may be dead rows that can
	 * be cleaned up.
	 *
	 * It is OK to check the old snapshot limit before acquiring the cleanup
	 * lock because the worst that can happen is that we are not quite as
	 * aggressive about the cleanup (by however many transaction IDs are
	 * consumed between this point and acquiring the lock).  This allows us
	 * to save significant overhead in the case where the page is found not
	 * to be prunable.
	 *
	 * Even if old_snapshot_threshold is set, we first check whether the page
	 * can be pruned without it, both because
	 * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
	 * relying on old_snapshot_threshold unnecessarily avoids causing
	 * conflicts.
	 */
	vistest = GlobalVisTestFor(relation);

	if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
	{
		if (!OldSnapshotThresholdActive())
			return;

		if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest),
												 relation,
												 &limited_xmin, &limited_ts))
			return;

		if (!TransactionIdPrecedes(prune_xid, limited_xmin))
			return;
	}

	/*
	 * We prune when a previous UPDATE failed to find enough space on the
	 * page for a new tuple version, or when free space falls below the
	 * relation's fill-factor target (but not less than 10%).
	 *
	 * Checking free space here is questionable since we aren't holding any
	 * lock on the buffer; in the worst case we could get a bogus answer.
	 * It's unlikely to be *seriously* wrong, though, since reading either
	 * pd_lower or pd_upper is probably atomic.  Avoiding taking a lock seems
	 * more important than sometimes getting a wrong answer in what is after
	 * all just a heuristic estimate.
	 */
	minfree = RelationGetTargetPageFreeSpace(relation,
											 HEAP_DEFAULT_FILLFACTOR);
	minfree = Max(minfree, BLCKSZ / 10);

	if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
	{
		/* OK, try to get exclusive buffer lock */
		if (!ConditionalLockBufferForCleanup(buffer))
			return;

		/*
		 * Now that we have buffer lock, get accurate information about the
		 * page's free space, and recheck the heuristic about whether to
		 * prune.  (We needn't recheck PageIsPrunable, since no one else
		 * could have pruned while we hold pin.)
		 */
		if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
		{
			/* OK to prune */
			(void) heap_page_prune(relation, buffer, vistest,
								   limited_xmin, limited_ts,
								   true, NULL);
		}

		/* And release buffer lock */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	}
}
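
/*
 * Editor's sketch (hypothetical caller, not part of this file): how a
 * sequential-scan-style reader might invoke heap_page_prune_opt().  Compare
 * heapgetpage() in heapam.c; the essential contract is pin-but-no-lock at
 * the call, with locking for reading done only afterwards.
 */
#ifdef NOT_USED
static void
read_page_with_opportunistic_prune(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	/* pinned but unlocked, as heap_page_prune_opt requires */
	heap_page_prune_opt(rel, buf);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... examine tuples ... */
	UnlockReleaseBuffer(buf);
}
#endif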


/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
 * (see heap_prune_satisfies_vacuum and HeapTupleSatisfiesVacuum).
 * old_snap_xmin / old_snap_ts need either to have been set by
 * TransactionIdLimitedForOldSnapshots, or to be InvalidTransactionId/0
 * respectively.
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats.  (This must be false during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * off_loc is the offset location that the caller wants reported in any
 * error-context callback.
 *
 * Returns the number of tuples deleted from the page during this call.
 */
int
heap_page_prune(Relation relation, Buffer buffer,
				GlobalVisState *vistest,
				TransactionId old_snap_xmin,
				TimestampTz old_snap_ts,
				bool report_stats,
				OffsetNumber *off_loc)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid.  Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.rel = relation;
	prstate.vistest = vistest;
	prstate.old_snap_xmin = old_snap_xmin;
	prstate.old_snap_ts = old_snap_ts;
	prstate.old_snap_used = false;
	prstate.latestRemovedXid = InvalidTransactionId;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	/* Scan the page */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		if (off_loc)
			*off_loc = offnum;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(buffer, offnum, &prstate);
	}

	/* Clear the offset information once we have processed the given page. */
	if (off_loc)
		*off_loc = InvalidOffsetNumber;

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation,
		 * and update the page's hint bit about whether it has free line
		 * pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			xl_heap_prune xlrec;
			XLogRecPtr	recptr;

			xlrec.latestRemovedXid = prstate.latestRemovedXid;
			xlrec.nredirected = prstate.nredirected;
			xlrec.ndead = prstate.ndead;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);

			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

			/*
			 * The OffsetNumber arrays are not actually in the buffer, but we
			 * pretend that they are.  When XLogInsert stores the whole
			 * buffer, the offset arrays need not be stored too.
			 */
			if (prstate.nredirected > 0)
				XLogRegisterBufData(0, (char *) prstate.redirected,
									prstate.nredirected *
									sizeof(OffsetNumber) * 2);

			if (prstate.ndead > 0)
				XLogRegisterBufData(0, (char *) prstate.nowdead,
									prstate.ndead * sizeof(OffsetNumber));

			if (prstate.nunused > 0)
				XLogRegisterBufData(0, (char *) prstate.nowunused,
									prstate.nunused * sizeof(OffsetNumber));

			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);

			PageSetLSN(BufferGetPage(buffer), recptr);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty.  This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			MarkBufferDirtyHint(buffer, true);
		}
	}

	END_CRIT_SECTION();

	/*
	 * If requested, report the number of tuples reclaimed to pgstats.  This
	 * is ndeleted minus ndead, because we don't want to count a now-DEAD
	 * root item as a deletion for this purpose.
	 */
	if (report_stats && ndeleted > prstate.ndead)
		pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

	/*
	 * XXX Should we update the FSM information of this page?
	 *
	 * There are two schools of thought here.  We may not want to update FSM
	 * information so that the page is not used for unrelated UPDATEs/INSERTs
	 * and any free space in this page will remain available for further
	 * UPDATEs in *this* page, thus improving chances for doing HOT updates.
	 *
	 * But for a large table and where a page does not receive further
	 * UPDATEs for a long time, we might waste this space by not updating the
	 * FSM information.  The relation may get extended and fragmented
	 * further.
	 *
	 * One possibility is to leave "fillfactor" worth of space in this page
	 * and update FSM with the remaining space.
	 */

	return ndeleted;
}
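
/*
 * Editor's sketch (hypothetical wrapper, not part of this file): a
 * VACUUM-flavored call to heap_page_prune().  Compare lazy_scan_heap() in
 * vacuumlazy.c: report_stats is false because vacuum reports its own totals
 * to pgstats, the old_snapshot limits are left unset, and off_loc feeds the
 * error-context callback.
 */
#ifdef NOT_USED
static int
prune_page_for_vacuum(Relation rel, Buffer buf, OffsetNumber *err_offnum)
{
	GlobalVisState *vistest = GlobalVisTestFor(rel);

	/* caller must already hold a buffer cleanup lock on buf */
	return heap_page_prune(rel, buf, vistest,
						   InvalidTransactionId, 0,
						   false, err_offnum);
}
#endif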


/*
 * Perform visibility checks for heap pruning.
 *
 * This is more complicated than just using GlobalVisTestIsRemovableXid()
 * because of old_snapshot_threshold.  We only want to increase the threshold
 * that triggers errors for old snapshots when we actually decide to remove a
 * row based on the limited horizon.
 *
 * Due to its cost we also only want to call
 * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
 * done so in heap_page_prune_opt() if pd_prune_xid was old enough.  But we
 * still want to be able to remove rows that are too new to be removed
 * according to prstate->vistest, but that can be removed based on
 * old_snapshot_threshold.  So we call TransactionIdLimitedForOldSnapshots()
 * on demand in here, if appropriate.
 */
static HTSV_Result
heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
{
	HTSV_Result res;
	TransactionId dead_after;

	res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);

	if (res != HEAPTUPLE_RECENTLY_DEAD)
		return res;

	/*
	 * If we are already relying on the limited xmin, there is no need to
	 * delay doing so anymore.
	 */
	if (prstate->old_snap_used)
	{
		Assert(TransactionIdIsValid(prstate->old_snap_xmin));

		if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
			res = HEAPTUPLE_DEAD;
		return res;
	}

	/*
	 * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
	 * row dead.  If not, and old_snapshot_threshold is enabled, try to use
	 * the lowered horizon.
	 */
	if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
		res = HEAPTUPLE_DEAD;
	else if (OldSnapshotThresholdActive())
	{
		/* haven't determined limited horizon yet, request it */
		if (!TransactionIdIsValid(prstate->old_snap_xmin))
		{
			TransactionId horizon =
			GlobalVisTestNonRemovableHorizon(prstate->vistest);

			TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
												&prstate->old_snap_xmin,
												&prstate->old_snap_ts);
		}

		if (TransactionIdIsValid(prstate->old_snap_xmin) &&
			TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
		{
			/*
			 * About to remove row based on snapshot_too_old.  Need to raise
			 * the threshold so problematic accesses would error.
			 */
			Assert(!prstate->old_snap_used);
			SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
											 prstate->old_snap_xmin);
			prstate->old_snap_used = true;
			res = HEAPTUPLE_DEAD;
		}
	}

	return res;
}


/*
 * Prune specified line pointer or a HOT chain originating at line pointer.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
 * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD; the OldestXmin test is just too coarse to detect it.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * OldestXmin is the cutoff XID used to identify dead tuples.
 *
 * We don't actually change the page here, except perhaps for hint-bit updates
 * caused by HeapTupleSatisfiesVacuum.  We just add entries to the arrays in
 * prstate showing the changes to be made.  Items to be redirected are added
 * to the redirected[] array (two entries per redirection); items to be set to
 * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
 * state are added to nowunused[].
 *
 * Returns the number of tuples (to be) deleted from the page.
 */
static int
heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
{
	int			ndeleted = 0;
	Page		dp = (Page) BufferGetPage(buffer);
	TransactionId priorXmax = InvalidTransactionId;
	ItemId		rootlp;
	HeapTupleHeader htup;
	OffsetNumber latestdead = InvalidOffsetNumber,
				maxoff = PageGetMaxOffsetNumber(dp),
				offnum;
	OffsetNumber chainitems[MaxHeapTuplesPerPage];
	int			nchain = 0,
				i;
	HeapTupleData tup;

	tup.t_tableOid = RelationGetRelid(prstate->rel);

	rootlp = PageGetItemId(dp, rootoffnum);

	/*
	 * If it's a heap-only tuple, then it is not the start of a HOT chain.
	 */
	if (ItemIdIsNormal(rootlp))
	{
		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);

		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(rootlp);
		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);

		if (HeapTupleHeaderIsHeapOnly(htup))
		{
			/*
			 * If the tuple is DEAD and doesn't chain to anything else, mark
			 * it unused immediately.  (If it does chain, we can only remove
			 * it as part of pruning its chain.)
			 *
			 * We need this primarily to handle aborted HOT updates, that is,
			 * XMIN_INVALID heap-only tuples.  Those might not be linked to
			 * by any chain, since the parent tuple might be re-updated
			 * before any pruning occurs.  So we have to be able to reap
			 * them separately from chain-pruning.  (Note that
			 * HeapTupleHeaderIsHotUpdated will never return true for an
			 * XMIN_INVALID tuple, so this code will work even when there
			 * were sequential updates within the aborted transaction.)
			 *
			 * Note that we might first arrive at a dead heap-only tuple
			 * either here or while following a chain below.  Whichever path
			 * gets there first will mark the tuple unused.
			 */
			if (heap_prune_satisfies_vacuum(prstate, &tup, buffer)
				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
			{
				heap_prune_record_unused(prstate, rootoffnum);
				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
													   &prstate->latestRemovedXid);
				ndeleted++;
			}

			/* Nothing more to do */
			return ndeleted;
		}
	}

	/* Start from the root tuple */
	offnum = rootoffnum;

	/* while not end of the chain */
	for (;;)
	{
		ItemId		lp;
		bool		tupdead,
					recent_dead;

		/* Some sanity checks */
		if (offnum < FirstOffsetNumber || offnum > maxoff)
			break;

		/* If item is already processed, stop --- it must not be same chain */
		if (prstate->marked[offnum])
			break;

		lp = PageGetItemId(dp, offnum);

		/* Unused item obviously isn't part of the chain */
		if (!ItemIdIsUsed(lp))
			break;

		/*
		 * If we are looking at the redirected root line pointer, jump to the
		 * first normal tuple in the chain.  If we find a redirect somewhere
		 * else, stop --- it must not be same chain.
		 */
		if (ItemIdIsRedirected(lp))
		{
			if (nchain > 0)
				break;			/* not at start of chain */
			chainitems[nchain++] = offnum;
			offnum = ItemIdGetRedirect(rootlp);
			continue;
		}

		/*
		 * Likewise, a dead line pointer can't be part of the chain.  (We
		 * already eliminated the case of dead root tuple outside this
		 * function.)
		 */
		if (ItemIdIsDead(lp))
			break;

		Assert(ItemIdIsNormal(lp));
		htup = (HeapTupleHeader) PageGetItem(dp, lp);

		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(lp);
		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);

		/*
		 * Check the tuple XMIN against prior XMAX, if any
		 */
		if (TransactionIdIsValid(priorXmax) &&
			!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
			break;

		/*
		 * OK, this tuple is indeed a member of the chain.
		 */
		chainitems[nchain++] = offnum;

		/*
		 * Check tuple's visibility status.
		 */
		tupdead = recent_dead = false;

		switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer))
		{
			case HEAPTUPLE_DEAD:
				tupdead = true;
				break;

			case HEAPTUPLE_RECENTLY_DEAD:
				recent_dead = true;

				/*
				 * This tuple may soon become DEAD.  Update the hint field so
				 * that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetUpdateXid(htup));
				break;

			case HEAPTUPLE_DELETE_IN_PROGRESS:

				/*
				 * This tuple may soon become DEAD.  Update the hint field so
				 * that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetUpdateXid(htup));
				break;

			case HEAPTUPLE_LIVE:
			case HEAPTUPLE_INSERT_IN_PROGRESS:

				/*
				 * If we wanted to optimize for aborts, we might consider
				 * marking the page prunable when we see INSERT_IN_PROGRESS.
				 * But we don't.  See related decisions about when to mark
				 * the page prunable in heapam.c.
				 */
				break;

			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}

		/*
		 * Remember the last DEAD tuple seen.  We will advance past
		 * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
		 * but we can't advance past anything else.  (XXX is it really worth
		 * continuing to scan beyond RECENTLY_DEAD?  The case where we will
		 * find another DEAD tuple is a fairly unusual corner case.)
		 */
		if (tupdead)
		{
			latestdead = offnum;
			HeapTupleHeaderAdvanceLatestRemovedXid(htup,
												   &prstate->latestRemovedXid);
		}
		else if (!recent_dead)
			break;

		/*
		 * If the tuple is not HOT-updated, then we are at the end of this
		 * HOT-update chain.
		 */
		if (!HeapTupleHeaderIsHotUpdated(htup))
			break;

		/* HOT implies it can't have moved to different partition */
		Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

		/*
		 * Advance to next chain member.
		 */
		Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
			   BufferGetBlockNumber(buffer));
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
		priorXmax = HeapTupleHeaderGetUpdateXid(htup);
	}

	/*
	 * If we found a DEAD tuple in the chain, adjust the HOT chain so that
	 * all the DEAD tuples at the start of the chain are removed and the
	 * root line pointer is appropriately redirected.
	 */
	if (OffsetNumberIsValid(latestdead))
	{
		/*
		 * Mark as unused each intermediate item that we are able to remove
		 * from the chain.
		 *
		 * When the previous item is the last dead tuple seen, we are at the
		 * right candidate for redirection.
		 */
		for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
		{
			heap_prune_record_unused(prstate, chainitems[i]);
			ndeleted++;
		}

		/*
		 * If the root entry had been a normal tuple, we are deleting it, so
		 * count it in the result.  But changing a redirect (even to DEAD
		 * state) doesn't count.
		 */
		if (ItemIdIsNormal(rootlp))
			ndeleted++;

		/*
		 * If the DEAD tuple is at the end of the chain, the entire chain is
		 * dead and the root line pointer can be marked dead.  Otherwise
		 * just redirect the root to the correct chain member.
		 */
		if (i >= nchain)
			heap_prune_record_dead(prstate, rootoffnum);
		else
			heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
	}
	else if (nchain < 2 && ItemIdIsRedirected(rootlp))
	{
		/*
		 * We found a redirect item that doesn't point to a valid follow-on
		 * item.  This can happen if the loop in heap_page_prune caused us
		 * to visit the dead successor of a redirect item before visiting
		 * the redirect item.  We can clean up by setting the redirect item
		 * to DEAD state.
		 */
		heap_prune_record_dead(prstate, rootoffnum);
	}

	return ndeleted;
}
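
/*
 * Editor's worked example for heap_prune_chain(): consider a HOT chain
 * rooted at line pointer 1 with members 1 -> 2 -> 3, where tuples 1 and 2
 * are DEAD and tuple 3 is live.  The function records
 *
 *		redirect 1 -> 3		(the root must survive, since index entries
 *							 point at it)
 *		unused 2			(intermediate DEAD member, reclaimable)
 *
 * so that after heap_page_prune_execute() the page has lp 1 = LP_REDIRECT
 * to 3, lp 2 = LP_UNUSED, and lp 3 still holding the live heap-only tuple.
 * Had tuple 3 been DEAD as well, the whole chain would be dead and the
 * root would instead be marked LP_DEAD.
 */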

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
	/*
	 * This should exactly match the PageSetPrunable macro.  We can't store
	 * directly into the page header yet, so we update working state.
	 */
	Assert(TransactionIdIsNormal(xid));
	if (!TransactionIdIsValid(prstate->new_prune_xid) ||
		TransactionIdPrecedes(xid, prstate->new_prune_xid))
		prstate->new_prune_xid = xid;
}

/* Record line pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
						   OffsetNumber offnum, OffsetNumber rdoffnum)
{
	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
	prstate->redirected[prstate->nredirected * 2] = offnum;
	prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
	prstate->nredirected++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
	Assert(!prstate->marked[rdoffnum]);
	prstate->marked[rdoffnum] = true;
}

/* Record line pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->ndead < MaxHeapTuplesPerPage);
	prstate->nowdead[prstate->ndead] = offnum;
	prstate->ndead++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}

/* Record line pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->nunused < MaxHeapTuplesPerPage);
	prstate->nowunused[prstate->nunused] = offnum;
	prstate->nunused++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}


/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has a super-exclusive lock on the
 * buffer.
 */
void
heap_page_prune_execute(Buffer buffer,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = (Page) BufferGetPage(buffer);
	OffsetNumber *offnum;
	int			i;

	/* Shouldn't be called unless there's something to do */
	Assert(nredirected > 0 || ndead > 0 || nunused > 0);

	/* Update all redirected line pointers */
	offnum = redirected;
	for (i = 0; i < nredirected; i++)
	{
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

		ItemIdSetUnused(lp);
	}

	/*
	 * Finally, repair any fragmentation, and update the page's hint bit
	 * about whether it has free pointers.
	 */
	PageRepairFragmentation(page);
}
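
/*
 * Editor's sketch of the redo side (hedged reconstruction; compare
 * heap_xlog_prune() in heapam.c): replay decodes the three offset arrays
 * out of block 0's registered data, in the same order they were registered
 * above, and applies them with the same routine used at do-time.
 */
#ifdef NOT_USED
static void
apply_prune_block_data(XLogReaderState *record, Buffer buffer,
					   int nredirected, int ndead)
{
	Size		datalen;
	OffsetNumber *redirected;
	OffsetNumber *nowdead;
	OffsetNumber *nowunused;
	OffsetNumber *end;
	int			nunused;

	redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
	nowdead = redirected + (nredirected * 2);
	nowunused = nowdead + ndead;
	end = (OffsetNumber *) ((char *) redirected + datalen);
	nunused = (int) (end - nowunused);

	heap_page_prune_execute(buffer,
							redirected, nredirected,
							nowdead, ndead,
							nowunused, nunused);
}
#endif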


/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * Unused entries are filled with InvalidOffsetNumber (zero).
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer.  Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
	OffsetNumber offnum,
				maxoff;

	MemSet(root_offsets, InvalidOffsetNumber,
		   MaxHeapTuplesPerPage * sizeof(OffsetNumber));

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		OffsetNumber nextoffnum;
		TransactionId priorXmax;

		/* skip unused and dead items */
		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
			continue;

		if (ItemIdIsNormal(lp))
		{
			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/*
			 * Check if this tuple is part of a HOT-chain rooted at some
			 * other tuple.  If so, skip it for now; we'll process it when
			 * we find its root.
			 */
			if (HeapTupleHeaderIsHeapOnly(htup))
				continue;

			/*
			 * This is either a plain tuple or the root of a HOT-chain.
			 * Remember it in the mapping.
			 */
			root_offsets[offnum - 1] = offnum;

			/* If it's not the start of a HOT-chain, we're done with it */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				continue;

			/* Set up to scan the HOT-chain */
			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
		}
		else
		{
			/* Must be a redirect item.  We do not set its root_offsets entry */
			Assert(ItemIdIsRedirected(lp));
			/* Set up to scan the HOT-chain */
			nextoffnum = ItemIdGetRedirect(lp);
			priorXmax = InvalidTransactionId;
		}

		/*
		 * Now follow the HOT-chain and collect other tuples in the chain.
		 *
		 * Note: Even though this is a nested loop, the complexity of the
		 * function is O(N) because a tuple in the page should be visited
		 * not more than twice, once in the outer loop and once in HOT-chain
		 * chases.
		 */
		for (;;)
		{
			/* Sanity check */
			if (nextoffnum < FirstOffsetNumber || nextoffnum > maxoff)
				break;

			lp = PageGetItemId(page, nextoffnum);

			/* Check for broken chains */
			if (!ItemIdIsNormal(lp))
				break;

			htup = (HeapTupleHeader) PageGetItem(page, lp);

			if (TransactionIdIsValid(priorXmax) &&
				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
				break;

			/* Remember the root line pointer for this item */
			root_offsets[nextoffnum - 1] = offnum;

			/* Advance to next chain member, if any */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				break;

			/* HOT implies it can't have moved to different partition */
			Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
		}
	}
}
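
/*
 * Editor's usage sketch (hypothetical helper, not part of this file):
 * index builds need the root line pointer of each heap-only tuple so that
 * index entries can point at the chain root; compare
 * heapam_index_build_range_scan() in heapam_handler.c.
 */
#ifdef NOT_USED
static OffsetNumber
hot_chain_root(Page page, OffsetNumber offnum)
{
	OffsetNumber root_offsets[MaxHeapTuplesPerPage];

	/* caller must hold at least share lock on the containing buffer */
	heap_get_root_tuples(page, root_offsets);
	return root_offsets[offnum - 1];
}
#endif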