PostgreSQL Source Code (git master)
pruneheap.c
/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *    heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Working data for heap_page_prune and subroutines */
typedef struct
{
    Relation    rel;

    /* tuple visibility test, initialized for the relation */
    GlobalVisState *vistest;

    TransactionId new_prune_xid;    /* new prune hint value for page */
    TransactionId snapshotConflictHorizon;  /* latest xid removed */
    int         nredirected;    /* numbers of entries in arrays below */
    int         ndead;
    int         nunused;
    /* arrays that accumulate indexes of items to be changed */
    OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
    OffsetNumber nowdead[MaxHeapTuplesPerPage];
    OffsetNumber nowunused[MaxHeapTuplesPerPage];

    /*
     * marked[i] is true if item i is entered in one of the above arrays.
     *
     * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
     * 1. Otherwise every access would need to subtract 1.
     */
    bool        marked[MaxHeapTuplesPerPage + 1];

    /*
     * Tuple visibility is only computed once for each tuple, for correctness
     * and efficiency reasons; see comment in heap_page_prune() for details.
     * This is of type int8[], instead of HTSV_Result[], so we can use -1 to
     * indicate no visibility has been computed, e.g. for LP_DEAD items.
     *
     * Same indexing as ->marked.
     */
    int8        htsv[MaxHeapTuplesPerPage + 1];
} PruneState;
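
/*
 * Illustrative note: redirected[] above stores pairs, so for the i'th
 * planned redirection redirected[2 * i] is the "from" offset and
 * redirected[2 * i + 1] is the "to" offset, while nowdead[] and nowunused[]
 * are flat lists of offsets; the heap_prune_record_* helpers below maintain
 * this layout.
 */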

/* Local functions */
static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
                                               HeapTuple tup,
                                               Buffer buffer);
static int  heap_prune_chain(Buffer buffer,
                             OffsetNumber rootoffnum,
                             PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
                                       OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
static void page_verify_redirects(Page page);


/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function. It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often. It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    TransactionId prune_xid;
    GlobalVisState *vistest;
    Size        minfree;

    /*
     * We can't write WAL in recovery mode, so there's no point trying to
     * clean the page. The primary will likely issue a cleaning WAL record
     * soon anyway, so this is no particular loss.
     */
    if (RecoveryInProgress())
        return;

    /*
     * First check whether there's any chance there's something to prune;
     * determining the appropriate horizon is a waste if there's no prune_xid
     * (i.e. no updates/deletes left potentially dead tuples around).
     */
    prune_xid = ((PageHeader) page)->pd_prune_xid;
    if (!TransactionIdIsValid(prune_xid))
        return;

    /*
     * Check whether prune_xid indicates that there may be dead rows that can
     * be cleaned up.
     */
    vistest = GlobalVisTestFor(relation);

    if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
        return;

    /*
     * We prune when a previous UPDATE failed to find enough space on the page
     * for a new tuple version, or when free space falls below the relation's
     * fill-factor target (but not less than 10%).
     *
     * Checking free space here is questionable since we aren't holding any
     * lock on the buffer; in the worst case we could get a bogus answer. It's
     * unlikely to be *seriously* wrong, though, since reading either pd_lower
     * or pd_upper is probably atomic. Avoiding taking a lock seems more
     * important than sometimes getting a wrong answer in what is after all
     * just a heuristic estimate.
     */
    minfree = RelationGetTargetPageFreeSpace(relation,
                                             HEAP_DEFAULT_FILLFACTOR);
    minfree = Max(minfree, BLCKSZ / 10);

    if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
    {
        /* OK, try to get exclusive buffer lock */
        if (!ConditionalLockBufferForCleanup(buffer))
            return;

        /*
         * Now that we have buffer lock, get accurate information about the
         * page's free space, and recheck the heuristic about whether to
         * prune.
         */
        if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
        {
            int         ndeleted,
                        nnewlpdead;

            ndeleted = heap_page_prune(relation, buffer, vistest,
                                       &nnewlpdead, NULL);

            /*
             * Report the number of tuples reclaimed to pgstats. This is
             * ndeleted minus the number of newly-LP_DEAD-set items.
             *
             * We derive the number of dead tuples like this to avoid totally
             * forgetting about items that were set to LP_DEAD, since they
             * still need to be cleaned up by VACUUM. We only want our report
             * to count heap-only tuples that just became LP_UNUSED, which
             * don't need such cleanup.
             *
             * VACUUM doesn't have to compensate in the same way when it
             * tracks ndeleted, since it will set the same LP_DEAD items to
             * LP_UNUSED separately.
             */
            if (ndeleted > nnewlpdead)
                pgstat_update_heap_dead_tuples(relation,
                                               ndeleted - nnewlpdead);
        }

        /* And release buffer lock */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

        /*
         * We avoid reuse of any free space created on the page by unrelated
         * UPDATEs/INSERTs by opting to not update the FSM at this point. The
         * free space should be reused by UPDATEs to *this* page.
         */
    }
}
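
/*
 * A minimal sketch of the calling convention heap_page_prune_opt() expects:
 * the caller holds a pin on the buffer but no lock.  The function and guard
 * macro below are hypothetical illustrations (real callers live in
 * heapam.c's page-read paths); the example is compiled out by default.
 */
#ifdef PRUNEHEAP_OPT_EXAMPLE
static void
example_read_page_with_opportunistic_prune(Relation rel, BlockNumber blkno)
{
    /* ReadBuffer returns a pinned, unlocked buffer */
    Buffer      buf = ReadBuffer(rel, blkno);

    /* Cheap no-op unless pruning looks useful and the cleanup lock is free */
    heap_page_prune_opt(rel, buf);

    /* ... a real caller would now lock the buffer and read tuples ... */

    ReleaseBuffer(buf);
}
#endif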


/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page. Note that we
 * don't update the FSM information for page on caller's behalf. Caller might
 * also need to account for a reduction in the length of the line pointer
 * array following array truncation by us.
 *
 * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
 * (see heap_prune_satisfies_vacuum and HeapTupleSatisfiesVacuum).
 *
 * Sets *nnewlpdead for caller, indicating the number of items that were
 * newly set LP_DEAD during the prune operation.
 *
 * off_loc is the offset location required by the caller to use in error
 * callback.
 *
 * Returns the number of tuples deleted from the page during this call.
 */
int
heap_page_prune(Relation relation, Buffer buffer,
                GlobalVisState *vistest,
                int *nnewlpdead,
                OffsetNumber *off_loc)
{
    int         ndeleted = 0;
    Page        page = BufferGetPage(buffer);
    BlockNumber blockno = BufferGetBlockNumber(buffer);
    OffsetNumber offnum,
                maxoff;
    PruneState  prstate;
    HeapTupleData tup;

    /*
     * Our strategy is to scan the page and make lists of items to change,
     * then apply the changes within a critical section. This keeps as much
     * logic as possible out of the critical section, and also ensures that
     * WAL replay will work the same as the normal case.
     *
     * First, initialize the new pd_prune_xid value to zero (indicating no
     * prunable tuples). If we find any tuples which may soon become
     * prunable, we will save the lowest relevant XID in new_prune_xid. Also
     * initialize the rest of our working state.
     */
    prstate.new_prune_xid = InvalidTransactionId;
    prstate.rel = relation;
    prstate.vistest = vistest;
    prstate.snapshotConflictHorizon = InvalidTransactionId;
    prstate.nredirected = prstate.ndead = prstate.nunused = 0;
    memset(prstate.marked, 0, sizeof(prstate.marked));

    maxoff = PageGetMaxOffsetNumber(page);
    tup.t_tableOid = RelationGetRelid(prstate.rel);

    /*
     * Determine HTSV for all tuples.
     *
     * This is required for correctness to deal with cases where running HTSV
     * twice could result in different results (e.g. RECENTLY_DEAD can turn to
     * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid()
     * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the
     * inserting transaction aborts, ...). That in turn could cause
     * heap_prune_chain() to behave incorrectly if a tuple is reached twice,
     * once directly via a heap_prune_chain() and once following a HOT chain.
     *
     * It's also good for performance. Most commonly tuples within a page are
     * stored at decreasing offsets (while the items are stored at increasing
     * offsets). When processing all tuples on a page this leads to reading
     * memory at decreasing offsets within a page, with a variable stride.
     * That's hard for CPU prefetchers to deal with. Processing the items in
     * reverse order (and thus the tuples in increasing order) increases
     * prefetching efficiency significantly / decreases the number of cache
     * misses.
     */
    for (offnum = maxoff;
         offnum >= FirstOffsetNumber;
         offnum = OffsetNumberPrev(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        HeapTupleHeader htup;

        /* Nothing to do if slot doesn't contain a tuple */
        if (!ItemIdIsNormal(itemid))
        {
            prstate.htsv[offnum] = -1;
            continue;
        }

        htup = (HeapTupleHeader) PageGetItem(page, itemid);
        tup.t_data = htup;
        tup.t_len = ItemIdGetLength(itemid);
        ItemPointerSet(&(tup.t_self), blockno, offnum);

        /*
         * Set the offset number so that we can display it along with any
         * error that occurred while processing this tuple.
         */
        if (off_loc)
            *off_loc = offnum;

        prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
                                                           buffer);
    }

    /* Scan the page */
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid;

        /* Ignore items already processed as part of an earlier chain */
        if (prstate.marked[offnum])
            continue;

        /* see preceding loop */
        if (off_loc)
            *off_loc = offnum;

        /* Nothing to do if slot is empty or already dead */
        itemid = PageGetItemId(page, offnum);
        if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
            continue;

        /* Process this item or chain of items */
        ndeleted += heap_prune_chain(buffer, offnum, &prstate);
    }

    /* Clear the offset information once we have processed the given page. */
    if (off_loc)
        *off_loc = InvalidOffsetNumber;

    /* Any error while applying the changes is critical */
    START_CRIT_SECTION();

    /* Have we found any prunable items? */
    if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
    {
        /*
         * Apply the planned item changes, then repair page fragmentation, and
         * update the page's hint bit about whether it has free line pointers.
         */
        heap_page_prune_execute(buffer,
                                prstate.redirected, prstate.nredirected,
                                prstate.nowdead, prstate.ndead,
                                prstate.nowunused, prstate.nunused);

        /*
         * Update the page's pd_prune_xid field to either zero, or the lowest
         * XID of any soon-prunable tuple.
         */
        ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

        /*
         * Also clear the "page is full" flag, since there's no point in
         * repeating the prune/defrag process until something else happens to
         * the page.
         */
        PageClearFull(page);

        MarkBufferDirty(buffer);

        /*
         * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
         */
        if (RelationNeedsWAL(relation))
        {
            xl_heap_prune xlrec;
            XLogRecPtr  recptr;

            xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation);
            xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon;
            xlrec.nredirected = prstate.nredirected;
            xlrec.ndead = prstate.ndead;

            XLogBeginInsert();
            XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);

            XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

            /*
             * The OffsetNumber arrays are not actually in the buffer, but we
             * pretend that they are. When XLogInsert stores the whole
             * buffer, the offset arrays need not be stored too.
             */
            if (prstate.nredirected > 0)
                XLogRegisterBufData(0, (char *) prstate.redirected,
                                    prstate.nredirected *
                                    sizeof(OffsetNumber) * 2);

            if (prstate.ndead > 0)
                XLogRegisterBufData(0, (char *) prstate.nowdead,
                                    prstate.ndead * sizeof(OffsetNumber));

            if (prstate.nunused > 0)
                XLogRegisterBufData(0, (char *) prstate.nowunused,
                                    prstate.nunused * sizeof(OffsetNumber));

            recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);

            PageSetLSN(BufferGetPage(buffer), recptr);
        }
    }
    else
    {
        /*
         * If we didn't prune anything, but have found a new value for the
         * pd_prune_xid field, update it and mark the buffer dirty. This is
         * treated as a non-WAL-logged hint.
         *
         * Also clear the "page is full" flag if it is set, since there's no
         * point in repeating the prune/defrag process until something else
         * happens to the page.
         */
        if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
            PageIsFull(page))
        {
            ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
            PageClearFull(page);
            MarkBufferDirtyHint(buffer, true);
        }
    }

    END_CRIT_SECTION();

    /* Record number of newly-set-LP_DEAD items for caller */
    *nnewlpdead = prstate.ndead;

    return ndeleted;
}
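
/*
 * A minimal sketch of a heap_page_prune() call as made by a caller that
 * already holds a buffer cleanup lock, in the style of VACUUM's
 * lazy_scan_prune().  The wrapper and guard macro are hypothetical
 * illustrations and are compiled out by default.
 */
#ifdef PRUNEHEAP_EXAMPLE
static int
example_prune_with_cleanup_lock(Relation rel, Buffer buf)
{
    GlobalVisState *vistest = GlobalVisTestFor(rel);
    int         nnewlpdead;
    OffsetNumber off_loc;       /* a real caller wires this into an error
                                 * context callback */

    /* Caller must already hold a cleanup lock on buf */
    return heap_page_prune(rel, buf, vistest, &nnewlpdead, &off_loc);
}
#endif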


/*
 * Perform visibility checks for heap pruning.
 */
static HTSV_Result
heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
{
    HTSV_Result res;
    TransactionId dead_after;

    res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);

    if (res != HEAPTUPLE_RECENTLY_DEAD)
        return res;

    if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
        res = HEAPTUPLE_DEAD;

    return res;
}
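
/*
 * Worked micro-example with made-up XIDs: if HeapTupleSatisfiesVacuumHorizon()
 * reports RECENTLY_DEAD with dead_after = 1000, the tuple's fate hinges on
 * the GlobalVisState horizon.  If XID 1000 now precedes the oldest XID any
 * remaining snapshot could still care about (say the horizon has advanced to
 * 1005), GlobalVisTestIsRemovableXid() returns true and the result is
 * promoted to DEAD; otherwise the tuple stays RECENTLY_DEAD and is left for
 * a later pruning pass.
 */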


/*
 * Prune specified line pointer or a HOT chain originating at line pointer.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
 * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD, our visibility test is just too coarse to detect it.
 *
 * In general, pruning must never leave behind a DEAD tuple that still has
 * tuple storage. VACUUM isn't prepared to deal with that case. That's why
 * VACUUM prunes the same heap page a second time (without dropping its lock
 * in the interim) when it sees a newly DEAD tuple that we initially saw as
 * in-progress. Retrying pruning like this can only happen when an inserting
 * transaction concurrently aborts.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * We don't actually change the page here. We just add entries to the arrays
 * in prstate showing the changes to be made. Items to be redirected are added
 * to the redirected[] array (two entries per redirection); items to be set to
 * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
 * state are added to nowunused[].
 *
 * Returns the number of tuples (to be) deleted from the page.
 */
static int
heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
{
    int         ndeleted = 0;
    Page        dp = (Page) BufferGetPage(buffer);
    TransactionId priorXmax = InvalidTransactionId;
    ItemId      rootlp;
    HeapTupleHeader htup;
    OffsetNumber latestdead = InvalidOffsetNumber,
                maxoff = PageGetMaxOffsetNumber(dp),
                offnum;
    OffsetNumber chainitems[MaxHeapTuplesPerPage];
    int         nchain = 0,
                i;

    rootlp = PageGetItemId(dp, rootoffnum);

    /*
     * If it's a heap-only tuple, then it is not the start of a HOT chain.
     */
    if (ItemIdIsNormal(rootlp))
    {
        Assert(prstate->htsv[rootoffnum] != -1);
        htup = (HeapTupleHeader) PageGetItem(dp, rootlp);

        if (HeapTupleHeaderIsHeapOnly(htup))
        {
            /*
             * If the tuple is DEAD and doesn't chain to anything else, mark
             * it unused immediately. (If it does chain, we can only remove
             * it as part of pruning its chain.)
             *
             * We need this primarily to handle aborted HOT updates, that is,
             * XMIN_INVALID heap-only tuples. Those might not be linked to by
             * any chain, since the parent tuple might be re-updated before
             * any pruning occurs. So we have to be able to reap them
             * separately from chain-pruning. (Note that
             * HeapTupleHeaderIsHotUpdated will never return true for an
             * XMIN_INVALID tuple, so this code will work even when there were
             * sequential updates within the aborted transaction.)
             *
             * Note that we might first arrive at a dead heap-only tuple
             * either here or while following a chain below. Whichever path
             * gets there first will mark the tuple unused.
             */
            if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
                !HeapTupleHeaderIsHotUpdated(htup))
            {
                heap_prune_record_unused(prstate, rootoffnum);
                HeapTupleHeaderAdvanceConflictHorizon(htup,
                                                      &prstate->snapshotConflictHorizon);
                ndeleted++;
            }

            /* Nothing more to do */
            return ndeleted;
        }
    }

    /* Start from the root tuple */
    offnum = rootoffnum;

    /* while not end of the chain */
    for (;;)
    {
        ItemId      lp;
        bool        tupdead,
                    recent_dead;

        /* Sanity check (pure paranoia) */
        if (offnum < FirstOffsetNumber)
            break;

        /*
         * An offset past the end of page's line pointer array is possible
         * when the array was truncated (original item must have been unused)
         */
        if (offnum > maxoff)
            break;

        /* If item is already processed, stop --- it must not be same chain */
        if (prstate->marked[offnum])
            break;

        lp = PageGetItemId(dp, offnum);

        /* Unused item obviously isn't part of the chain */
        if (!ItemIdIsUsed(lp))
            break;

        /*
         * If we are looking at the redirected root line pointer, jump to the
         * first normal tuple in the chain. If we find a redirect somewhere
         * else, stop --- it must not be same chain.
         */
        if (ItemIdIsRedirected(lp))
        {
            if (nchain > 0)
                break;          /* not at start of chain */
            chainitems[nchain++] = offnum;
            offnum = ItemIdGetRedirect(rootlp);
            continue;
        }

        /*
         * Likewise, a dead line pointer can't be part of the chain. (We
         * already eliminated the case of dead root tuple outside this
         * function.)
         */
        if (ItemIdIsDead(lp))
            break;

        Assert(ItemIdIsNormal(lp));
        Assert(prstate->htsv[offnum] != -1);
        htup = (HeapTupleHeader) PageGetItem(dp, lp);

        /*
         * Check the tuple XMIN against prior XMAX, if any
         */
        if (TransactionIdIsValid(priorXmax) &&
            !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
            break;

        /*
         * OK, this tuple is indeed a member of the chain.
         */
        chainitems[nchain++] = offnum;

        /*
         * Check tuple's visibility status.
         */
        tupdead = recent_dead = false;

        switch ((HTSV_Result) prstate->htsv[offnum])
        {
            case HEAPTUPLE_DEAD:
                tupdead = true;
                break;

            case HEAPTUPLE_RECENTLY_DEAD:
                recent_dead = true;

                /*
                 * This tuple may soon become DEAD. Update the hint field so
                 * that the page is reconsidered for pruning in future.
                 */
                heap_prune_record_prunable(prstate,
                                           HeapTupleHeaderGetUpdateXid(htup));
                break;

            case HEAPTUPLE_DELETE_IN_PROGRESS:

                /*
                 * This tuple may soon become DEAD. Update the hint field so
                 * that the page is reconsidered for pruning in future.
                 */
                heap_prune_record_prunable(prstate,
                                           HeapTupleHeaderGetUpdateXid(htup));
                break;

            case HEAPTUPLE_LIVE:
            case HEAPTUPLE_INSERT_IN_PROGRESS:

                /*
                 * If we wanted to optimize for aborts, we might consider
                 * marking the page prunable when we see INSERT_IN_PROGRESS.
                 * But we don't. See related decisions about when to mark the
                 * page prunable in heapam.c.
                 */
                break;

            default:
                elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                break;
        }

        /*
         * Remember the last DEAD tuple seen. We will advance past
         * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
         * but we can't advance past anything else. We have to make sure that
         * we don't miss any DEAD tuples, since DEAD tuples that still have
         * tuple storage after pruning will confuse VACUUM.
         */
        if (tupdead)
        {
            latestdead = offnum;
            HeapTupleHeaderAdvanceConflictHorizon(htup,
                                                  &prstate->snapshotConflictHorizon);
        }
        else if (!recent_dead)
            break;

        /*
         * If the tuple is not HOT-updated, then we are at the end of this
         * HOT-update chain.
         */
        if (!HeapTupleHeaderIsHotUpdated(htup))
            break;

        /* HOT implies it can't have moved to different partition */
        Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

        /*
         * Advance to next chain member.
         */
        Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
               BufferGetBlockNumber(buffer));
        offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
        priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    }

    /*
     * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
     * the DEAD tuples at the start of the chain are removed and the root line
     * pointer is appropriately redirected.
     */
    if (OffsetNumberIsValid(latestdead))
    {
        /*
         * Mark as unused each intermediate item that we are able to remove
         * from the chain.
         *
         * When the previous item is the last dead tuple seen, we are at the
         * right candidate for redirection.
         */
        for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
        {
            heap_prune_record_unused(prstate, chainitems[i]);
            ndeleted++;
        }

        /*
         * If the root entry had been a normal tuple, we are deleting it, so
         * count it in the result. But changing a redirect (even to DEAD
         * state) doesn't count.
         */
        if (ItemIdIsNormal(rootlp))
            ndeleted++;

        /*
         * If the DEAD tuple is at the end of the chain, the entire chain is
         * dead and the root line pointer can be marked dead. Otherwise just
         * redirect the root to the correct chain member.
         */
        if (i >= nchain)
            heap_prune_record_dead(prstate, rootoffnum);
        else
            heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
    }
    else if (nchain < 2 && ItemIdIsRedirected(rootlp))
    {
        /*
         * We found a redirect item that doesn't point to a valid follow-on
         * item. This can happen if the loop in heap_page_prune caused us to
         * visit the dead successor of a redirect item before visiting the
         * redirect item. We can clean up by setting the redirect item to
         * DEAD state.
         */
        heap_prune_record_dead(prstate, rootoffnum);
    }

    return ndeleted;
}
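
/*
 * Worked example of the chain fixup above, with illustrative offsets: for a
 * HOT chain 1 -> 2 -> 3 in which the tuples at offsets 1 and 2 are DEAD and
 * the tuple at offset 3 is live, the walk records chainitems[] = {1, 2, 3}
 * with latestdead = 2.  The fixup marks item 2 unused and turns the root at
 * offset 1 into an LP_REDIRECT pointing at offset 3, returning ndeleted = 2
 * (the root's tuple storage plus item 2).  Were the tuple at offset 3 DEAD
 * as well, the root would instead be marked LP_DEAD, preserving a target for
 * index TIDs until VACUUM removes them.
 */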

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
    /*
     * This should exactly match the PageSetPrunable macro. We can't store
     * directly into the page header yet, so we update working state.
     */
    Assert(TransactionIdIsNormal(xid));
    if (!TransactionIdIsValid(prstate->new_prune_xid) ||
        TransactionIdPrecedes(xid, prstate->new_prune_xid))
        prstate->new_prune_xid = xid;
}

/* Record line pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
                           OffsetNumber offnum, OffsetNumber rdoffnum)
{
    Assert(prstate->nredirected < MaxHeapTuplesPerPage);
    prstate->redirected[prstate->nredirected * 2] = offnum;
    prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
    prstate->nredirected++;
    Assert(!prstate->marked[offnum]);
    prstate->marked[offnum] = true;
    Assert(!prstate->marked[rdoffnum]);
    prstate->marked[rdoffnum] = true;
}

/* Record line pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
    Assert(prstate->ndead < MaxHeapTuplesPerPage);
    prstate->nowdead[prstate->ndead] = offnum;
    prstate->ndead++;
    Assert(!prstate->marked[offnum]);
    prstate->marked[offnum] = true;
}

/* Record line pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
    Assert(prstate->nunused < MaxHeapTuplesPerPage);
    prstate->nowunused[prstate->nunused] = offnum;
    prstate->nunused++;
    Assert(!prstate->marked[offnum]);
    prstate->marked[offnum] = true;
}


/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has a full cleanup lock on the
 * buffer.
 */
void
heap_page_prune_execute(Buffer buffer,
                        OffsetNumber *redirected, int nredirected,
                        OffsetNumber *nowdead, int ndead,
                        OffsetNumber *nowunused, int nunused)
{
    Page        page = (Page) BufferGetPage(buffer);
    OffsetNumber *offnum;
    HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;

    /* Shouldn't be called unless there's something to do */
    Assert(nredirected > 0 || ndead > 0 || nunused > 0);

    /* Update all redirected line pointers */
    offnum = redirected;
    for (int i = 0; i < nredirected; i++)
    {
        OffsetNumber fromoff = *offnum++;
        OffsetNumber tooff = *offnum++;
        ItemId      fromlp = PageGetItemId(page, fromoff);
        ItemId      tolp PG_USED_FOR_ASSERTS_ONLY;

#ifdef USE_ASSERT_CHECKING

        /*
         * Any existing item that we set as an LP_REDIRECT (any 'from' item)
         * must be the first item from a HOT chain. If the item has tuple
         * storage then it can't be a heap-only tuple. Otherwise we are just
         * maintaining an existing LP_REDIRECT from an existing HOT chain that
         * has been pruned at least once before now.
         */
        if (!ItemIdIsRedirected(fromlp))
        {
            Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));

            htup = (HeapTupleHeader) PageGetItem(page, fromlp);
            Assert(!HeapTupleHeaderIsHeapOnly(htup));
        }
        else
        {
            /* We shouldn't need to redundantly set the redirect */
            Assert(ItemIdGetRedirect(fromlp) != tooff);
        }

        /*
         * The item that we're about to set as an LP_REDIRECT (the 'from'
         * item) will point to an existing item (the 'to' item) that is
         * already a heap-only tuple. There can be at most one LP_REDIRECT
         * item per HOT chain.
         *
         * We need to keep around an LP_REDIRECT item (after the original
         * non-heap-only root tuple gets pruned away) so that it's always
         * possible for VACUUM to easily figure out what TID to delete from
         * indexes when an entire HOT chain becomes dead. A heap-only tuple
         * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
         * tuple can.
         *
         * This check may miss problems, e.g. the target of a redirect could
         * be marked as unused subsequently. The page_verify_redirects() check
         * below will catch such problems.
         */
        tolp = PageGetItemId(page, tooff);
        Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
        htup = (HeapTupleHeader) PageGetItem(page, tolp);
        Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

        ItemIdSetRedirect(fromlp, tooff);
    }

    /* Update all now-dead line pointers */
    offnum = nowdead;
    for (int i = 0; i < ndead; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

        /*
         * An LP_DEAD line pointer must be left behind when the original item
         * (which is dead to everybody) could still be referenced by a TID in
         * an index. This should never be necessary with any individual
         * heap-only tuple item, though. (It's not clear how much of a problem
         * that would be, but there is no reason to allow it.)
         */
        if (ItemIdHasStorage(lp))
        {
            Assert(ItemIdIsNormal(lp));
            htup = (HeapTupleHeader) PageGetItem(page, lp);
            Assert(!HeapTupleHeaderIsHeapOnly(htup));
        }
        else
        {
            /* Whole HOT chain becomes dead */
            Assert(ItemIdIsRedirected(lp));
        }
#endif

        ItemIdSetDead(lp);
    }

    /* Update all now-unused line pointers */
    offnum = nowunused;
    for (int i = 0; i < nunused; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

        /*
         * Only heap-only tuples can become LP_UNUSED during pruning. They
         * don't need to be left in place as LP_DEAD items until VACUUM gets
         * around to doing index vacuuming.
         */
        Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
        htup = (HeapTupleHeader) PageGetItem(page, lp);
        Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

        ItemIdSetUnused(lp);
    }

    /*
     * Finally, repair any fragmentation, and update the page's hint bit about
     * whether it has free pointers.
     */
    PageRepairFragmentation(page);

    /*
     * Now that the page has been modified, assert that redirect items still
     * point to valid targets.
     */
    page_verify_redirects(page);
}
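
/*
 * Note that because heap_page_prune() funnels all page changes through this
 * function and WAL-logs only the offset arrays, replay of an XLOG_HEAP2_PRUNE
 * record can apply the same arrays through this same code path; that is what
 * the comment in heap_page_prune() means by WAL replay working "the same as
 * the normal case".
 */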


/*
 * If built with assertions, verify that all LP_REDIRECT items point to a
 * valid item.
 *
 * One way that bugs related to HOT pruning show is redirect items pointing
 * to removed tuples. It's not trivial to reliably check that marking an item
 * unused will not orphan a redirect item during heap_prune_chain() /
 * heap_page_prune_execute(), so we additionally check the whole page after
 * pruning. Without this check such bugs would typically only cause asserts
 * later, potentially well after the corruption has been introduced.
 *
 * Also check comments in heap_page_prune_execute()'s redirection loop.
 */
static void
page_verify_redirects(Page page)
{
#ifdef USE_ASSERT_CHECKING
    OffsetNumber offnum;
    OffsetNumber maxoff;

    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        OffsetNumber targoff;
        ItemId      targitem;
        HeapTupleHeader htup;

        if (!ItemIdIsRedirected(itemid))
            continue;

        targoff = ItemIdGetRedirect(itemid);
        targitem = PageGetItemId(page, targoff);

        Assert(ItemIdIsUsed(targitem));
        Assert(ItemIdIsNormal(targitem));
        Assert(ItemIdHasStorage(targitem));
        htup = (HeapTupleHeader) PageGetItem(page, targitem);
        Assert(HeapTupleHeaderIsHeapOnly(htup));
    }
#endif
}


/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * Unused entries are filled with InvalidOffsetNumber (zero).
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
    OffsetNumber offnum,
                maxoff;

    MemSet(root_offsets, InvalidOffsetNumber,
           MaxHeapTuplesPerPage * sizeof(OffsetNumber));

    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        ItemId      lp = PageGetItemId(page, offnum);
        HeapTupleHeader htup;
        OffsetNumber nextoffnum;
        TransactionId priorXmax;

        /* skip unused and dead items */
        if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
            continue;

        if (ItemIdIsNormal(lp))
        {
            htup = (HeapTupleHeader) PageGetItem(page, lp);

            /*
             * Check if this tuple is part of a HOT-chain rooted at some other
             * tuple. If so, skip it for now; we'll process it when we find
             * its root.
             */
            if (HeapTupleHeaderIsHeapOnly(htup))
                continue;

            /*
             * This is either a plain tuple or the root of a HOT-chain.
             * Remember it in the mapping.
             */
            root_offsets[offnum - 1] = offnum;

            /* If it's not the start of a HOT-chain, we're done with it */
            if (!HeapTupleHeaderIsHotUpdated(htup))
                continue;

            /* Set up to scan the HOT-chain */
            nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
            priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
        else
        {
            /* Must be a redirect item. We do not set its root_offsets entry */
            Assert(ItemIdIsRedirected(lp));
            /* Set up to scan the HOT-chain */
            nextoffnum = ItemIdGetRedirect(lp);
            priorXmax = InvalidTransactionId;
        }

        /*
         * Now follow the HOT-chain and collect other tuples in the chain.
         *
         * Note: Even though this is a nested loop, the complexity of the
         * function is O(N) because a tuple in the page should be visited not
         * more than twice, once in the outer loop and once in HOT-chain
         * chases.
         */
        for (;;)
        {
            /* Sanity check (pure paranoia) */
            if (offnum < FirstOffsetNumber)
                break;

            /*
             * An offset past the end of page's line pointer array is possible
             * when the array was truncated
             */
            if (offnum > maxoff)
                break;

            lp = PageGetItemId(page, nextoffnum);

            /* Check for broken chains */
            if (!ItemIdIsNormal(lp))
                break;

            htup = (HeapTupleHeader) PageGetItem(page, lp);

            if (TransactionIdIsValid(priorXmax) &&
                !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
                break;

            /* Remember the root line pointer for this item */
            root_offsets[nextoffnum - 1] = offnum;

            /* Advance to next chain member, if any */
            if (!HeapTupleHeaderIsHotUpdated(htup))
                break;

            /* HOT implies it can't have moved to different partition */
            Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

            nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
            priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
    }
}
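
/*
 * Usage note: the mapping built above is what lets callers that must index
 * heap-only tuples (for example, heapam_index_build_range_scan() during
 * CREATE INDEX) enter each tuple under its root line pointer's TID, since
 * only the root offset remains stable across pruning.
 */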