PostgreSQL Source Code  git master
heapam.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * heap_beginscan - begin relation scan
16  * heap_rescan - restart a relation scan
17  * heap_endscan - end relation scan
18  * heap_getnext - retrieve next tuple in scan
19  * heap_fetch - retrieve tuple with given tid
20  * heap_insert - insert tuple into a relation
21  * heap_multi_insert - insert multiple tuples into a relation
22  * heap_delete - delete a tuple from a relation
23  * heap_update - replace a tuple in a relation with another tuple
24  *
25  * NOTES
26  * This file contains the heap_ routines which implement
27  * the POSTGRES heap access method used for all POSTGRES
28  * relations.
29  *
30  *-------------------------------------------------------------------------
31  */
32 #include "postgres.h"
33 
34 #include "access/bufmask.h"
35 #include "access/heapam.h"
36 #include "access/heapam_xlog.h"
37 #include "access/heaptoast.h"
38 #include "access/hio.h"
39 #include "access/multixact.h"
40 #include "access/parallel.h"
41 #include "access/relscan.h"
42 #include "access/subtrans.h"
43 #include "access/syncscan.h"
44 #include "access/sysattr.h"
45 #include "access/tableam.h"
46 #include "access/transam.h"
47 #include "access/valid.h"
48 #include "access/visibilitymap.h"
49 #include "access/xact.h"
50 #include "access/xlog.h"
51 #include "access/xloginsert.h"
52 #include "access/xlogutils.h"
53 #include "catalog/catalog.h"
54 #include "commands/vacuum.h"
55 #include "miscadmin.h"
56 #include "pgstat.h"
57 #include "port/atomics.h"
58 #include "port/pg_bitutils.h"
59 #include "storage/bufmgr.h"
60 #include "storage/freespace.h"
61 #include "storage/lmgr.h"
62 #include "storage/predicate.h"
63 #include "storage/procarray.h"
64 #include "storage/standby.h"
65 #include "utils/datum.h"
66 #include "utils/inval.h"
67 #include "utils/relcache.h"
68 #include "utils/snapmgr.h"
69 #include "utils/spccache.h"
70 
71 
72 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
73  TransactionId xid, CommandId cid, int options);
74 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
75  Buffer newbuf, HeapTuple oldtup,
76  HeapTuple newtup, HeapTuple old_key_tuple,
77  bool all_visible_cleared, bool new_all_visible_cleared);
78 static Bitmapset *HeapDetermineColumnsInfo(Relation relation,
79  Bitmapset *interesting_cols,
80  Bitmapset *external_cols,
81  HeapTuple oldtup, HeapTuple newtup,
82  bool *has_external);
83 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
84  LockTupleMode mode, LockWaitPolicy wait_policy,
85  bool *have_tuple_lock);
86 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
87  uint16 old_infomask2, TransactionId add_to_xmax,
88  LockTupleMode mode, bool is_update,
89  TransactionId *result_xmax, uint16 *result_infomask,
90  uint16 *result_infomask2);
91 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
92  ItemPointer ctid, TransactionId xid,
93  LockTupleMode mode);
94 static int heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
95  xl_heap_freeze_plan *plans_out,
96  OffsetNumber *offsets_out);
97 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
98  uint16 *new_infomask2);
99 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
100  uint16 t_infomask);
101 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
102  LockTupleMode lockmode, bool *current_is_member);
103 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
104  Relation rel, ItemPointer ctid, XLTW_Oper oper,
105  int *remaining);
106 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
107  uint16 infomask, Relation rel, int *remaining);
108 static void index_delete_sort(TM_IndexDeleteOp *delstate);
109 static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
110 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
111 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
112  bool *copy);
113 
114 
115 /*
116  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
117  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
118  * update them). This table (and the macros below) helps us determine the
119  * heavyweight lock mode and MultiXactStatus values to use for any particular
120  * tuple lock strength.
121  *
122  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
123  * instead.
124  */
125 static const struct
126 {
127  LOCKMODE hwlock;
128  int lockstatus;
129  int updstatus;
130 }
131 
132  tupleLockExtraInfo[MaxLockTupleMode + 1] =
133 {
134  { /* LockTupleKeyShare */
135  AccessShareLock,
136  MultiXactStatusForKeyShare,
137  -1 /* KeyShare does not allow updating tuples */
138  },
139  { /* LockTupleShare */
140  RowShareLock,
141  MultiXactStatusForShare,
142  -1 /* Share does not allow updating tuples */
143  },
144  { /* LockTupleNoKeyExclusive */
145  ExclusiveLock,
146  MultiXactStatusForNoKeyUpdate,
147  MultiXactStatusNoKeyUpdate
148  },
149  { /* LockTupleExclusive */
150  AccessExclusiveLock,
151  MultiXactStatusForUpdate,
152  MultiXactStatusUpdate
153  }
154 };
155 
156 /* Get the LOCKMODE for a given MultiXactStatus (via its LockTupleMode) */
157 #define LOCKMODE_from_mxstatus(status) \
158  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
159 
160 /*
161  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
162  * This is more readable than having every caller translate it to lock.h's
163  * LOCKMODE.
164  */
165 #define LockTupleTuplock(rel, tup, mode) \
166  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
167 #define UnlockTupleTuplock(rel, tup, mode) \
168  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
169 #define ConditionalLockTupleTuplock(rel, tup, mode) \
170  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
171 
172 #ifdef USE_PREFETCH
173 /*
174  * heap_index_delete_tuples and index_delete_prefetch_buffer use this
175  * structure to coordinate prefetching activity
176  */
177 typedef struct
178 {
179  BlockNumber cur_hblkno;
180  int next_item;
181  int ndeltids;
182  TM_IndexDelete *deltids;
183 } IndexDeletePrefetchState;
184 #endif
185 
186 /* heap_index_delete_tuples bottom-up index deletion costing constants */
187 #define BOTTOMUP_MAX_NBLOCKS 6
188 #define BOTTOMUP_TOLERANCE_NBLOCKS 3
189 
190 /*
191  * heap_index_delete_tuples uses this when determining which heap blocks it
192  * must visit to help its bottom-up index deletion caller
193  */
194 typedef struct IndexDeleteCounts
195 {
196  int16 npromisingtids; /* Number of "promising" TIDs in group */
197  int16 ntids; /* Number of TIDs in group */
198  int16 ifirsttid; /* Offset to group's first deltid */
199 } IndexDeleteCounts;
200 
201 /*
202  * This table maps tuple lock strength values for each particular
203  * MultiXactStatus value.
204  */
205 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
206 {
207  LockTupleKeyShare, /* ForKeyShare */
208  LockTupleShare, /* ForShare */
209  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
210  LockTupleExclusive, /* ForUpdate */
211  LockTupleNoKeyExclusive, /* NoKeyUpdate */
212  LockTupleExclusive /* Update */
213 };
214 
215 /* Get the LockTupleMode for a given MultiXactStatus */
216 #define TUPLOCK_from_mxstatus(status) \
217  (MultiXactStatusLock[(status)])
218 
219 /* ----------------------------------------------------------------
220  * heap support routines
221  * ----------------------------------------------------------------
222  */
223 
224 /* ----------------
225  * initscan - scan code common to heap_beginscan and heap_rescan
226  * ----------------
227  */
228 static void
229 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
230 {
231  ParallelBlockTableScanDesc bpscan = NULL;
232  bool allow_strat;
233  bool allow_sync;
234 
235  /*
236  * Determine the number of blocks we have to scan.
237  *
238  * It is sufficient to do this once at scan start, since any tuples added
239  * while the scan is in progress will be invisible to my snapshot anyway.
240  * (That is not true when using a non-MVCC snapshot. However, we couldn't
241  * guarantee to return tuples added after scan start anyway, since they
242  * might go into pages we already scanned. To guarantee consistent
243  * results for a non-MVCC snapshot, the caller must hold some higher-level
244  * lock that ensures the interesting tuple(s) won't change.)
245  */
246  if (scan->rs_base.rs_parallel != NULL)
247  {
248  bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
249  scan->rs_nblocks = bpscan->phs_nblocks;
250  }
251  else
252  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
253 
254  /*
255  * If the table is large relative to NBuffers, use a bulk-read access
256  * strategy and enable synchronized scanning (see syncscan.c). Although
257  * the thresholds for these features could be different, we make them the
258  * same so that there are only two behaviors to tune rather than four.
259  * (However, some callers need to be able to disable one or both of these
260  * behaviors, independently of the size of the table; also there is a GUC
261  * variable that can disable synchronized scanning.)
262  *
263  * Note that table_block_parallelscan_initialize has a very similar test;
264  * if you change this, consider changing that one, too.
265  */
266  if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
267  scan->rs_nblocks > NBuffers / 4)
268  {
269  allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
270  allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
271  }
272  else
273  allow_strat = allow_sync = false;
274 
275  if (allow_strat)
276  {
277  /* During a rescan, keep the previous strategy object. */
278  if (scan->rs_strategy == NULL)
279  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
280  }
281  else
282  {
283  if (scan->rs_strategy != NULL)
284  FreeAccessStrategy(scan->rs_strategy);
285  scan->rs_strategy = NULL;
286  }
287 
288  if (scan->rs_base.rs_parallel != NULL)
289  {
290  /* For parallel scan, believe whatever ParallelTableScanDesc says. */
291  if (scan->rs_base.rs_parallel->phs_syncscan)
292  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
293  else
294  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295  }
296  else if (keep_startblock)
297  {
298  /*
299  * When rescanning, we want to keep the previous startblock setting,
300  * so that rewinding a cursor doesn't generate surprising results.
301  * Reset the active syncscan setting, though.
302  */
303  if (allow_sync && synchronize_seqscans)
304  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
305  else
306  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
307  }
308  else if (allow_sync && synchronize_seqscans)
309  {
310  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
311  scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
312  }
313  else
314  {
315  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
316  scan->rs_startblock = 0;
317  }
318 
319  scan->rs_numblocks = InvalidBlockNumber;
320  scan->rs_inited = false;
321  scan->rs_ctup.t_data = NULL;
322  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
323  scan->rs_cbuf = InvalidBuffer;
324  scan->rs_cblock = InvalidBlockNumber;
325 
326  /* page-at-a-time fields are always invalid when not rs_inited */
327 
328  /*
329  * copy the scan key, if appropriate
330  */
331  if (key != NULL && scan->rs_base.rs_nkeys > 0)
332  memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
333 
334  /*
335  * Currently, we only have a stats counter for sequential heap scans (but
336  * e.g. for bitmap scans the underlying bitmap index scans will be counted,
337  * and for sample scans we update stats for tuple fetches).
338  */
339  if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
340  pgstat_count_heap_scan(scan->rs_base.rs_rd);
341 }
342 
343 /*
344  * heap_setscanlimits - restrict range of a heapscan
345  *
346  * startBlk is the page to start at
347  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
348  */
349 void
350 heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
351 {
352  HeapScanDesc scan = (HeapScanDesc) sscan;
353 
354  Assert(!scan->rs_inited); /* else too late to change */
355  /* else rs_startblock is significant */
356  Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
357 
358  /* Check startBlk is valid (but allow case of zero blocks...) */
359  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
360 
361  scan->rs_startblock = startBlk;
362  scan->rs_numblocks = numBlks;
363 }
364 
365 /*
366  * heapgetpage - subroutine for heapgettup()
367  *
368  * This routine reads and pins the specified page of the relation.
369  * In page-at-a-time mode it performs additional work, namely determining
370  * which tuples on the page are visible.
371  */
372 void
373 heapgetpage(TableScanDesc sscan, BlockNumber block)
374 {
375  HeapScanDesc scan = (HeapScanDesc) sscan;
376  Buffer buffer;
377  Snapshot snapshot;
378  Page page;
379  int lines;
380  int ntup;
381  OffsetNumber lineoff;
382  bool all_visible;
383 
384  Assert(block < scan->rs_nblocks);
385 
386  /* release previous scan buffer, if any */
387  if (BufferIsValid(scan->rs_cbuf))
388  {
389  ReleaseBuffer(scan->rs_cbuf);
390  scan->rs_cbuf = InvalidBuffer;
391  }
392 
393  /*
394  * Be sure to check for interrupts at least once per page. Checks at
395  * higher code levels won't be able to stop a seqscan that encounters many
396  * pages' worth of consecutive dead tuples.
397  */
398  CHECK_FOR_INTERRUPTS();
399 
400  /* read page using selected strategy */
401  scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block,
402  RBM_NORMAL, scan->rs_strategy);
403  scan->rs_cblock = block;
404 
405  if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
406  return;
407 
408  buffer = scan->rs_cbuf;
409  snapshot = scan->rs_base.rs_snapshot;
410 
411  /*
412  * Prune and repair fragmentation for the whole page, if possible.
413  */
414  heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
415 
416  /*
417  * We must hold share lock on the buffer content while examining tuple
418  * visibility. Afterwards, however, the tuples we have found to be
419  * visible are guaranteed good as long as we hold the buffer pin.
420  */
421  LockBuffer(buffer, BUFFER_LOCK_SHARE);
422 
423  page = BufferGetPage(buffer);
424  lines = PageGetMaxOffsetNumber(page);
425  ntup = 0;
426 
427  /*
428  * If the all-visible flag indicates that all tuples on the page are
429  * visible to everyone, we can skip the per-tuple visibility tests.
430  *
431  * Note: In hot standby, a tuple that's already visible to all
432  * transactions on the primary might still be invisible to a read-only
433  * transaction in the standby. We partly handle this problem by tracking
434  * the minimum xmin of visible tuples as the cut-off XID while marking a
435  * page all-visible on the primary and WAL log that along with the
436  * visibility map SET operation. In hot standby, we wait for (or abort)
437  * all transactions that potentially may not see one or more tuples on
438  * the page. That's how index-only scans work fine in hot standby. A
439  * crucial difference between index-only scans and heap scans is that the
440  * index-only scan completely relies on the visibility map whereas a heap
441  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
442  * the page-level flag can be trusted in the same way, because it might
443  * get propagated somehow without being explicitly WAL-logged, e.g. via a
444  * full page write. Until we can prove that beyond doubt, let's check each
445  * tuple for visibility the hard way.
446  */
447  all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
448 
449  for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++)
450  {
451  ItemId lpp = PageGetItemId(page, lineoff);
452  HeapTupleData loctup;
453  bool valid;
454 
455  if (!ItemIdIsNormal(lpp))
456  continue;
457 
458  loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
459  loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp);
460  loctup.t_len = ItemIdGetLength(lpp);
461  ItemPointerSet(&(loctup.t_self), block, lineoff);
462 
463  if (all_visible)
464  valid = true;
465  else
466  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
467 
468  HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
469  &loctup, buffer, snapshot);
470 
471  if (valid)
472  scan->rs_vistuples[ntup++] = lineoff;
473  }
474 
475  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
476 
477  Assert(ntup <= MaxHeapTuplesPerPage);
478  scan->rs_ntuples = ntup;
479 }
480 
481 /*
482  * heapgettup_initial_block - return the first BlockNumber to scan
483  *
484  * Returns InvalidBlockNumber when there are no blocks to scan. This can
485  * occur with empty tables and in parallel scans when parallel workers get all
486  * of the pages before we can get a chance to get our first page.
487  */
488 static BlockNumber
489 heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
490 {
491  Assert(!scan->rs_inited);
492 
493  /* When there are no pages to scan, return InvalidBlockNumber */
494  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
495  return InvalidBlockNumber;
496 
497  if (ScanDirectionIsForward(dir))
498  {
499  /* serial scan */
500  if (scan->rs_base.rs_parallel == NULL)
501  return scan->rs_startblock;
502  else
503  {
504  /* parallel scan */
505  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
506  scan->rs_parallelworkerdata,
507  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
508 
509  /* may return InvalidBlockNumber if there are no more blocks */
510  return table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
511  scan->rs_parallelworkerdata,
512  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
513  }
514  }
515  else
516  {
517  /* backward parallel scan not supported */
518  Assert(scan->rs_base.rs_parallel == NULL);
519 
520  /*
521  * Disable reporting to syncscan logic in a backwards scan; it's not
522  * very likely anyone else is doing the same thing at the same time,
523  * and much more likely that we'll just bollix things for forward
524  * scanners.
525  */
526  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
527 
528  /*
529  * Start from last page of the scan. Ensure we take into account
530  * rs_numblocks if it's been adjusted by heap_setscanlimits().
531  */
532  if (scan->rs_numblocks != InvalidBlockNumber)
533  return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
534 
535  if (scan->rs_startblock > 0)
536  return scan->rs_startblock - 1;
537 
538  return scan->rs_nblocks - 1;
539  }
540 }
541 
542 
543 /*
544  * heapgettup_start_page - helper function for heapgettup()
545  *
546  * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
547  * to the number of tuples on this page. Also set *lineoff to the first
548  * offset to scan with forward scans getting the first offset and backward
549  * getting the final offset on the page.
550  */
551 static Page
552 heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
553  OffsetNumber *lineoff)
554 {
555  Page page;
556 
557  Assert(scan->rs_inited);
558  Assert(BufferIsValid(scan->rs_cbuf));
559 
560  /* Caller is responsible for ensuring buffer is locked if needed */
561  page = BufferGetPage(scan->rs_cbuf);
562 
563  *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1;
564 
565  if (ScanDirectionIsForward(dir))
566  *lineoff = FirstOffsetNumber;
567  else
568  *lineoff = (OffsetNumber) (*linesleft);
569 
570  /* lineoff now references the first (forward) or last (backward) tid */
571  return page;
572 }
573 
574 
575 /*
576  * heapgettup_continue_page - helper function for heapgettup()
577  *
578  * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
579  * to the number of tuples left to scan on this page. Also set *lineoff to
580  * the next offset to scan according to the ScanDirection in 'dir'.
581  */
582 static inline Page
583 heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
584  OffsetNumber *lineoff)
585 {
586  Page page;
587 
588  Assert(scan->rs_inited);
589  Assert(BufferIsValid(scan->rs_cbuf));
590 
591  /* Caller is responsible for ensuring buffer is locked if needed */
592  page = BufferGetPage(scan->rs_cbuf);
593 
594  if (ScanDirectionIsForward(dir))
595  {
596  *lineoff = OffsetNumberNext(scan->rs_coffset);
597  *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
598  }
599  else
600  {
601  /*
602  * The previous returned tuple may have been vacuumed since the
603  * previous scan when we use a non-MVCC snapshot, so we must
604  * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
605  */
606  *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset));
607  *linesleft = *lineoff;
608  }
609 
610  /* lineoff now references the physically previous or next tid */
611  return page;
612 }
613 
614 /*
615  * heapgettup_advance_block - helper for heapgettup() and heapgettup_pagemode()
616  *
617  * Given the current block number, the scan direction, and various information
618  * contained in the scan descriptor, calculate the BlockNumber to scan next
619  * and return it. If there are no further blocks to scan, return
620  * InvalidBlockNumber to indicate this fact to the caller.
621  *
622  * This should not be called to determine the initial block number -- only for
623  * subsequent blocks.
624  *
625  * This also adjusts rs_numblocks when a limit has been imposed by
626  * heap_setscanlimits().
627  */
628 static inline BlockNumber
629 heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
630 {
631  if (ScanDirectionIsForward(dir))
632  {
633  if (scan->rs_base.rs_parallel == NULL)
634  {
635  block++;
636 
637  /* wrap back to the start of the heap */
638  if (block >= scan->rs_nblocks)
639  block = 0;
640 
641  /*
642  * Report our new scan position for synchronization purposes. We
643  * don't do that when moving backwards, however. That would just
644  * mess up any other forward-moving scanners.
645  *
646  * Note: we do this before checking for end of scan so that the
647  * final state of the position hint is back at the start of the
648  * rel. That's not strictly necessary, but otherwise when you run
649  * the same query multiple times the starting position would shift
650  * a little bit backwards on every invocation, which is confusing.
651  * We don't guarantee any specific ordering in general, though.
652  */
653  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
654  ss_report_location(scan->rs_base.rs_rd, block);
655 
656  /* we're done if we're back at where we started */
657  if (block == scan->rs_startblock)
658  return InvalidBlockNumber;
659 
660  /* check if the limit imposed by heap_setscanlimits() is met */
661  if (scan->rs_numblocks != InvalidBlockNumber)
662  {
663  if (--scan->rs_numblocks == 0)
664  return InvalidBlockNumber;
665  }
666 
667  return block;
668  }
669  else
670  {
671  return table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
672  scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc)
673  scan->rs_base.rs_parallel);
674  }
675  }
676  else
677  {
678  /* we're done if the last block is the start position */
679  if (block == scan->rs_startblock)
680  return InvalidBlockNumber;
681 
682  /* check if the limit imposed by heap_setscanlimits() is met */
683  if (scan->rs_numblocks != InvalidBlockNumber)
684  {
685  if (--scan->rs_numblocks == 0)
686  return InvalidBlockNumber;
687  }
688 
689  /* wrap to the end of the heap when the last page was page 0 */
690  if (block == 0)
691  block = scan->rs_nblocks;
692 
693  block--;
694 
695  return block;
696  }
697 }
698 
699 /* ----------------
700  * heapgettup - fetch next heap tuple
701  *
702  * Initialize the scan if not already done; then advance to the next
703  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
704  * or set scan->rs_ctup.t_data = NULL if no more tuples.
705  *
706  * Note: the reason nkeys/key are passed separately, even though they are
707  * kept in the scan descriptor, is that the caller may not want us to check
708  * the scankeys.
709  *
710  * Note: when we fall off the end of the scan in either direction, we
711  * reset rs_inited. This means that a further request with the same
712  * scan direction will restart the scan, which is a bit odd, but a
713  * request with the opposite scan direction will start a fresh scan
714  * in the proper direction. The latter is required behavior for cursors,
715  * while the former case is generally undefined behavior in Postgres
716  * so we don't care too much.
717  * ----------------
718  */
719 static void
720 heapgettup(HeapScanDesc scan,
721  ScanDirection dir,
722  int nkeys,
723  ScanKey key)
724 {
725  HeapTuple tuple = &(scan->rs_ctup);
726  BlockNumber block;
727  Page page;
728  OffsetNumber lineoff;
729  int linesleft;
730 
731  if (unlikely(!scan->rs_inited))
732  {
733  block = heapgettup_initial_block(scan, dir);
734  /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */
735  Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf));
736  scan->rs_inited = true;
737  }
738  else
739  {
740  /* continue from previously returned page/tuple */
741  block = scan->rs_cblock;
742 
743  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
744  page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
745  goto continue_page;
746  }
747 
748  /*
749  * advance the scan until we find a qualifying tuple or run out of stuff
750  * to scan
751  */
752  while (block != InvalidBlockNumber)
753  {
754  heapgetpage((TableScanDesc) scan, block);
755  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
756  page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
757 continue_page:
758 
759  /*
760  * Only continue scanning the page while we have lines left.
761  *
762  * Note that this protects us from accessing line pointers past
763  * PageGetMaxOffsetNumber(); both for forward scans when we resume the
764  * table scan, and for when we start scanning a new page.
765  */
766  for (; linesleft > 0; linesleft--, lineoff += dir)
767  {
768  bool visible;
769  ItemId lpp = PageGetItemId(page, lineoff);
770 
771  if (!ItemIdIsNormal(lpp))
772  continue;
773 
774  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
775  tuple->t_len = ItemIdGetLength(lpp);
776  ItemPointerSet(&(tuple->t_self), block, lineoff);
777 
778  visible = HeapTupleSatisfiesVisibility(tuple,
779  scan->rs_base.rs_snapshot,
780  scan->rs_cbuf);
781 
782  HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd,
783  tuple, scan->rs_cbuf,
784  scan->rs_base.rs_snapshot);
785 
786  /* skip tuples not visible to this snapshot */
787  if (!visible)
788  continue;
789 
790  /* skip any tuples that don't match the scan key */
791  if (key != NULL &&
792  !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
793  nkeys, key))
794  continue;
795 
796  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
797  scan->rs_coffset = lineoff;
798  return;
799  }
800 
801  /*
802  * if we get here, it means we've exhausted the items on this page and
803  * it's time to move to the next.
804  */
805  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
806 
807  /* get the BlockNumber to scan next */
808  block = heapgettup_advance_block(scan, block, dir);
809  }
810 
811  /* end of scan */
812  if (BufferIsValid(scan->rs_cbuf))
813  ReleaseBuffer(scan->rs_cbuf);
814 
815  scan->rs_cbuf = InvalidBuffer;
816  scan->rs_cblock = InvalidBlockNumber;
817  tuple->t_data = NULL;
818  scan->rs_inited = false;
819 }
820 
821 /* ----------------
822  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
823  *
824  * Same API as heapgettup, but used in page-at-a-time mode
825  *
826  * The internal logic is much the same as heapgettup's too, but there are some
827  * differences: we do not take the buffer content lock (that only needs to
828  * happen inside heapgetpage), and we iterate through just the tuples listed
829  * in rs_vistuples[] rather than all tuples on the page. Notice that
830  * lineindex is 0-based, where the corresponding loop variable lineoff in
831  * heapgettup is 1-based.
832  * ----------------
833  */
834 static void
835 heapgettup_pagemode(HeapScanDesc scan,
836  ScanDirection dir,
837  int nkeys,
838  ScanKey key)
839 {
840  HeapTuple tuple = &(scan->rs_ctup);
841  BlockNumber block;
842  Page page;
843  int lineindex;
844  int linesleft;
845 
846  if (unlikely(!scan->rs_inited))
847  {
848  block = heapgettup_initial_block(scan, dir);
849  /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */
850  Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf));
851  scan->rs_inited = true;
852  }
853  else
854  {
855  /* continue from previously returned page/tuple */
856  block = scan->rs_cblock; /* current page */
857  page = BufferGetPage(scan->rs_cbuf);
858 
859  lineindex = scan->rs_cindex + dir;
860  if (ScanDirectionIsForward(dir))
861  linesleft = scan->rs_ntuples - lineindex;
862  else
863  linesleft = scan->rs_cindex;
864  /* lineindex now references the next or previous visible tid */
865 
866  goto continue_page;
867  }
868 
869  /*
870  * advance the scan until we find a qualifying tuple or run out of stuff
871  * to scan
872  */
873  while (block != InvalidBlockNumber)
874  {
875  heapgetpage((TableScanDesc) scan, block);
876  page = BufferGetPage(scan->rs_cbuf);
877  linesleft = scan->rs_ntuples;
878  lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1;
879 
880  /* lineindex now references the next or previous visible tid */
881 continue_page:
882 
883  for (; linesleft > 0; linesleft--, lineindex += dir)
884  {
885  ItemId lpp;
886  OffsetNumber lineoff;
887 
888  lineoff = scan->rs_vistuples[lineindex];
889  lpp = PageGetItemId(page, lineoff);
890  Assert(ItemIdIsNormal(lpp));
891 
892  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
893  tuple->t_len = ItemIdGetLength(lpp);
894  ItemPointerSet(&(tuple->t_self), block, lineoff);
895 
896  /* skip any tuples that don't match the scan key */
897  if (key != NULL &&
898  !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
899  nkeys, key))
900  continue;
901 
902  scan->rs_cindex = lineindex;
903  return;
904  }
905 
906  /* get the BlockNumber to scan next */
907  block = heapgettup_advance_block(scan, block, dir);
908  }
909 
910  /* end of scan */
911  if (BufferIsValid(scan->rs_cbuf))
912  ReleaseBuffer(scan->rs_cbuf);
913  scan->rs_cbuf = InvalidBuffer;
914  scan->rs_cblock = InvalidBlockNumber;
915  tuple->t_data = NULL;
916  scan->rs_inited = false;
917 }
918 
919 
920 /* ----------------------------------------------------------------
921  * heap access method interface
922  * ----------------------------------------------------------------
923  */
924 
925 
926 TableScanDesc
927 heap_beginscan(Relation relation, Snapshot snapshot,
928  int nkeys, ScanKey key,
929  ParallelTableScanDesc parallel_scan,
930  uint32 flags)
931 {
932  HeapScanDesc scan;
933 
934  /*
935  * increment relation ref count while scanning relation
936  *
937  * This is just to make really sure the relcache entry won't go away while
938  * the scan has a pointer to it. Caller should be holding the rel open
939  * anyway, so this is redundant in all normal scenarios...
940  */
941  RelationIncrementReferenceCount(relation);
942 
943  /*
944  * allocate and initialize scan descriptor
945  */
946  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
947 
948  scan->rs_base.rs_rd = relation;
949  scan->rs_base.rs_snapshot = snapshot;
950  scan->rs_base.rs_nkeys = nkeys;
951  scan->rs_base.rs_flags = flags;
952  scan->rs_base.rs_parallel = parallel_scan;
953  scan->rs_strategy = NULL; /* set in initscan */
954 
955  /*
956  * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
957  */
958  if (!(snapshot && IsMVCCSnapshot(snapshot)))
959  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
960 
961  /*
962  * For seqscan and sample scans in a serializable transaction, acquire a
963  * predicate lock on the entire relation. This is required not only to
964  * lock all the matching tuples, but also to conflict with new insertions
965  * into the table. In an indexscan, we take page locks on the index pages
966  * covering the range specified in the scan qual, but in a heap scan there
967  * is nothing more fine-grained to lock. A bitmap scan is a different
968  * story, there we have already scanned the index and locked the index
969  * pages covering the predicate. But in that case we still have to lock
970  * any matching heap tuples. For sample scan we could optimize the locking
971  * to be at least page-level granularity, but we'd need to add per-tuple
972  * locking for that.
973  */
974  if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
975  {
976  /*
977  * Ensure a missing snapshot is noticed reliably, even if the
978  * isolation mode means predicate locking isn't performed (and
979  * therefore the snapshot isn't used here).
980  */
981  Assert(snapshot);
982  PredicateLockRelation(relation, snapshot);
983  }
984 
985  /* we only need to set this up once */
986  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
987 
988  /*
989  * Allocate memory to keep track of page allocation for parallel workers
990  * when doing a parallel scan.
991  */
992  if (parallel_scan != NULL)
993  scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
994  else
995  scan->rs_parallelworkerdata = NULL;
996 
997  /*
998  * we do this here instead of in initscan() because heap_rescan also calls
999  * initscan() and we don't want to allocate memory again
1000  */
1001  if (nkeys > 0)
1002  scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1003  else
1004  scan->rs_base.rs_key = NULL;
1005 
1006  initscan(scan, key, false);
1007 
1008  return (TableScanDesc) scan;
1009 }
1010 
1011 void
1012 heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1013  bool allow_strat, bool allow_sync, bool allow_pagemode)
1014 {
1015  HeapScanDesc scan = (HeapScanDesc) sscan;
1016 
1017  if (set_params)
1018  {
1019  if (allow_strat)
1020  scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1021  else
1022  scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1023 
1024  if (allow_sync)
1025  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1026  else
1027  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1028 
1029  if (allow_pagemode && scan->rs_base.rs_snapshot &&
1032  else
1034  }
1035 
1036  /*
1037  * unpin scan buffers
1038  */
1039  if (BufferIsValid(scan->rs_cbuf))
1040  ReleaseBuffer(scan->rs_cbuf);
1041 
1042  /*
1043  * reinitialize scan descriptor
1044  */
1045  initscan(scan, key, true);
1046 }
1047 
1048 void
1050 {
1051  HeapScanDesc scan = (HeapScanDesc) sscan;
1052 
1053  /* Note: no locking manipulations needed */
1054 
1055  /*
1056  * unpin scan buffers
1057  */
1058  if (BufferIsValid(scan->rs_cbuf))
1059  ReleaseBuffer(scan->rs_cbuf);
1060 
1061  /*
1062  * decrement relation reference count and free scan descriptor storage
1063  */
1065 
1066  if (scan->rs_base.rs_key)
1067  pfree(scan->rs_base.rs_key);
1068 
1069  if (scan->rs_strategy != NULL)
1071 
1072  if (scan->rs_parallelworkerdata != NULL)
1074 
1075  if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1077 
1078  pfree(scan);
1079 }
1080 
1081 HeapTuple
1083 {
1084  HeapScanDesc scan = (HeapScanDesc) sscan;
1085 
1086  /*
1087  * This is still widely used directly, without going through table AM, so
1088  * add a safety check. It's possible we should, at a later point,
1089  * downgrade this to an assert. The reason for checking the AM routine,
1090  * rather than the AM oid, is that this allows to write regression tests
1091  * that create another AM reusing the heap handler.
1092  */
1094  ereport(ERROR,
1095  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1096  errmsg_internal("only heap AM is supported")));
1097 
1098  /*
1099  * We don't expect direct calls to heap_getnext with valid CheckXidAlive
1100  * for catalog or regular tables. See detailed comments in xact.c where
1101  * these variables are declared. Normally we have such a check at tableam
1102  * level API but this is called from many places so we need to ensure it
1103  * here.
1104  */
1106  elog(ERROR, "unexpected heap_getnext call during logical decoding");
1107 
1108  /* Note: no locking manipulations needed */
1109 
1110  if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1111  heapgettup_pagemode(scan, direction,
1112  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1113  else
1114  heapgettup(scan, direction,
1115  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1116 
1117  if (scan->rs_ctup.t_data == NULL)
1118  return NULL;
1119 
1120  /*
1121  * if we get here it means we have a new current scan tuple, so point to
1122  * the proper return buffer and return the tuple.
1123  */
1124 
1126 
1127  return &scan->rs_ctup;
1128 }
1129 
/*
 * Advance the scan by one tuple in the given direction and hand the result
 * back through the caller's slot.
 *
 * Returns true with the current scan tuple stored in slot (together with the
 * scan's current buffer), or false with slot cleared when the fetch leaves
 * no current tuple.
 *
 * NOTE(review): this listing skips original lines 1131 and 1153, so the line
 * carrying the function name and parameters, and one statement ahead of the
 * slot store, are not shown here.
 */
1130 bool
1132 {
1133  HeapScanDesc scan = (HeapScanDesc) sscan;
1134 
1135  /* Note: no locking manipulations needed */
1136 
	/* SO_ALLOW_PAGEMODE selects which of the two fetch routines is used */
1137  if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1138  heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1139  else
1140  heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1141 
	/* a NULL t_data after the fetch is treated as "no tuple to return" */
1142  if (scan->rs_ctup.t_data == NULL)
1143  {
1144  ExecClearTuple(slot);
1145  return false;
1146  }
1147 
1148  /*
1149  * if we get here it means we have a new current scan tuple, so point to
1150  * the proper return buffer and return the tuple.
1151  */
1152 
1154 
1155  ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1156  scan->rs_cbuf);
1157  return true;
1158 }
1159 
/*
 * Narrow the scan to the blocks that can hold TIDs between mintid and maxtid.
 *
 * Both bounds are first clamped to the first and last possible TIDs of the
 * relation, as given by scan->rs_nblocks.  The resulting block range is
 * passed to heap_setscanlimits() and the clamped bounds are saved in
 * sscan->rs_mintid and sscan->rs_maxtid.  A relation with no blocks returns
 * before anything is set; an inverted range sets a zero-block limit and
 * returns without saving the bounds.
 *
 * NOTE(review): original line 1161, which carries the function name and the
 * leading parameters, is absent from this listing.
 */
1160 void
1162  ItemPointer maxtid)
1163 {
1164  HeapScanDesc scan = (HeapScanDesc) sscan;
1165  BlockNumber startBlk;
1166  BlockNumber numBlks;
1167  ItemPointerData highestItem;
1168  ItemPointerData lowestItem;
1169 
1170  /*
1171  * For relations without any pages, we can simply leave the TID range
1172  * unset. There will be no tuples to scan, therefore no tuples outside
1173  * the given TID range.
1174  */
1175  if (scan->rs_nblocks == 0)
1176  return;
1177 
1178  /*
1179  * Set up some ItemPointers which point to the first and last possible
1180  * tuples in the heap.
1181  */
1182  ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
1183  ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
1184 
1185  /*
1186  * If the given maximum TID is below the highest possible TID in the
1187  * relation, then restrict the range to that, otherwise we scan to the end
1188  * of the relation.
1189  */
1190  if (ItemPointerCompare(maxtid, &highestItem) < 0)
1191  ItemPointerCopy(maxtid, &highestItem);
1192 
1193  /*
1194  * If the given minimum TID is above the lowest possible TID in the
1195  * relation, then restrict the range to only scan for TIDs above that.
1196  */
1197  if (ItemPointerCompare(mintid, &lowestItem) > 0)
1198  ItemPointerCopy(mintid, &lowestItem);
1199 
1200  /*
1201  * Check for an empty range and guard against a would-be negative result
1202  * from the numBlks calculation below.
1203  */
1204  if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
1205  {
1206  /* Set an empty range of blocks to scan */
1207  heap_setscanlimits(sscan, 0, 0);
1208  return;
1209  }
1210 
1211  /*
1212  * Calculate the first block and the number of blocks we must scan. We
1213  * could be more aggressive here and perform some more validation to try
1214  * and further narrow the scope of blocks to scan by checking if the
1215  * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1216  * advance startBlk by one. Likewise, if highestItem has an offset of 0
1217  * we could scan one fewer blocks. However, such an optimization does not
1218  * seem worth troubling over, currently.
1219  */
1220  startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
1221 
	/* the block range is inclusive at both ends, hence the + 1 */
1222  numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
1223  ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
1224 
1225  /* Set the start block and number of blocks to scan */
1226  heap_setscanlimits(sscan, startBlk, numBlks);
1227 
1228  /* Finally, set the TID range in sscan */
1229  ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
1230  ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
1231 }
1232 
/*
 * Fetch the next scan tuple whose TID lies between sscan->rs_mintid and
 * sscan->rs_maxtid (both inclusive) and store it in slot.
 *
 * Tuples are pulled one at a time in the given direction and those outside
 * the saved bounds are skipped.  Once a tuple lies past the bound the scan
 * is heading toward, no later tuple can qualify, so the slot is cleared and
 * false is returned.  False is likewise returned, with the slot cleared,
 * when the fetch leaves no current tuple.  On success the tuple is stored in
 * slot along with the scan's current buffer and true is returned.
 *
 * NOTE(review): this listing skips original lines 1234 and 1301, so the line
 * carrying the function name and leading parameters, and one statement ahead
 * of the slot store, are not shown here.
 */
1233 bool
1235  TupleTableSlot *slot)
1236 {
1237  HeapScanDesc scan = (HeapScanDesc) sscan;
1238  ItemPointer mintid = &sscan->rs_mintid;
1239  ItemPointer maxtid = &sscan->rs_maxtid;
1240 
1241  /* Note: no locking manipulations needed */
	/* loop until a tuple inside the TID bounds turns up or the scan ends */
1242  for (;;)
1243  {
1244  if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1245  heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1246  else
1247  heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1248 
1249  if (scan->rs_ctup.t_data == NULL)
1250  {
1251  ExecClearTuple(slot);
1252  return false;
1253  }
1254 
1255  /*
1256  * heap_set_tidrange will have used heap_setscanlimits to limit the
1257  * range of pages we scan to only ones that can contain the TID range
1258  * we're scanning for. Here we must filter out any tuples from these
1259  * pages that are outside of that range.
1260  */
1261  if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1262  {
1263  ExecClearTuple(slot);
1264 
1265  /*
1266  * When scanning backwards, the TIDs will be in descending order.
1267  * Future tuples in this direction will be lower still, so we can
1268  * just return false to indicate there will be no more tuples.
1269  */
1270  if (ScanDirectionIsBackward(direction))
1271  return false;
1272 
1273  continue;
1274  }
1275 
1276  /*
1277  * Likewise for the final page, we must filter out TIDs greater than
1278  * maxtid.
1279  */
1280  if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1281  {
1282  ExecClearTuple(slot);
1283 
1284  /*
1285  * When scanning forward, the TIDs will be in ascending order.
1286  * Future tuples in this direction will be higher still, so we can
1287  * just return false to indicate there will be no more tuples.
1288  */
1289  if (ScanDirectionIsForward(direction))
1290  return false;
1291  continue;
1292  }
1293 
	/* the tuple is within both bounds: stop looking */
1294  break;
1295  }
1296 
1297  /*
1298  * if we get here it means we have a new current scan tuple, so point to
1299  * the proper return buffer and return the tuple.
1300  */
1302 
1303  ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1304  return true;
1305 }
1306 
1307 /*
1308  * heap_fetch - retrieve tuple with given tid
1309  *
1310  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1311  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1312  * against the specified snapshot.
1313  *
1314  * If successful (tuple found and passes snapshot time qual), then *userbuf
1315  * is set to the buffer holding the tuple and true is returned. The caller
1316  * must unpin the buffer when done with the tuple.
1317  *
1318  * If the tuple is not found (ie, item number references a deleted slot),
1319  * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1320  * and false is returned.
1321  *
1322  * If the tuple is found but fails the time qual check, then the behavior
1323  * depends on the keep_buf parameter. If keep_buf is false, the results
1324  * are the same as for the tuple-not-found case. If keep_buf is true,
1325  * then tuple->t_data and *userbuf are returned as for the success case,
1326  * and again the caller must unpin the buffer; but false is returned.
1327  *
1328  * heap_fetch does not follow HOT chains: only the exact TID requested will
1329  * be fetched.
1330  *
1331  * It is somewhat inconsistent that we ereport() on invalid block number but
1332  * return false on invalid item number. There are a couple of reasons though.
1333  * One is that the caller can relatively easily check the block number for
1334  * validity, but cannot check the item number without reading the page
1335  * itself. Another is that when we are following a t_ctid link, we can be
1336  * reasonably confident that the page number is valid (since VACUUM shouldn't
1337  * truncate off the destination page without having killed the referencing
1338  * tuple first), but the item number might well not be good.
1339  */
/*
 * NOTE(review): original line 1341, which carries the function name and the
 * first parameter, is absent from this listing.
 */
1340 bool
1342  Snapshot snapshot,
1343  HeapTuple tuple,
1344  Buffer *userbuf,
1345  bool keep_buf)
1346 {
1347  ItemPointer tid = &(tuple->t_self);
1348  ItemId lp;
1349  Buffer buffer;
1350  Page page;
1351  OffsetNumber offnum;
1352  bool valid;
1353 
1354  /*
1355  * Fetch and pin the appropriate page of the relation.
1356  */
1357  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1358 
1359  /*
1360  * Need share lock on buffer to examine tuple commit status.
1361  */
1362  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1363  page = BufferGetPage(buffer);
1364 
1365  /*
1366  * We'd better check for out-of-range offnum in case of VACUUM since the
1367  * TID was obtained.
1368  */
1369  offnum = ItemPointerGetOffsetNumber(tid);
1370  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1371  {
	/* not found: give up both the lock and the pin, and clear the outputs */
1372  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1373  ReleaseBuffer(buffer);
1374  *userbuf = InvalidBuffer;
1375  tuple->t_data = NULL;
1376  return false;
1377  }
1378 
1379  /*
1380  * get the item line pointer corresponding to the requested tid
1381  */
1382  lp = PageGetItemId(page, offnum);
1383 
1384  /*
1385  * Must check for deleted tuple.
1386  */
1387  if (!ItemIdIsNormal(lp))
1388  {
	/* same not-found exit as for an out-of-range offset */
1389  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1390  ReleaseBuffer(buffer);
1391  *userbuf = InvalidBuffer;
1392  tuple->t_data = NULL;
1393  return false;
1394  }
1395 
1396  /*
1397  * fill in *tuple fields
1398  */
1399  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1400  tuple->t_len = ItemIdGetLength(lp);
1401  tuple->t_tableOid = RelationGetRelid(relation);
1402 
1403  /*
1404  * check tuple visibility, then release lock
1405  */
1406  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1407 
	/* only a tuple that passed the visibility check gets a predicate lock */
1408  if (valid)
1409  PredicateLockTID(relation, &(tuple->t_self), snapshot,
1410  HeapTupleHeaderGetXmin(tuple->t_data));
1411 
	/* called for visible and invisible tuples alike; valid is passed along */
1412  HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1413 
	/* only the content lock is dropped here; the pin is dealt with below */
1414  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1415 
1416  if (valid)
1417  {
1418  /*
1419  * All checks passed, so return the tuple as valid. Caller is now
1420  * responsible for releasing the buffer.
1421  */
1422  *userbuf = buffer;
1423 
1424  return true;
1425  }
1426 
1427  /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1428  if (keep_buf)
1429  *userbuf = buffer;
1430  else
1431  {
1432  ReleaseBuffer(buffer);
1433  *userbuf = InvalidBuffer;
1434  tuple->t_data = NULL;
1435  }
1436 
1437  return false;
1438 }
1439 
1440 /*
1441  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1442  *
1443  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1444  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1445  * for the first chain member satisfying the given snapshot. If one is
1446  * found, we update *tid to reference that tuple's offset number, and
1447  * return true. If no match, return false without modifying *tid.
1448  *
1449  * heapTuple is a caller-supplied buffer. When a match is found, we return
1450  * the tuple here, in addition to updating *tid. If no match is found, the
1451  * contents of this buffer on return are undefined.
1452  *
1453  * If all_dead is not NULL, we check non-visible tuples to see if they are
1454  * globally dead; *all_dead is set true if all members of the HOT chain
1455  * are vacuumable, false if not.
1456  *
1457  * Unlike heap_fetch, the caller must already have pin and (at least) share
1458  * lock on the buffer; it is still pinned/locked at exit.
1459  */
1460 bool
1462  Snapshot snapshot, HeapTuple heapTuple,
1463  bool *all_dead, bool first_call)
1464 {
1465  Page page = BufferGetPage(buffer);
1466  TransactionId prev_xmax = InvalidTransactionId;
1467  BlockNumber blkno;
1468  OffsetNumber offnum;
1469  bool at_chain_start;
1470  bool valid;
1471  bool skip;
1472  GlobalVisState *vistest = NULL;
1473 
1474  /* If this is not the first call, previous call returned a (live!) tuple */
1475  if (all_dead)
1476  *all_dead = first_call;
1477 
1478  blkno = ItemPointerGetBlockNumber(tid);
1479  offnum = ItemPointerGetOffsetNumber(tid);
1480  at_chain_start = first_call;
1481  skip = !first_call;
1482 
1483  /* XXX: we should assert that a snapshot is pushed or registered */
1485  Assert(BufferGetBlockNumber(buffer) == blkno);
1486 
1487  /* Scan through possible multiple members of HOT-chain */
1488  for (;;)
1489  {
1490  ItemId lp;
1491 
1492  /* check for bogus TID */
1493  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1494  break;
1495 
1496  lp = PageGetItemId(page, offnum);
1497 
1498  /* check for unused, dead, or redirected items */
1499  if (!ItemIdIsNormal(lp))
1500  {
1501  /* We should only see a redirect at start of chain */
1502  if (ItemIdIsRedirected(lp) && at_chain_start)
1503  {
1504  /* Follow the redirect */
1505  offnum = ItemIdGetRedirect(lp);
1506  at_chain_start = false;
1507  continue;
1508  }
1509  /* else must be end of chain */
1510  break;
1511  }
1512 
1513  /*
1514  * Update heapTuple to point to the element of the HOT chain we're
1515  * currently investigating. Having t_self set correctly is important
1516  * because the SSI checks and the *Satisfies routine for historical
1517  * MVCC snapshots need the correct tid to decide about the visibility.
1518  */
1519  heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1520  heapTuple->t_len = ItemIdGetLength(lp);
1521  heapTuple->t_tableOid = RelationGetRelid(relation);
1522  ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1523 
1524  /*
1525  * Shouldn't see a HEAP_ONLY tuple at chain start.
1526  */
1527  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1528  break;
1529 
1530  /*
1531  * The xmin should match the previous xmax value, else chain is
1532  * broken.
1533  */
1534  if (TransactionIdIsValid(prev_xmax) &&
1535  !TransactionIdEquals(prev_xmax,
1536  HeapTupleHeaderGetXmin(heapTuple->t_data)))
1537  break;
1538 
1539  /*
1540  * When first_call is true (and thus, skip is initially false) we'll
1541  * return the first tuple we find. But on later passes, heapTuple
1542  * will initially be pointing to the tuple we returned last time.
1543  * Returning it again would be incorrect (and would loop forever), so
1544  * we skip it and return the next match we find.
1545  */
1546  if (!skip)
1547  {
1548  /* If it's visible per the snapshot, we must return it */
1549  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1550  HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1551  buffer, snapshot);
1552 
1553  if (valid)
1554  {
1555  ItemPointerSetOffsetNumber(tid, offnum);
1556  PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1557  HeapTupleHeaderGetXmin(heapTuple->t_data));
1558  if (all_dead)
1559  *all_dead = false;
1560  return true;
1561  }
1562  }
1563  skip = false;
1564 
1565  /*
1566  * If we can't see it, maybe no one else can either. At caller
1567  * request, check whether all chain members are dead to all
1568  * transactions.
1569  *
1570  * Note: if you change the criterion here for what is "dead", fix the
1571  * planner's get_actual_variable_range() function to match.
1572  */
1573  if (all_dead && *all_dead)
1574  {
1575  if (!vistest)
1576  vistest = GlobalVisTestFor(relation);
1577 
1578  if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1579  *all_dead = false;
1580  }
1581 
1582  /*
1583  * Check to see if HOT chain continues past this tuple; if so fetch
1584  * the next offnum and loop around.
1585  */
1586  if (HeapTupleIsHotUpdated(heapTuple))
1587  {
1589  blkno);
1590  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1591  at_chain_start = false;
1592  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1593  }
1594  else
1595  break; /* end of chain */
1596  }
1597 
1598  return false;
1599 }
1600 
1601 /*
1602  * heap_get_latest_tid - get the latest tid of a specified tuple
1603  *
1604  * Actually, this gets the latest version that is visible according to the
1605  * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1606  * possibly uncommitted version.
1607  *
1608  * *tid is both an input and an output parameter: it is updated to
1609  * show the latest version of the row. Note that it will not be changed
1610  * if no version of the row passes the snapshot test.
1611  */
1612 void
1614  ItemPointer tid)
1615 {
1616  Relation relation = sscan->rs_rd;
1617  Snapshot snapshot = sscan->rs_snapshot;
1618  ItemPointerData ctid;
1619  TransactionId priorXmax;
1620 
1621  /*
1622  * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1623  * Assume that t_ctid links are valid however - there shouldn't be invalid
1624  * ones in the table.
1625  */
1626  Assert(ItemPointerIsValid(tid));
1627 
1628  /*
1629  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1630  * need to examine, and *tid is the TID we will return if ctid turns out
1631  * to be bogus.
1632  *
1633  * Note that we will loop until we reach the end of the t_ctid chain.
1634  * Depending on the snapshot passed, there might be at most one visible
1635  * version of the row, but we don't try to optimize for that.
1636  */
1637  ctid = *tid;
1638  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1639  for (;;)
1640  {
1641  Buffer buffer;
1642  Page page;
1643  OffsetNumber offnum;
1644  ItemId lp;
1645  HeapTupleData tp;
1646  bool valid;
1647 
1648  /*
1649  * Read, pin, and lock the page.
1650  */
1651  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1652  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1653  page = BufferGetPage(buffer);
1654 
1655  /*
1656  * Check for bogus item number. This is not treated as an error
1657  * condition because it can happen while following a t_ctid link. We
1658  * just assume that the prior tid is OK and return it unchanged.
1659  */
1660  offnum = ItemPointerGetOffsetNumber(&ctid);
1661  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1662  {
1663  UnlockReleaseBuffer(buffer);
1664  break;
1665  }
1666  lp = PageGetItemId(page, offnum);
1667  if (!ItemIdIsNormal(lp))
1668  {
1669  UnlockReleaseBuffer(buffer);
1670  break;
1671  }
1672 
1673  /* OK to access the tuple */
1674  tp.t_self = ctid;
1675  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1676  tp.t_len = ItemIdGetLength(lp);
1677  tp.t_tableOid = RelationGetRelid(relation);
1678 
1679  /*
1680  * After following a t_ctid link, we might arrive at an unrelated
1681  * tuple. Check for XMIN match.
1682  */
1683  if (TransactionIdIsValid(priorXmax) &&
1685  {
1686  UnlockReleaseBuffer(buffer);
1687  break;
1688  }
1689 
1690  /*
1691  * Check tuple visibility; if visible, set it as the new result
1692  * candidate.
1693  */
1694  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1695  HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1696  if (valid)
1697  *tid = ctid;
1698 
1699  /*
1700  * If there's a valid t_ctid link, follow it, else we're done.
1701  */
1702  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1706  {
1707  UnlockReleaseBuffer(buffer);
1708  break;
1709  }
1710 
1711  ctid = tp.t_data->t_ctid;
1712  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1713  UnlockReleaseBuffer(buffer);
1714  } /* end of loop */
1715 }
1716 
1717 
1718 /*
1719  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1720  *
1721  * This is called after we have waited for the XMAX transaction to terminate.
1722  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1723  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1724  * hint bit if possible --- but beware that that may not yet be possible,
1725  * if the transaction committed asynchronously.
1726  *
1727  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1728  * even if it commits.
1729  *
1730  * Hence callers should look only at XMAX_INVALID.
1731  *
1732  * Note this is not allowed for tuples whose xmax is a multixact.
1733  */
1734 static void
1736 {
1738  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1739 
1740  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1741  {
1742  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1745  xid);
1746  else
1747  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1749  }
1750 }
1751 
1752 
1753 /*
1754  * GetBulkInsertState - prepare status object for a bulk insert
1755  */
1758 {
1759  BulkInsertState bistate;
1760 
1761  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1763  bistate->current_buf = InvalidBuffer;
1764  bistate->next_free = InvalidBlockNumber;
1765  bistate->last_free = InvalidBlockNumber;
1766  bistate->already_extended_by = 0;
1767  return bistate;
1768 }
1769 
1770 /*
1771  * FreeBulkInsertState - clean up after finishing a bulk insert
1772  */
/*
 * NOTE(review): original line 1774, which carries the function name and its
 * parameter list, is absent from this listing.
 */
1773 void
1775 {
	/* drop the buffer the state is still holding, if there is one */
1776  if (bistate->current_buf != InvalidBuffer)
1777  ReleaseBuffer(bistate->current_buf);
	/* then free the access strategy and finally the state object itself */
1778  FreeAccessStrategy(bistate->strategy);
1779  pfree(bistate);
1780 }
1781 
1782 /*
1783  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1784  */
/*
 * NOTE(review): original line 1786, which carries the function name and its
 * parameter list, is absent from this listing.
 */
1785 void
1787 {
	/* release the held buffer, if any, and mark the state as holding none */
1788  if (bistate->current_buf != InvalidBuffer)
1789  ReleaseBuffer(bistate->current_buf);
1790  bistate->current_buf = InvalidBuffer;
1791 
1792  /*
1793  * Despite the name, we also reset bulk relation extension state.
1794  * Otherwise we can end up erroring out due to looking for free space in
1795  * ->next_free of one partition, even though ->next_free was set when
1796  * extending another partition. It could obviously also be bad for
1797  * efficiency to look at existing blocks at offsets from another
1798  * partition, even if we don't error out.
1799  */
1800  bistate->next_free = InvalidBlockNumber;
1801  bistate->last_free = InvalidBlockNumber;
1802 }
1803 
1804 
1805 /*
1806  * heap_insert - insert tuple into a heap
1807  *
1808  * The new tuple is stamped with current transaction ID and the specified
1809  * command ID.
1810  *
1811  * See table_tuple_insert for comments about most of the input flags, except
1812  * that this routine directly takes a tuple rather than a slot.
1813  *
1814  * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
1815  * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
1816  * implement table_tuple_insert_speculative().
1817  *
1818  * On return the header fields of *tup are updated to match the stored tuple;
1819  * in particular tup->t_self receives the actual TID where the tuple was
1820  * stored. But note that any toasting of fields within the tuple data is NOT
1821  * reflected into *tup.
1822  */
1823 void
1825  int options, BulkInsertState bistate)
1826 {
1828  HeapTuple heaptup;
1829  Buffer buffer;
1830  Buffer vmbuffer = InvalidBuffer;
1831  bool all_visible_cleared = false;
1832 
1833  /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
1835  RelationGetNumberOfAttributes(relation));
1836 
1837  /*
1838  * Fill in tuple header fields and toast the tuple if necessary.
1839  *
1840  * Note: below this point, heaptup is the data we actually intend to store
1841  * into the relation; tup is the caller's original untoasted data.
1842  */
1843  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1844 
1845  /*
1846  * Find buffer to insert this tuple into. If the page is all visible,
1847  * this will also pin the requisite visibility map page.
1848  */
1849  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1850  InvalidBuffer, options, bistate,
1851  &vmbuffer, NULL,
1852  0);
1853 
1854  /*
1855  * We're about to do the actual insert -- but check for conflict first, to
1856  * avoid possibly having to roll back work we've just done.
1857  *
1858  * This is safe without a recheck as long as there is no possibility of
1859  * another process scanning the page between this check and the insert
1860  * being visible to the scan (i.e., an exclusive buffer content lock is
1861  * continuously held from this point until the tuple insert is visible).
1862  *
1863  * For a heap insert, we only need to check for table-level SSI locks. Our
1864  * new tuple can't possibly conflict with existing tuple locks, and heap
1865  * page locks are only consolidated versions of tuple locks; they do not
1866  * lock "gaps" as index page locks do. So we don't need to specify a
1867  * buffer when making the call, which makes for a faster check.
1868  */
1870 
1871  /* NO EREPORT(ERROR) from here till changes are logged */
1873 
1874  RelationPutHeapTuple(relation, buffer, heaptup,
1875  (options & HEAP_INSERT_SPECULATIVE) != 0);
1876 
1877  if (PageIsAllVisible(BufferGetPage(buffer)))
1878  {
1879  all_visible_cleared = true;
1881  visibilitymap_clear(relation,
1882  ItemPointerGetBlockNumber(&(heaptup->t_self)),
1883  vmbuffer, VISIBILITYMAP_VALID_BITS);
1884  }
1885 
1886  /*
1887  * XXX Should we set PageSetPrunable on this page ?
1888  *
1889  * The inserting transaction may eventually abort thus making this tuple
1890  * DEAD and hence available for pruning. Though we don't want to optimize
1891  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1892  * aborted tuple will never be pruned until next vacuum is triggered.
1893  *
1894  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1895  */
1896 
1897  MarkBufferDirty(buffer);
1898 
1899  /* XLOG stuff */
1900  if (RelationNeedsWAL(relation))
1901  {
1902  xl_heap_insert xlrec;
1903  xl_heap_header xlhdr;
1904  XLogRecPtr recptr;
1905  Page page = BufferGetPage(buffer);
1906  uint8 info = XLOG_HEAP_INSERT;
1907  int bufflags = 0;
1908 
1909  /*
1910  * If this is a catalog, we need to transmit combo CIDs to properly
1911  * decode, so log that as well.
1912  */
1914  log_heap_new_cid(relation, heaptup);
1915 
1916  /*
1917  * If this is the single and first tuple on page, we can reinit the
1918  * page instead of restoring the whole thing. Set flag, and hide
1919  * buffer references from XLogInsert.
1920  */
1921  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1923  {
1924  info |= XLOG_HEAP_INIT_PAGE;
1925  bufflags |= REGBUF_WILL_INIT;
1926  }
1927 
1928  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1929  xlrec.flags = 0;
1930  if (all_visible_cleared)
1935 
1936  /*
1937  * For logical decoding, we need the tuple even if we're doing a full
1938  * page write, so make sure it's included even if we take a full-page
1939  * image. (XXX We could alternatively store a pointer into the FPW).
1940  */
1941  if (RelationIsLogicallyLogged(relation) &&
1943  {
1945  bufflags |= REGBUF_KEEP_DATA;
1946 
1947  if (IsToastRelation(relation))
1949  }
1950 
1951  XLogBeginInsert();
1952  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1953 
1954  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1955  xlhdr.t_infomask = heaptup->t_data->t_infomask;
1956  xlhdr.t_hoff = heaptup->t_data->t_hoff;
1957 
1958  /*
1959  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1960  * write the whole page to the xlog, we don't need to store
1961  * xl_heap_header in the xlog.
1962  */
1963  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
1964  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
1965  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
1967  (char *) heaptup->t_data + SizeofHeapTupleHeader,
1968  heaptup->t_len - SizeofHeapTupleHeader);
1969 
1970  /* filtering by origin on a row level is much more efficient */
1972 
1973  recptr = XLogInsert(RM_HEAP_ID, info);
1974 
1975  PageSetLSN(page, recptr);
1976  }
1977 
1978  END_CRIT_SECTION();
1979 
1980  UnlockReleaseBuffer(buffer);
1981  if (vmbuffer != InvalidBuffer)
1982  ReleaseBuffer(vmbuffer);
1983 
1984  /*
1985  * If tuple is cachable, mark it for invalidation from the caches in case
1986  * we abort. Note it is OK to do this after releasing the buffer, because
1987  * the heaptup data structure is all in local memory, not in the shared
1988  * buffer.
1989  */
1990  CacheInvalidateHeapTuple(relation, heaptup, NULL);
1991 
1992  /* Note: speculative insertions are counted too, even if aborted later */
1993  pgstat_count_heap_insert(relation, 1);
1994 
1995  /*
1996  * If heaptup is a private copy, release it. Don't forget to copy t_self
1997  * back to the caller's image, too.
1998  */
1999  if (heaptup != tup)
2000  {
2001  tup->t_self = heaptup->t_self;
2002  heap_freetuple(heaptup);
2003  }
2004 }
2005 
2006 /*
2007  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2008  * tuple header fields and toasts the tuple if necessary. Returns a toasted
2009  * version of the tuple if it was toasted, or the original tuple if not. Note
2010  * that in any case, the header fields are also set in the original tuple.
 *
 * The visible body reads relation, tup, xid, cid and options; the one caller
 * visible in this chunk, heap_multi_insert(), passes them in that order.
 * Raises ERROR when called from a parallel worker.
 *
 * NOTE(review): the declarator line between "static HeapTuple" and
 * "CommandId cid, int options)" is not present in this listing, and a few
 * statement lines further down are absent as well.
2011  */
2012 static HeapTuple
2014  CommandId cid, int options)
2015 {
2016  /*
2017  * To allow parallel inserts, we need to ensure that they are safe to be
2018  * performed in workers. We have the infrastructure to allow parallel
2019  * inserts in general except for the cases where inserts generate a new
2020  * CommandId (eg. inserts into a table having a foreign key column).
2021  */
2022  if (IsParallelWorker())
2023  ereport(ERROR,
2024  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2025  errmsg("cannot insert tuples in a parallel worker")));
2026 
 /*
  * Drop whatever bits covered by HEAP_XACT_MASK / HEAP2_XACT_MASK the
  * caller's header carried, then stamp xmin, cmin, xmax and the table OID.
  */
2027  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2028  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2030  HeapTupleHeaderSetXmin(tup->t_data, xid);
2033 
2034  HeapTupleHeaderSetCmin(tup->t_data, cid);
2035  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2036  tup->t_tableOid = RelationGetRelid(relation);
2037 
2038  /*
2039  * If the new tuple is too big for storage or contains already toasted
2040  * out-of-line attributes from some other relation, invoke the toaster.
2041  */
 /* Any relkind other than plain table or matview gets its tuple back as-is. */
2042  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2043  relation->rd_rel->relkind != RELKIND_MATVIEW)
2044  {
2045  /* toast table entries should never be recursively toasted */
2047  return tup;
2048  }
 /* External attributes or a length above the threshold both trigger toasting. */
2049  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2050  return heap_toast_insert_or_update(relation, tup, NULL, options);
2051  else
2052  return tup;
2053 }
2054 
2055 /*
2056  * Helper for heap_multi_insert() that computes the number of entire pages
2057  * that inserting the remaining heaptuples requires. Used to determine how
2058  * much the relation needs to be extended by.
2059  */
2060 static int
2061 heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
2062 {
2063  size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2064  int npages = 1;
2065 
2066  for (int i = done; i < ntuples; i++)
2067  {
2068  size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2069 
2070  if (page_avail < tup_sz)
2071  {
2072  npages++;
2073  page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2074  }
2075  page_avail -= tup_sz;
2076  }
2077 
2078  return npages;
2079 }
2080 
2081 /*
2082  * heap_multi_insert - insert multiple tuples into a heap
2083  *
2084  * This is like heap_insert(), but inserts multiple tuples in one operation.
2085  * That's faster than calling heap_insert() in a loop, because when multiple
2086  * tuples can be inserted on a single page, we can write just a single WAL
2087  * record covering all of them, and only need to lock/unlock the page once.
2088  *
2089  * Note: this leaks memory into the current memory context. You can create a
2090  * temporary context before calling this, if that's a problem.
 *
 * relation: target heap; its OID is stamped into every slot and tuple.
 * slots, ntuples: the rows to insert; on return each slots[i]->tts_tid holds
 * the TID assigned to that row.
 * cid, options: forwarded to heap_prepare_insert(); options is also tested
 * for HEAP_INSERT_FROZEN and handed to RelationGetBufferForTuple().
 * bistate: handed to RelationGetBufferForTuple() unchanged.
 *
 * NOTE(review): "xid" and "info" are used in the body, but the lines
 * declaring them are not present in this listing; a few other statement
 * lines are absent as well.
2091  */
2092 void
2093 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2094  CommandId cid, int options, BulkInsertState bistate)
2095 {
2097  HeapTuple *heaptuples;
2098  int i;
2099  int ndone;
2100  PGAlignedBlock scratch;
2101  Page page;
2102  Buffer vmbuffer = InvalidBuffer;
2103  bool needwal;
2104  Size saveFreeSpace;
2105  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2106  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2107  bool starting_with_empty_page = false;
2108  int npages = 0;
2109  int npages_used = 0;
2110 
2111  /* currently not needed (thus unsupported) for heap_multi_insert() */
2113 
2114  needwal = RelationNeedsWAL(relation);
2115  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2117 
2118  /* Toast and set header data in all the slots */
 /* No pfree of this array is visible below; see the leak note in the header. */
2119  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2120  for (i = 0; i < ntuples; i++)
2121  {
2122  HeapTuple tuple;
2123 
2124  tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2125  slots[i]->tts_tableOid = RelationGetRelid(relation);
2126  tuple->t_tableOid = slots[i]->tts_tableOid;
2127  heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2128  options);
2129  }
2130 
2131  /*
2132  * We're about to do the actual inserts -- but check for conflict first,
2133  * to minimize the possibility of having to roll back work we've just
2134  * done.
2135  *
2136  * A check here does not definitively prevent a serialization anomaly;
2137  * that check MUST be done at least past the point of acquiring an
2138  * exclusive buffer content lock on every buffer that will be affected,
2139  * and MAY be done after all inserts are reflected in the buffers and
2140  * those locks are released; otherwise there is a race condition. Since
2141  * multiple buffers can be locked and unlocked in the loop below, and it
2142  * would not be feasible to identify and lock all of those buffers before
2143  * the loop, we must do a final check at the end.
2144  *
2145  * The check here could be omitted with no loss of correctness; it is
2146  * present strictly as an optimization.
2147  *
2148  * For heap inserts, we only need to check for table-level SSI locks. Our
2149  * new tuples can't possibly conflict with existing tuple locks, and heap
2150  * page locks are only consolidated versions of tuple locks; they do not
2151  * lock "gaps" as index page locks do. So we don't need to specify a
2152  * buffer when making the call, which makes for a faster check.
2153  */
2155 
 /* Each iteration of this loop fills one heap page with as many tuples as fit. */
2156  ndone = 0;
2157  while (ndone < ntuples)
2158  {
2159  Buffer buffer;
2160  bool all_visible_cleared = false;
2161  bool all_frozen_set = false;
2162  int nthispage;
2163 
2165 
2166  /*
2167  * Compute number of pages needed to fit the to-be-inserted tuples in
2168  * the worst case. This will be used to determine how much to extend
2169  * the relation by in RelationGetBufferForTuple(), if needed. If we
2170  * filled a prior page from scratch, we can just update our last
2171  * computation, but if we started with a partially filled page,
2172  * recompute from scratch, the number of potentially required pages
2173  * can vary due to tuples needing to fit onto the page, page headers
2174  * etc.
2175  */
2176  if (ndone == 0 || !starting_with_empty_page)
2177  {
2178  npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2179  saveFreeSpace);
2180  npages_used = 0;
2181  }
2182  else
2183  npages_used++;
2184 
2185  /*
2186  * Find buffer where at least the next tuple will fit. If the page is
2187  * all-visible, this will also pin the requisite visibility map page.
2188  *
2189  * Also pin visibility map page if COPY FREEZE inserts tuples into an
2190  * empty page. See all_frozen_set below.
2191  */
2192  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2193  InvalidBuffer, options, bistate,
2194  &vmbuffer, NULL,
2195  npages - npages_used);
2196  page = BufferGetPage(buffer);
2197 
 /* A page with no line pointers yet counts as empty. */
2198  starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
2199 
2200  if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
2201  all_frozen_set = true;
2202 
2203  /* NO EREPORT(ERROR) from here till changes are logged */
2205 
2206  /*
2207  * RelationGetBufferForTuple has ensured that the first tuple fits.
2208  * Put that on the page, and then as many other tuples as fit.
2209  */
2210  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2211 
2212  /*
2213  * For logical decoding we need combo CIDs to properly decode the
2214  * catalog.
2215  */
2216  if (needwal && need_cids)
2217  log_heap_new_cid(relation, heaptuples[ndone]);
2218 
 /* nthispage starts at 1 to account for the tuple placed just above. */
2219  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2220  {
2221  HeapTuple heaptup = heaptuples[ndone + nthispage];
2222 
 /* Stop once the next tuple would eat into the reserved saveFreeSpace. */
2223  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2224  break;
2225 
2226  RelationPutHeapTuple(relation, buffer, heaptup, false);
2227 
2228  /*
2229  * For logical decoding we need combo CIDs to properly decode the
2230  * catalog.
2231  */
2232  if (needwal && need_cids)
2233  log_heap_new_cid(relation, heaptup);
2234  }
2235 
2236  /*
2237  * If the page is all visible, need to clear that, unless we're only
2238  * going to add further frozen rows to it.
2239  *
2240  * If we're only adding already frozen rows to a previously empty
2241  * page, mark it as all-visible.
2242  */
2243  if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
2244  {
2245  all_visible_cleared = true;
2246  PageClearAllVisible(page);
2247  visibilitymap_clear(relation,
2248  BufferGetBlockNumber(buffer),
2249  vmbuffer, VISIBILITYMAP_VALID_BITS);
2250  }
2251  else if (all_frozen_set)
2252  PageSetAllVisible(page);
2253 
2254  /*
2255  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2256  */
2257 
2258  MarkBufferDirty(buffer);
2259 
2260  /* XLOG stuff */
2261  if (needwal)
2262  {
2263  XLogRecPtr recptr;
2264  xl_heap_multi_insert *xlrec;
2266  char *tupledata;
2267  int totaldatalen;
2268  char *scratchptr = scratch.data;
2269  bool init;
2270  int bufflags = 0;
2271 
2272  /*
2273  * If the page was previously empty, we can reinit the page
2274  * instead of restoring the whole thing.
2275  */
2276  init = starting_with_empty_page;
2277 
2278  /* allocate xl_heap_multi_insert struct from the scratch area */
2279  xlrec = (xl_heap_multi_insert *) scratchptr;
2280  scratchptr += SizeOfHeapMultiInsert;
2281 
2282  /*
2283  * Allocate offsets array. Unless we're reinitializing the page,
2284  * in that case the tuples are stored in order starting at
2285  * FirstOffsetNumber and we don't need to store the offsets
2286  * explicitly.
2287  */
2288  if (!init)
2289  scratchptr += nthispage * sizeof(OffsetNumber);
2290 
2291  /* the rest of the scratch space is used for tuple data */
2292  tupledata = scratchptr;
2293 
2294  /* check that the mutually exclusive flags are not both set */
2295  Assert(!(all_visible_cleared && all_frozen_set));
2296 
2297  xlrec->flags = 0;
2298  if (all_visible_cleared)
2300  if (all_frozen_set)
2302 
2303  xlrec->ntuples = nthispage;
2304 
2305  /*
2306  * Write out an xl_multi_insert_tuple and the tuple data itself
2307  * for each tuple.
2308  */
2309  for (i = 0; i < nthispage; i++)
2310  {
2311  HeapTuple heaptup = heaptuples[ndone + i];
2312  xl_multi_insert_tuple *tuphdr;
2313  int datalen;
2314 
2315  if (!init)
2316  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2317  /* xl_multi_insert_tuple needs two-byte alignment. */
2318  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2319  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2320 
2321  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2322  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2323  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2324 
2325  /* write bitmap [+ padding] [+ oid] + data */
2326  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2327  memcpy(scratchptr,
2328  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2329  datalen);
2330  tuphdr->datalen = datalen;
2331  scratchptr += datalen;
2332  }
 /* totaldatalen spans the per-tuple headers and payloads written by the loop above. */
2333  totaldatalen = scratchptr - tupledata;
2334  Assert((scratchptr - scratch.data) < BLCKSZ);
2335 
2336  if (need_tuple_data)
2338 
2339  /*
2340  * Signal that this is the last xl_heap_multi_insert record
2341  * emitted by this call to heap_multi_insert(). Needed for logical
2342  * decoding so it knows when to cleanup temporary data.
2343  */
2344  if (ndone + nthispage == ntuples)
2345  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2346 
2347  if (init)
2348  {
2349  info |= XLOG_HEAP_INIT_PAGE;
2350  bufflags |= REGBUF_WILL_INIT;
2351  }
2352 
2353  /*
2354  * If we're doing logical decoding, include the new tuple data
2355  * even if we take a full-page image of the page.
2356  */
2357  if (need_tuple_data)
2358  bufflags |= REGBUF_KEEP_DATA;
2359 
2360  XLogBeginInsert();
 /*
  * Main data is everything in scratch that precedes tupledata: the fixed
  * header plus, when !init, the offsets array. The tuple images are
  * registered separately as data of buffer 0.
  */
2361  XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2362  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2363 
2364  XLogRegisterBufData(0, tupledata, totaldatalen);
2365 
2366  /* filtering by origin on a row level is much more efficient */
2368 
2369  recptr = XLogInsert(RM_HEAP2_ID, info);
2370 
2371  PageSetLSN(page, recptr);
2372  }
2373 
2374  END_CRIT_SECTION();
2375 
2376  /*
2377  * If we've frozen everything on the page, update the visibilitymap.
2378  * We're already holding pin on the vmbuffer.
2379  */
2380  if (all_frozen_set)
2381  {
2382  Assert(PageIsAllVisible(page));
2383  Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
2384 
2385  /*
2386  * It's fine to use InvalidTransactionId here - this is only used
2387  * when HEAP_INSERT_FROZEN is specified, which intentionally
2388  * violates visibility rules.
2389  */
2390  visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
2391  InvalidXLogRecPtr, vmbuffer,
2394  }
2395 
2396  UnlockReleaseBuffer(buffer);
 /* Advance past the tuples that went onto this page. */
2397  ndone += nthispage;
2398 
2399  /*
2400  * NB: Only release vmbuffer after inserting all tuples - it's fairly
2401  * likely that we'll insert into subsequent heap pages that are likely
2402  * to use the same vm page.
2403  */
2404  }
2405 
2406  /* We're done with inserting all tuples, so release the last vmbuffer. */
2407  if (vmbuffer != InvalidBuffer)
2408  ReleaseBuffer(vmbuffer);
2409 
2410  /*
2411  * We're done with the actual inserts. Check for conflicts again, to
2412  * ensure that all rw-conflicts in to these inserts are detected. Without
2413  * this final check, a sequential scan of the heap may have locked the
2414  * table after the "before" check, missing one opportunity to detect the
2415  * conflict, and then scanned the table before the new tuples were there,
2416  * missing the other chance to detect the conflict.
2417  *
2418  * For heap inserts, we only need to check for table-level SSI locks. Our
2419  * new tuples can't possibly conflict with existing tuple locks, and heap
2420  * page locks are only consolidated versions of tuple locks; they do not
2421  * lock "gaps" as index page locks do. So we don't need to specify a
2422  * buffer when making the call.
2423  */
2425 
2426  /*
2427  * If tuples are cachable, mark them for invalidation from the caches in
2428  * case we abort. Note it is OK to do this after releasing the buffer,
2429  * because the heaptuples data structure is all in local memory, not in
2430  * the shared buffer.
2431  */
2432  if (IsCatalogRelation(relation))
2433  {
2434  for (i = 0; i < ntuples; i++)
2435  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2436  }
2437 
2438  /* copy t_self fields back to the caller's slots */
2439  for (i = 0; i < ntuples; i++)
2440  slots[i]->tts_tid = heaptuples[i]->t_self;
2441 
2442  pgstat_count_heap_insert(relation, ntuples);
2443 }
2444 
2445 /*
2446  * simple_heap_insert - insert a tuple
2447  *
2448  * Currently, this routine differs from heap_insert only in supplying
2449  * a default command ID and not allowing access to the speedup options.
2450  *
2451  * This should be used rather than using heap_insert directly in most places
2452  * where we are modifying system catalogs.
 *
 * The body uses two names, relation and tup, and returns nothing.
 *
 * NOTE(review): the declarator line naming this function and its parameters
 * is not present in this listing.
2453  */
2454 void
2456 {
 /*
  * The command ID comes from GetCurrentCommandId(true). The trailing 0 and
  * NULL are presumably the options and bulk-insert state arguments of
  * heap_insert() -- confirm against its prototype.
  */
2457  heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2458 }
2459 
2460 /*
2461  * Given infomask/infomask2, compute the bits that must be saved in the
2462  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2463  * xl_heap_lock_updated WAL records.
2464  *
2465  * See fix_infomask_from_infobits.
2466  */
2467 static uint8
2468 compute_infobits(uint16 infomask, uint16 infomask2)
2469 {
2470  return
2471  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2472  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2473  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2474  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2475  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2476  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2477  XLHL_KEYS_UPDATED : 0);
2478 }
2479 
2480 /*
2481  * Given two versions of the same t_infomask for a tuple, compare them and
2482  * return whether the relevant status for a tuple Xmax has changed. This is
2483  * used after a buffer lock has been released and reacquired: we want to ensure
2484  * that the tuple state continues to be the same it was when we previously
2485  * examined it.
2486  *
2487  * Note the Xmax field itself must be compared separately.
 *
 * Returns true when the two masks differ in any bit selected by
 * "interesting", false otherwise; bits outside that mask are not compared.
 *
 * NOTE(review): the line holding the initializer of "interesting" is not
 * present in this listing, so the exact set of compared bits cannot be read
 * off here.
2488  */
2489 static inline bool
2490 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2491 {
2492  const uint16 interesting =
2494 
 /* Mask both values down to the interesting bits before comparing. */
2495  if ((new_infomask & interesting) != (old_infomask & interesting))
2496  return true;
2497 
2498  return false;
2499 }
2500 
2501 /*
2502  * heap_delete - delete a tuple
2503  *
2504  * See table_tuple_delete() for an explanation of the parameters, except that
2505  * this routine directly takes a tuple rather than a slot.
2506  *
2507  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2508  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2509  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2510  * generated by another transaction).
 *
 * Returns TM_Ok once the delete has been applied. Any other result is
 * returned early, after *tmfd has been filled and the buffer, the tuple
 * lock (if held) and the visibility map pin (if any) have been released.
 * Raises ERROR during a parallel operation or when the tuple is invisible.
 *
 * NOTE(review): a number of lines of this function are not present in this
 * listing, among them the declarator line naming it and the declaration of
 * "xid"; the comments added here describe only what is visible.
2511  */
2512 TM_Result
2514  CommandId cid, Snapshot crosscheck, bool wait,
2515  TM_FailureData *tmfd, bool changingPart)
2516 {
2517  TM_Result result;
2519  ItemId lp;
2520  HeapTupleData tp;
2521  Page page;
2522  BlockNumber block;
2523  Buffer buffer;
2524  Buffer vmbuffer = InvalidBuffer;
2525  TransactionId new_xmax;
2526  uint16 new_infomask,
2527  new_infomask2;
2528  bool have_tuple_lock = false;
2529  bool iscombo;
2530  bool all_visible_cleared = false;
2531  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2532  bool old_key_copied = false;
2533 
2534  Assert(ItemPointerIsValid(tid));
2535 
2536  /*
2537  * Forbid this during a parallel operation, lest it allocate a combo CID.
2538  * Other workers might need that combo CID for visibility checks, and we
2539  * have no provision for broadcasting it to them.
2540  */
2541  if (IsInParallelMode())
2542  ereport(ERROR,
2543  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2544  errmsg("cannot delete tuples during a parallel operation")));
2545 
2546  block = ItemPointerGetBlockNumber(tid);
2547  buffer = ReadBuffer(relation, block);
2548  page = BufferGetPage(buffer);
2549 
2550  /*
2551  * Before locking the buffer, pin the visibility map page if it appears to
2552  * be necessary. Since we haven't got the lock yet, someone else might be
2553  * in the middle of changing this, so we'll need to recheck after we have
2554  * the lock.
2555  */
2556  if (PageIsAllVisible(page))
2557  visibilitymap_pin(relation, block, &vmbuffer);
2558 
2560 
2561  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2562  Assert(ItemIdIsNormal(lp));
2563 
 /* Describe the on-page tuple through a local HeapTupleData; t_data points into the page. */
2564  tp.t_tableOid = RelationGetRelid(relation);
2565  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2566  tp.t_len = ItemIdGetLength(lp);
2567  tp.t_self = *tid;
2568 
 /* l1 is the restart point used when the tuple state changed while we waited. */
2569 l1:
2570 
2571  /*
2572  * If we didn't pin the visibility map page and the page has become all
2573  * visible while we were busy locking the buffer, we'll have to unlock and
2574  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2575  * unfortunate, but hopefully shouldn't happen often.
2576  */
2577  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2578  {
2579  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2580  visibilitymap_pin(relation, block, &vmbuffer);
2582  }
2583 
2584  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2585 
2586  if (result == TM_Invisible)
2587  {
2588  UnlockReleaseBuffer(buffer);
2589  ereport(ERROR,
2590  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2591  errmsg("attempted to delete invisible tuple")));
2592  }
2593  else if (result == TM_BeingModified && wait)
2594  {
2595  TransactionId xwait;
2596  uint16 infomask;
2597 
2598  /* must copy state data before unlocking buffer */
2599  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2600  infomask = tp.t_data->t_infomask;
2601 
2602  /*
2603  * Sleep until concurrent transaction ends -- except when there's a
2604  * single locker and it's our own transaction. Note we don't care
2605  * which lock mode the locker has, because we need the strongest one.
2606  *
2607  * Before sleeping, we need to acquire tuple lock to establish our
2608  * priority for the tuple (see heap_lock_tuple). LockTuple will
2609  * release us when we are next-in-line for the tuple.
2610  *
2611  * If we are forced to "start over" below, we keep the tuple lock;
2612  * this arranges that we stay at the head of the line while rechecking
2613  * tuple state.
2614  */
2615  if (infomask & HEAP_XMAX_IS_MULTI)
2616  {
2617  bool current_is_member = false;
2618 
2619  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2620  LockTupleExclusive, &current_is_member))
2621  {
2622  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2623 
2624  /*
2625  * Acquire the lock, if necessary (but skip it when we're
2626  * requesting a lock and already have one; avoids deadlock).
2627  */
2628  if (!current_is_member)
2630  LockWaitBlock, &have_tuple_lock);
2631 
2632  /* wait for multixact */
2634  relation, &(tp.t_self), XLTW_Delete,
2635  NULL);
2637 
2638  /*
2639  * If xwait had just locked the tuple then some other xact
2640  * could update this tuple before we get to this point. Check
2641  * for xmax change, and start over if so.
2642  *
2643  * We also must start over if we didn't pin the VM page, and
2644  * the page has become all visible.
2645  */
2646  if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2647  xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2649  xwait))
2650  goto l1;
2651  }
2652 
2653  /*
2654  * You might think the multixact is necessarily done here, but not
2655  * so: it could have surviving members, namely our own xact or
2656  * other subxacts of this backend. It is legal for us to delete
2657  * the tuple in either case, however (the latter case is
2658  * essentially a situation of upgrading our former shared lock to
2659  * exclusive). We don't bother changing the on-disk hint bits
2660  * since we are about to overwrite the xmax altogether.
2661  */
2662  }
2663  else if (!TransactionIdIsCurrentTransactionId(xwait))
2664  {
2665  /*
2666  * Wait for regular transaction to end; but first, acquire tuple
2667  * lock.
2668  */
2669  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2671  LockWaitBlock, &have_tuple_lock);
2672  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2674 
2675  /*
2676  * xwait is done, but if xwait had just locked the tuple then some
2677  * other xact could update this tuple before we get to this point.
2678  * Check for xmax change, and start over if so.
2679  *
2680  * We also must start over if we didn't pin the VM page, and the
2681  * page has become all visible.
2682  */
2683  if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2684  xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2686  xwait))
2687  goto l1;
2688 
2689  /* Otherwise check if it committed or aborted */
2690  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2691  }
2692 
2693  /*
2694  * We may overwrite if previous xmax aborted, or if it committed but
2695  * only locked the tuple without updating it.
2696  */
2697  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2700  result = TM_Ok;
 /* A t_ctid that points elsewhere means the row was updated rather than deleted. */
2701  else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2702  result = TM_Updated;
2703  else
2704  result = TM_Deleted;
2705  }
2706 
2707  /* sanity check the result of HeapTupleSatisfiesUpdate() and the logic above */
2708  if (result != TM_Ok)
2709  {
2710  Assert(result == TM_SelfModified ||
2711  result == TM_Updated ||
2712  result == TM_Deleted ||
2713  result == TM_BeingModified);
2715  Assert(result != TM_Updated ||
2716  !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2717  }
2718 
2719  if (crosscheck != InvalidSnapshot && result == TM_Ok)
2720  {
2721  /* Perform additional check for transaction-snapshot mode RI updates */
2722  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2723  result = TM_Updated;
2724  }
2725 
 /* Failure exit: report details through *tmfd, then release everything we hold. */
2726  if (result != TM_Ok)
2727  {
2728  tmfd->ctid = tp.t_data->t_ctid;
2730  if (result == TM_SelfModified)
2731  tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2732  else
2733  tmfd->cmax = InvalidCommandId;
2734  UnlockReleaseBuffer(buffer);
2735  if (have_tuple_lock)
2736  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2737  if (vmbuffer != InvalidBuffer)
2738  ReleaseBuffer(vmbuffer);
2739  return result;
2740  }
2741 
2742  /*
2743  * We're about to do the actual delete -- check for conflict first, to
2744  * avoid possibly having to roll back work we've just done.
2745  *
2746  * This is safe without a recheck as long as there is no possibility of
2747  * another process scanning the page between this check and the delete
2748  * being visible to the scan (i.e., an exclusive buffer content lock is
2749  * continuously held from this point until the tuple delete is visible).
2750  */
2751  CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2752 
2753  /* replace cid with a combo CID if necessary */
2754  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2755 
2756  /*
2757  * Compute replica identity tuple before entering the critical section so
2758  * we don't PANIC upon a memory allocation failure.
2759  */
2760  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2761 
2762  /*
2763  * If this is the first possibly-multixact-able operation in the current
2764  * transaction, set my per-backend OldestMemberMXactId setting. We can be
2765  * certain that the transaction will never become a member of any older
2766  * MultiXactIds than that. (We have to do this even if we end up just
2767  * using our own TransactionId below, since some other backend could
2768  * incorporate our XID into a MultiXact immediately afterwards.)
2769  */
2771 
2774  xid, LockTupleExclusive, true,
2775  &new_xmax, &new_infomask, &new_infomask2);
2776 
2778 
2779  /*
2780  * If this transaction commits, the tuple will become DEAD sooner or
2781  * later. Set flag that this page is a candidate for pruning once our xid
2782  * falls below the OldestXmin horizon. If the transaction finally aborts,
2783  * the subsequent page pruning will be a no-op and the hint will be
2784  * cleared.
2785  */
2786  PageSetPrunable(page, xid);
2787 
2788  if (PageIsAllVisible(page))
2789  {
2790  all_visible_cleared = true;
2791  PageClearAllVisible(page);
2792  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2793  vmbuffer, VISIBILITYMAP_VALID_BITS);
2794  }
2795 
2796  /* store transaction information of xact deleting the tuple */
2799  tp.t_data->t_infomask |= new_infomask;
2800  tp.t_data->t_infomask2 |= new_infomask2;
2802  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2803  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2804  /* Make sure there is no forward chain link in t_ctid */
2805  tp.t_data->t_ctid = tp.t_self;
2806 
2807  /* Signal that this is actually a move into another partition */
2808  if (changingPart)
2810 
2811  MarkBufferDirty(buffer);
2812 
2813  /*
2814  * XLOG stuff
2815  *
2816  * NB: heap_abort_speculative() uses the same xlog record and replay
2817  * routines.
2818  */
2819  if (RelationNeedsWAL(relation))
2820  {
2821  xl_heap_delete xlrec;
2822  xl_heap_header xlhdr;
2823  XLogRecPtr recptr;
2824 
2825  /*
2826  * For logical decode we need combo CIDs to properly decode the
2827  * catalog
2828  */
2830  log_heap_new_cid(relation, &tp);
2831 
2832  xlrec.flags = 0;
2833  if (all_visible_cleared)
2835  if (changingPart)
2838  tp.t_data->t_infomask2);
2840  xlrec.xmax = new_xmax;
2841 
2842  if (old_key_tuple != NULL)
2843  {
2844  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2846  else
2848  }
2849 
2850  XLogBeginInsert();
2851  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2852 
2853  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2854 
2855  /*
2856  * Log replica identity of the deleted tuple if there is one
2857  */
2858  if (old_key_tuple != NULL)
2859  {
2860  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2861  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2862  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2863 
2864  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2865  XLogRegisterData((char *) old_key_tuple->t_data
2867  old_key_tuple->t_len
2869  }
2870 
2871  /* filtering by origin on a row level is much more efficient */
2873 
2874  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2875 
2876  PageSetLSN(page, recptr);
2877  }
2878 
2879  END_CRIT_SECTION();
2880 
 /* Only the content lock is dropped here; ReleaseBuffer() further down lets go of the buffer itself. */
2881  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2882 
2883  if (vmbuffer != InvalidBuffer)
2884  ReleaseBuffer(vmbuffer);
2885 
2886  /*
2887  * If the tuple has toasted out-of-line attributes, we need to delete
2888  * those items too. We have to do this before releasing the buffer
2889  * because we need to look at the contents of the tuple, but it's OK to
2890  * release the content lock on the buffer first.
2891  */
2892  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2893  relation->rd_rel->relkind != RELKIND_MATVIEW)
2894  {
2895  /* toast table entries should never be recursively toasted */
2897  }
2898  else if (HeapTupleHasExternal(&tp))
2899  heap_toast_delete(relation, &tp, false);
2900 
2901  /*
2902  * Mark tuple for invalidation from system caches at next command
2903  * boundary. We have to do this before releasing the buffer because we
2904  * need to look at the contents of the tuple.
2905  */
2906  CacheInvalidateHeapTuple(relation, &tp, NULL);
2907 
2908  /* Now we can release the buffer */
2909  ReleaseBuffer(buffer);
2910 
2911  /*
2912  * Release the lmgr tuple lock, if we had it.
2913  */
2914  if (have_tuple_lock)
2915  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2916 
2917  pgstat_count_heap_delete(relation);
2918 
 /* Free the replica identity tuple only when ExtractReplicaIdentity() flagged it as a copy. */
2919  if (old_key_tuple != NULL && old_key_copied)
2920  heap_freetuple(old_key_tuple);
2921 
2922  return TM_Ok;
2923 }
2924 
2925 /*
2926  * simple_heap_delete - delete a tuple
2927  *
2928  * This routine may be used to delete a tuple when concurrent updates of
2929  * the target tuple are not expected (for example, because we have a lock
2930  * on the relation associated with the tuple). Any failure is reported
2931  * via ereport().
 *
 * Calls heap_delete() with wait = true and changingPart = false, and turns
 * every result other than TM_Ok into elog(ERROR).
 *
 * NOTE(review): the declarator line naming this function and its
 * parameters, and one line of the heap_delete() argument list, are not
 * present in this listing.
2932  */
2933 void
2935 {
2936  TM_Result result;
 /* tmfd is filled by heap_delete() on failure; it is not examined here. */
2937  TM_FailureData tmfd;
2938 
2939  result = heap_delete(relation, tid,
2941  true /* wait for commit */ ,
2942  &tmfd, false /* changingPart */ );
2943  switch (result)
2944  {
2945  case TM_SelfModified:
2946  /* Tuple was already updated in current command? */
2947  elog(ERROR, "tuple already updated by self");
2948  break;
2949 
2950  case TM_Ok:
2951  /* done successfully */
2952  break;
2953 
2954  case TM_Updated:
2955  elog(ERROR, "tuple concurrently updated");
2956  break;
2957 
2958  case TM_Deleted:
2959  elog(ERROR, "tuple concurrently deleted");
2960  break;
2961 
 /* Any other TM_Result is unexpected here. */
2962  default:
2963  elog(ERROR, "unrecognized heap_delete status: %u", result);
2964  break;
2965  }
2966 }
2967 
2968 /*
2969  * heap_update - replace a tuple
2970  *
2971  * See table_tuple_update() for an explanation of the parameters, except that
2972  * this routine directly takes a tuple rather than a slot.
2973  *
2974  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2975  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2976  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2977  * generated by another transaction).
 *
 * On success the routine returns TM_Ok.  *lockmode is set to the tuple lock
 * strength chosen for the old tuple (LockTupleNoKeyExclusive when none of the
 * "key" columns is modified, LockTupleExclusive otherwise), and
 * *update_indexes tells the caller which indexes need attention: TU_None or
 * TU_Summarizing when a HOT update was done, TU_All otherwise.
 *
 * On any other result the old tuple's buffer is unlocked and released, a
 * tuple lock taken along the way is dropped, *tmfd is filled as described
 * above, and *update_indexes is set to TU_None.
 *
 * An ERROR is raised when called during a parallel operation, or when the
 * old tuple is reported invisible by HeapTupleSatisfiesUpdate().
 *
 * NOTE(review): the line carrying the function name and its leading
 * parameters is not visible in this listing; relation, otid and newtup are
 * used throughout the body and are presumably declared on that line.
2978  */
2979 TM_Result
2981  CommandId cid, Snapshot crosscheck, bool wait,
2982  TM_FailureData *tmfd, LockTupleMode *lockmode,
2983  TU_UpdateIndexes *update_indexes)
2984 {
2985  TM_Result result;
 /* NOTE(review): xid is used below, but its declaration is not visible in this listing. */
2987  Bitmapset *hot_attrs;
2988  Bitmapset *sum_attrs;
2989  Bitmapset *key_attrs;
2990  Bitmapset *id_attrs;
2991  Bitmapset *interesting_attrs;
2992  Bitmapset *modified_attrs;
2993  ItemId lp;
2994  HeapTupleData oldtup;
2995  HeapTuple heaptup;
2996  HeapTuple old_key_tuple = NULL;
2997  bool old_key_copied = false;
2998  Page page;
2999  BlockNumber block;
3000  MultiXactStatus mxact_status;
3001  Buffer buffer,
3002  newbuf,
3003  vmbuffer = InvalidBuffer,
3004  vmbuffer_new = InvalidBuffer;
3005  bool need_toast;
3006  Size newtupsize,
3007  pagefree;
3008  bool have_tuple_lock = false;
3009  bool iscombo;
3010  bool use_hot_update = false;
3011  bool summarized_update = false;
3012  bool key_intact;
3013  bool all_visible_cleared = false;
3014  bool all_visible_cleared_new = false;
3015  bool checked_lockers;
3016  bool locker_remains;
3017  bool id_has_external = false;
3018  TransactionId xmax_new_tuple,
3019  xmax_old_tuple;
3020  uint16 infomask_old_tuple,
3021  infomask2_old_tuple,
3022  infomask_new_tuple,
3023  infomask2_new_tuple;
3024 
3025  Assert(ItemPointerIsValid(otid));
3026 
3027  /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3029  RelationGetNumberOfAttributes(relation));
3030 
3031  /*
3032  * Forbid this during a parallel operation, lest it allocate a combo CID.
3033  * Other workers might need that combo CID for visibility checks, and we
3034  * have no provision for broadcasting it to them.
3035  */
3036  if (IsInParallelMode())
3037  ereport(ERROR,
3038  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3039  errmsg("cannot update tuples during a parallel operation")));
3040 
3041  /*
3042  * Fetch the list of attributes to be checked for various operations.
3043  *
3044  * For HOT considerations, this is wasted effort if we fail to update or
3045  * have to put the new tuple on a different page. But we must compute the
3046  * list before obtaining buffer lock --- in the worst case, if we are
3047  * doing an update on one of the relevant system catalogs, we could
3048  * deadlock if we try to fetch the list later. In any case, the relcache
3049  * caches the data so this is usually pretty cheap.
3050  *
3051  * We also need columns used by the replica identity and columns that are
3052  * considered the "key" of rows in the table.
3053  *
3054  * Note that we get copies of each bitmap, so we need not worry about
3055  * relcache flush happening midway through.
3056  */
3057  hot_attrs = RelationGetIndexAttrBitmap(relation,
3059  sum_attrs = RelationGetIndexAttrBitmap(relation,
3061  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3062  id_attrs = RelationGetIndexAttrBitmap(relation,
 /* interesting_attrs is the union of the four bitmaps fetched above */
3064  interesting_attrs = NULL;
3065  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3066  interesting_attrs = bms_add_members(interesting_attrs, sum_attrs);
3067  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3068  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3069 
3070  block = ItemPointerGetBlockNumber(otid);
3071  buffer = ReadBuffer(relation, block);
3072  page = BufferGetPage(buffer);
3073 
3074  /*
3075  * Before locking the buffer, pin the visibility map page if it appears to
3076  * be necessary. Since we haven't got the lock yet, someone else might be
3077  * in the middle of changing this, so we'll need to recheck after we have
3078  * the lock.
3079  */
3080  if (PageIsAllVisible(page))
3081  visibilitymap_pin(relation, block, &vmbuffer);
3082 
3084 
3085  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3086  Assert(ItemIdIsNormal(lp));
3087 
3088  /*
3089  * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3090  * properly.
3091  */
3092  oldtup.t_tableOid = RelationGetRelid(relation);
3093  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3094  oldtup.t_len = ItemIdGetLength(lp);
3095  oldtup.t_self = *otid;
3096 
3097  /* the new tuple is ready, except for this: */
3098  newtup->t_tableOid = RelationGetRelid(relation);
3099 
3100  /*
3101  * Determine columns modified by the update. Additionally, identify
3102  * whether any of the unmodified replica identity key attributes in the
3103  * old tuple is externally stored or not. This is required because for
3104  * such attributes the flattened value won't be WAL logged as part of the
3105  * new tuple so we must include it as part of the old_key_tuple. See
3106  * ExtractReplicaIdentity.
3107  */
3108  modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs,
3109  id_attrs, &oldtup,
3110  newtup, &id_has_external);
3111 
3112  /*
3113  * If we're not updating any "key" column, we can grab a weaker lock type.
3114  * This allows for more concurrency when we are running simultaneously
3115  * with foreign key checks.
3116  *
3117  * Note that if a column gets detoasted while executing the update, but
3118  * the value ends up being the same, this test will fail and we will use
3119  * the stronger lock. This is acceptable; the important case to optimize
3120  * is updates that don't manipulate key columns, not those that
3121  * serendipitously arrive at the same key values.
3122  */
3123  if (!bms_overlap(modified_attrs, key_attrs))
3124  {
3125  *lockmode = LockTupleNoKeyExclusive;
3126  mxact_status = MultiXactStatusNoKeyUpdate;
3127  key_intact = true;
3128 
3129  /*
3130  * If this is the first possibly-multixact-able operation in the
3131  * current transaction, set my per-backend OldestMemberMXactId
3132  * setting. We can be certain that the transaction will never become a
3133  * member of any older MultiXactIds than that. (We have to do this
3134  * even if we end up just using our own TransactionId below, since
3135  * some other backend could incorporate our XID into a MultiXact
3136  * immediately afterwards.)
3137  */
3139  }
3140  else
3141  {
3142  *lockmode = LockTupleExclusive;
3143  mxact_status = MultiXactStatusUpdate;
3144  key_intact = false;
3145  }
3146 
3147  /*
3148  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3149  * otid may very well point at newtup->t_self, which we will overwrite
3150  * with the new tuple's location, so there's great risk of confusion if we
3151  * use otid anymore.
3152  */
3153 
 /* Retry point: every "goto l2" below restarts the check of the old tuple's state. */
3154 l2:
3155  checked_lockers = false;
3156  locker_remains = false;
3157  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3158 
3159  /* see below about the "no wait" case */
3160  Assert(result != TM_BeingModified || wait);
3161 
3162  if (result == TM_Invisible)
3163  {
3164  UnlockReleaseBuffer(buffer);
3165  ereport(ERROR,
3166  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3167  errmsg("attempted to update invisible tuple")));
3168  }
3169  else if (result == TM_BeingModified && wait)
3170  {
3171  TransactionId xwait;
3172  uint16 infomask;
3173  bool can_continue = false;
3174 
3175  /*
3176  * XXX note that we don't consider the "no wait" case here. This
3177  * isn't a problem currently because no caller uses that case, but it
3178  * should be fixed if such a caller is introduced. It wasn't a
3179  * problem previously because this code would always wait, but now
3180  * that some tuple locks do not conflict with one of the lock modes we
3181  * use, it is possible that this case is interesting to handle
3182  * specially.
3183  *
3184  * This may cause failures with third-party code that calls
3185  * heap_update directly.
3186  */
3187 
3188  /* must copy state data before unlocking buffer */
3189  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3190  infomask = oldtup.t_data->t_infomask;
3191 
3192  /*
3193  * Now we have to do something about the existing locker. If it's a
3194  * multi, sleep on it; we might be awakened before it is completely
3195  * gone (or even not sleep at all in some cases); we need to preserve
3196  * it as locker, unless it is gone completely.
3197  *
3198  * If it's not a multi, we need to check for sleeping conditions
3199  * before actually going to sleep. If the update doesn't conflict
3200  * with the locks, we just continue without sleeping (but making sure
3201  * it is preserved).
3202  *
3203  * Before sleeping, we need to acquire tuple lock to establish our
3204  * priority for the tuple (see heap_lock_tuple). LockTuple will
3205  * release us when we are next-in-line for the tuple. Note we must
3206  * not acquire the tuple lock until we're sure we're going to sleep;
3207  * otherwise we're open for race conditions with other transactions
3208  * holding the tuple lock which sleep on us.
3209  *
3210  * If we are forced to "start over" below, we keep the tuple lock;
3211  * this arranges that we stay at the head of the line while rechecking
3212  * tuple state.
3213  */
3214  if (infomask & HEAP_XMAX_IS_MULTI)
3215  {
3216  TransactionId update_xact;
3217  int remain;
3218  bool current_is_member = false;
3219 
3220  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3221  *lockmode, &current_is_member))
3222  {
3223  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3224 
3225  /*
3226  * Acquire the lock, if necessary (but skip it when we're
3227  * requesting a lock and already have one; avoids deadlock).
3228  */
3229  if (!current_is_member)
3230  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3231  LockWaitBlock, &have_tuple_lock);
3232 
3233  /* wait for multixact */
3234  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3235  relation, &oldtup.t_self, XLTW_Update,
3236  &remain);
3237  checked_lockers = true;
3238  locker_remains = remain != 0;
3240 
3241  /*
3242  * If xwait had just locked the tuple then some other xact
3243  * could update this tuple before we get to this point. Check
3244  * for xmax change, and start over if so.
3245  */
3247  infomask) ||
3249  xwait))
3250  goto l2;
3251  }
3252 
3253  /*
3254  * Note that the multixact may not be done by now. It could have
3255  * surviving members; our own xact or other subxacts of this
3256  * backend, and also any other concurrent transaction that locked
3257  * the tuple with LockTupleKeyShare if we only got
3258  * LockTupleNoKeyExclusive. If this is the case, we have to be
3259  * careful to mark the updated tuple with the surviving members in
3260  * Xmax.
3261  *
3262  * Note that there could have been another update in the
3263  * MultiXact. In that case, we need to check whether it committed
3264  * or aborted. If it aborted we are safe to update it again;
3265  * otherwise there is an update conflict, and we have to return
3266  * TableTuple{Deleted, Updated} below.
3267  *
3268  * In the LockTupleExclusive case, we still need to preserve the
3269  * surviving members: those would include the tuple locks we had
3270  * before this one, which are important to keep in case this
3271  * subxact aborts.
3272  */
3274  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3275  else
3276  update_xact = InvalidTransactionId;
3277 
3278  /*
3279  * There was no UPDATE in the MultiXact; or it aborted. No
3280  * TransactionIdIsInProgress() call needed here, since we called
3281  * MultiXactIdWait() above.
3282  */
3283  if (!TransactionIdIsValid(update_xact) ||
3284  TransactionIdDidAbort(update_xact))
3285  can_continue = true;
3286  }
3287  else if (TransactionIdIsCurrentTransactionId(xwait))
3288  {
3289  /*
3290  * The only locker is ourselves; we can avoid grabbing the tuple
3291  * lock here, but must preserve our locking information.
3292  */
3293  checked_lockers = true;
3294  locker_remains = true;
3295  can_continue = true;
3296  }
3297  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3298  {
3299  /*
3300  * If it's just a key-share locker, and we're not changing the key
3301  * columns, we don't need to wait for it to end; but we need to
3302  * preserve it as locker.
3303  */
3304  checked_lockers = true;
3305  locker_remains = true;
3306  can_continue = true;
3307  }
3308  else
3309  {
3310  /*
3311  * Wait for regular transaction to end; but first, acquire tuple
3312  * lock.
3313  */
3314  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3315  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3316  LockWaitBlock, &have_tuple_lock);
3317  XactLockTableWait(xwait, relation, &oldtup.t_self,
3318  XLTW_Update);
3319  checked_lockers = true;
3321 
3322  /*
3323  * xwait is done, but if xwait had just locked the tuple then some
3324  * other xact could update this tuple before we get to this point.
3325  * Check for xmax change, and start over if so.
3326  */
3327  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3328  !TransactionIdEquals(xwait,
3330  goto l2;
3331 
3332  /* Otherwise check if it committed or aborted */
3333  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3334  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3335  can_continue = true;
3336  }
3337 
 /*
 * Translate the outcome of the wait into a result code: a t_ctid that no
 * longer equals the tuple's own location is reported as TM_Updated,
 * otherwise the conflict is reported as TM_Deleted.
 */
3338  if (can_continue)
3339  result = TM_Ok;
3340  else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3341  result = TM_Updated;
3342  else
3343  result = TM_Deleted;
3344  }
3345 
3346  /* Sanity check the result of HeapTupleSatisfiesUpdate() and the logic above */
3347  if (result != TM_Ok)
3348  {
3349  Assert(result == TM_SelfModified ||
3350  result == TM_Updated ||
3351  result == TM_Deleted ||
3352  result == TM_BeingModified);
3353  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3354  Assert(result != TM_Updated ||
3355  !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3356  }
3357 
3358  if (crosscheck != InvalidSnapshot && result == TM_Ok)
3359  {
3360  /* Perform additional check for transaction-snapshot mode RI updates */
3361  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3362  result = TM_Updated;
3363  }
3364 
 /*
 * Failure exit: hand the conflict details back through *tmfd, then drop
 * the buffer, the tuple lock (if taken), the visibility map pin (if any)
 * and the attribute bitmaps before returning the failure code.
 */
3365  if (result != TM_Ok)
3366  {
3367  tmfd->ctid = oldtup.t_data->t_ctid;
3368  tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3369  if (result == TM_SelfModified)
3370  tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3371  else
3372  tmfd->cmax = InvalidCommandId;
3373  UnlockReleaseBuffer(buffer);
3374  if (have_tuple_lock)
3375  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3376  if (vmbuffer != InvalidBuffer)
3377  ReleaseBuffer(vmbuffer);
3378  *update_indexes = TU_None;
3379 
3380  bms_free(hot_attrs);
3381  bms_free(sum_attrs);
3382  bms_free(key_attrs);
3383  bms_free(id_attrs);
3384  bms_free(modified_attrs);
3385  bms_free(interesting_attrs);
3386  return result;
3387  }
3388 
3389  /*
3390  * If we didn't pin the visibility map page and the page has become all
3391  * visible while we were busy locking the buffer, or during some
3392  * subsequent window during which we had it unlocked, we'll have to unlock
3393  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3394  * bit unfortunate, especially since we'll now have to recheck whether the
3395  * tuple has been locked or updated under us, but hopefully it won't
3396  * happen very often.
3397  */
3398  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3399  {
3400  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3401  visibilitymap_pin(relation, block, &vmbuffer);
3403  goto l2;
3404  }
3405 
3406  /* Fill in transaction status data */
3407 
3408  /*
3409  * If the tuple we're updating is locked, we need to preserve the locking
3410  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3411  */
3413  oldtup.t_data->t_infomask,
3414  oldtup.t_data->t_infomask2,
3415  xid, *lockmode, true,
3416  &xmax_old_tuple, &infomask_old_tuple,
3417  &infomask2_old_tuple);
3418 
3419  /*
3420  * And also prepare an Xmax value for the new copy of the tuple. If there
3421  * was no xmax previously, or there was one but all lockers are now gone,
3422  * then use InvalidTransactionId; otherwise, get the xmax from the old
3423  * tuple. (In rare cases that might also be InvalidTransactionId and yet
3424  * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3425  */
3426  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3428  (checked_lockers && !locker_remains))
3429  xmax_new_tuple = InvalidTransactionId;
3430  else
3431  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3432 
3433  if (!TransactionIdIsValid(xmax_new_tuple))
3434  {
3435  infomask_new_tuple = HEAP_XMAX_INVALID;
3436  infomask2_new_tuple = 0;
3437  }
3438  else
3439  {
3440  /*
3441  * If we found a valid Xmax for the new tuple, then the infomask bits
3442  * to use on the new tuple depend on what was there on the old one.
3443  * Note that since we're doing an update, the only possibility is that
3444  * the lockers had FOR KEY SHARE lock.
3445  */
3446  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3447  {
3448  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3449  &infomask2_new_tuple);
3450  }
3451  else
3452  {
3453  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3454  infomask2_new_tuple = 0;
3455  }
3456  }
3457 
3458  /*
3459  * Prepare the new tuple with the appropriate initial values of Xmin and
3460  * Xmax, as well as initial infomask bits as computed above.
3461  */
3462  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3463  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3464  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3465  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3466  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3467  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3468  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3469 
3470  /*
3471  * Replace cid with a combo CID if necessary. Note that we already put
3472  * the plain cid into the new tuple.
3473  */
3474  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3475 
3476  /*
3477  * If the toaster needs to be activated, OR if the new tuple will not fit
3478  * on the same page as the old, then we need to release the content lock
3479  * (but not the pin!) on the old tuple's buffer while we are off doing
3480  * TOAST and/or table-file-extension work. We must mark the old tuple to
3481  * show that it's locked, else other processes may try to update it
3482  * themselves.
3483  *
3484  * We need to invoke the toaster if there are already any out-of-line
3485  * toasted values present, or if the new tuple is over-threshold.
3486  */
3487  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3488  relation->rd_rel->relkind != RELKIND_MATVIEW)
3489  {
3490  /* toast table entries should never be recursively toasted */
3491  Assert(!HeapTupleHasExternal(&oldtup));
3492  Assert(!HeapTupleHasExternal(newtup));
3493  need_toast = false;
3494  }
3495  else
3496  need_toast = (HeapTupleHasExternal(&oldtup) ||
3497  HeapTupleHasExternal(newtup) ||
3498  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3499 
3500  pagefree = PageGetHeapFreeSpace(page);
3501 
3502  newtupsize = MAXALIGN(newtup->t_len);
3503 
3504  if (need_toast || newtupsize > pagefree)
3505  {
3506  TransactionId xmax_lock_old_tuple;
3507  uint16 infomask_lock_old_tuple,
3508  infomask2_lock_old_tuple;
3509  bool cleared_all_frozen = false;
3510 
3511  /*
3512  * To prevent concurrent sessions from updating the tuple, we have to
3513  * temporarily mark it locked, while we release the page-level lock.
3514  *
3515  * To satisfy the rule that any xid potentially appearing in a buffer
3516  * written out to disk, we unfortunately have to WAL log this
3517  * temporary modification. We can reuse xl_heap_lock for this
3518  * purpose. If we crash/error before following through with the
3519  * actual update, xmax will be of an aborted transaction, allowing
3520  * other sessions to proceed.
3521  */
3522 
3523  /*
3524  * Compute xmax / infomask appropriate for locking the tuple. This has
3525  * to be done separately from the combo that's going to be used for
3526  * updating, because the potentially created multixact would otherwise
3527  * be wrong.
3528  */
3530  oldtup.t_data->t_infomask,
3531  oldtup.t_data->t_infomask2,
3532  xid, *lockmode, false,
3533  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3534  &infomask2_lock_old_tuple);
3535 
3536  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3537 
3539 
3540  /* Clear obsolete visibility flags ... */
3541  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3542  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3543  HeapTupleClearHotUpdated(&oldtup);
3544  /* ... and store info about transaction updating this tuple */
3545  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3546  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3547  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3548  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3549  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3550 
3551  /* temporarily make it look not-updated, but locked */
3552  oldtup.t_data->t_ctid = oldtup.t_self;
3553 
3554  /*
3555  * Clear all-frozen bit on visibility map if needed. We could
3556  * immediately reset ALL_VISIBLE, but given that the WAL logging
3557  * overhead would be unchanged, that doesn't seem necessarily
3558  * worthwhile.
3559  */
3560  if (PageIsAllVisible(page) &&
3561  visibilitymap_clear(relation, block, vmbuffer,
3563  cleared_all_frozen = true;
3564 
3565  MarkBufferDirty(buffer);
3566 
3567  if (RelationNeedsWAL(relation))
3568  {
3569  xl_heap_lock xlrec;
3570  XLogRecPtr recptr;
3571 
3572  XLogBeginInsert();
3573  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3574 
3575  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3576  xlrec.xmax = xmax_lock_old_tuple;
3578  oldtup.t_data->t_infomask2);
3579  xlrec.flags =
3580  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3581  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3582  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3583  PageSetLSN(page, recptr);
3584  }
3585 
 /* NOTE(review): the statement opening this critical section is not visible in this listing -- confirm against the full file. */
3586  END_CRIT_SECTION();
3587 
3588  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3589 
3590  /*
3591  * Let the toaster do its thing, if needed.
3592  *
3593  * Note: below this point, heaptup is the data we actually intend to
3594  * store into the relation; newtup is the caller's original untoasted
3595  * data.
3596  */
3597  if (need_toast)
3598  {
3599  /* Note we always use WAL and FSM during updates */
3600  heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3601  newtupsize = MAXALIGN(heaptup->t_len);
3602  }
3603  else
3604  heaptup = newtup;
3605 
3606  /*
3607  * Now, do we need a new page for the tuple, or not? This is a bit
3608  * tricky since someone else could have added tuples to the page while
3609  * we weren't looking. We have to recheck the available space after
3610  * reacquiring the buffer lock. But don't bother to do that if the
3611  * former amount of free space is still not enough; it's unlikely
3612  * there's more free now than before.
3613  *
3614  * What's more, if we need to get a new page, we will need to acquire
3615  * buffer locks on both old and new pages. To avoid deadlock against
3616  * some other backend trying to get the same two locks in the other
3617  * order, we must be consistent about the order we get the locks in.
3618  * We use the rule "lock the lower-numbered page of the relation
3619  * first". To implement this, we must do RelationGetBufferForTuple
3620  * while not holding the lock on the old page, and we must rely on it
3621  * to get the locks on both pages in the correct order.
3622  *
3623  * Another consideration is that we need visibility map page pin(s) if
3624  * we will have to clear the all-visible flag on either page. If we
3625  * call RelationGetBufferForTuple, we rely on it to acquire any such
3626  * pins; but if we don't, we have to handle that here. Hence we need
3627  * a loop.
3628  */
3629  for (;;)
3630  {
3631  if (newtupsize > pagefree)
3632  {
3633  /* It doesn't fit, must use RelationGetBufferForTuple. */
3634  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3635  buffer, 0, NULL,
3636  &vmbuffer_new, &vmbuffer,
3637  0);
3638  /* We're all done. */
3639  break;
3640  }
3641  /* Acquire VM page pin if needed and we don't have it. */
3642  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3643  visibilitymap_pin(relation, block, &vmbuffer);
3644  /* Re-acquire the lock on the old tuple's page. */
3646  /* Re-check using the up-to-date free space */
3647  pagefree = PageGetHeapFreeSpace(page);
3648  if (newtupsize > pagefree ||
3649  (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3650  {
3651  /*
3652  * Rats, it doesn't fit anymore, or somebody just now set the
3653  * all-visible flag. We must now unlock and loop to avoid
3654  * deadlock. Fortunately, this path should seldom be taken.
3655  */
3656  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3657  }
3658  else
3659  {
3660  /* We're all done. */
3661  newbuf = buffer;
3662  break;
3663  }
3664  }
3665  }
3666  else
3667  {
3668  /* No TOAST work needed, and it'll fit on same page */
3669  newbuf = buffer;
3670  heaptup = newtup;
3671  }
3672 
3673  /*
3674  * We're about to do the actual update -- check for conflict first, to
3675  * avoid possibly having to roll back work we've just done.
3676  *
3677  * This is safe without a recheck as long as there is no possibility of
3678  * another process scanning the pages between this check and the update
3679  * being visible to the scan (i.e., exclusive buffer content lock(s) are
3680  * continuously held from this point until the tuple update is visible).
3681  *
3682  * For the new tuple the only check needed is at the relation level, but
3683  * since both tuples are in the same relation and the check for oldtup
3684  * will include checking the relation level, there is no benefit to a
3685  * separate check for the new tuple.
3686  */
3687  CheckForSerializableConflictIn(relation, &oldtup.t_self,
3688  BufferGetBlockNumber(buffer));
3689 
3690  /*
3691  * At this point newbuf and buffer are both pinned and locked, and newbuf
3692  * has enough space for the new tuple. If they are the same buffer, only
3693  * one pin is held.
3694  */
3695 
3696  if (newbuf == buffer)
3697  {
3698  /*
3699  * Since the new tuple is going into the same page, we might be able
3700  * to do a HOT update. Check if any of the index columns have been
3701  * changed.
3702  */
3703  if (!bms_overlap(modified_attrs, hot_attrs))
3704  {
3705  use_hot_update = true;
3706 
3707  /*
3708  * If none of the columns that are used in hot-blocking indexes
3709  * were updated, we can apply HOT, but we do still need to check
3710  * if we need to update the summarizing indexes, and update those
3711  * indexes if the columns were updated, or we may fail to detect
3712  * e.g. value bound changes in BRIN minmax indexes.
3713  */
3714  if (bms_overlap(modified_attrs, sum_attrs))
3715  summarized_update = true;
3716  }
3717  }
3718  else
3719  {
3720  /* Set a hint that the old page could use prune/defrag */
3721  PageSetFull(page);
3722  }
3723 
3724  /*
3725  * Compute replica identity tuple before entering the critical section so
3726  * we don't PANIC upon a memory allocation failure.
3727  * ExtractReplicaIdentity() will return NULL if nothing needs to be
3728  * logged. Pass old key required as true only if the replica identity key
3729  * columns are modified or it has external data.
3730  */
3731  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3732  bms_overlap(modified_attrs, id_attrs) ||
3733  id_has_external,
3734  &old_key_copied);
3735 
3736  /* NO EREPORT(ERROR) from here till changes are logged */
3738 
3739  /*
3740  * If this transaction commits, the old tuple will become DEAD sooner or
3741  * later. Set flag that this page is a candidate for pruning once our xid
3742  * falls below the OldestXmin horizon. If the transaction finally aborts,
3743  * the subsequent page pruning will be a no-op and the hint will be
3744  * cleared.
3745  *
3746  * XXX Should we set hint on newbuf as well? If the transaction aborts,
3747  * there would be a prunable tuple in the newbuf; but for now we choose
3748  * not to optimize for aborts. Note that heap_xlog_update must be kept in
3749  * sync if this decision changes.
3750  */
3751  PageSetPrunable(page, xid);
3752 
3753  if (use_hot_update)
3754  {
3755  /* Mark the old tuple as HOT-updated */
3756  HeapTupleSetHotUpdated(&oldtup);
3757  /* And mark the new tuple as heap-only */
3758  HeapTupleSetHeapOnly(heaptup);
3759  /* Mark the caller's copy too, in case different from heaptup */
3760  HeapTupleSetHeapOnly(newtup);
3761  }
3762  else
3763  {
3764  /* Make sure tuples are correctly marked as not-HOT */
3765  HeapTupleClearHotUpdated(&oldtup);
3766  HeapTupleClearHeapOnly(heaptup);
3767  HeapTupleClearHeapOnly(newtup);
3768  }
3769 
3770  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3771 
3772 
3773  /* Clear obsolete visibility flags, possibly set by ourselves above... */
3774  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3775  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3776  /* ... and store info about transaction updating this tuple */
3777  Assert(TransactionIdIsValid(xmax_old_tuple));
3778  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3779  oldtup.t_data->t_infomask |= infomask_old_tuple;
3780  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3781  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3782 
3783  /* record address of new tuple in t_ctid of old one */
3784  oldtup.t_data->t_ctid = heaptup->t_self;
3785 
3786  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3787  if (PageIsAllVisible(BufferGetPage(buffer)))
3788  {
3789  all_visible_cleared = true;
3791  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3792  vmbuffer, VISIBILITYMAP_VALID_BITS);
3793  }
3794  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3795  {
3796  all_visible_cleared_new = true;
3798  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3799  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3800  }
3801 
3802  if (newbuf != buffer)
3803  MarkBufferDirty(newbuf);
3804  MarkBufferDirty(buffer);
3805 
3806  /* XLOG stuff */
3807  if (RelationNeedsWAL(relation))
3808  {
3809  XLogRecPtr recptr;
3810 
3811  /*
3812  * For logical decoding we need combo CIDs to properly decode the
3813  * catalog.
3814  */
3816  {
3817  log_heap_new_cid(relation, &oldtup);
3818  log_heap_new_cid(relation, heaptup);
3819  }
3820 
3821  recptr = log_heap_update(relation, buffer,
3822  newbuf, &oldtup, heaptup,
3823  old_key_tuple,
3824  all_visible_cleared,
3825  all_visible_cleared_new);
 /* Stamp every page touched by the update with the LSN of the WAL record. */
3826  if (newbuf != buffer)
3827  {
3828  PageSetLSN(BufferGetPage(newbuf), recptr);
3829  }
3830  PageSetLSN(BufferGetPage(buffer), recptr);
3831  }
3832 
3833  END_CRIT_SECTION();
3834 
3835  if (newbuf != buffer)
3836  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3837  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3838 
3839  /*
3840  * Mark old tuple for invalidation from system caches at next command
3841  * boundary, and mark the new tuple for invalidation in case we abort. We
3842  * have to do this before releasing the buffer because oldtup is in the
3843  * buffer. (heaptup is all in local memory, but it's necessary to process
3844  * both tuple versions in one call to inval.c so we can avoid redundant
3845  * sinval messages.)
3846  */
3847  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3848 
3849  /* Now we can release the buffer(s) */
3850  if (newbuf != buffer)
3851  ReleaseBuffer(newbuf);
3852  ReleaseBuffer(buffer);
3853  if (BufferIsValid(vmbuffer_new))
3854  ReleaseBuffer(vmbuffer_new);
3855  if (BufferIsValid(vmbuffer))
3856  ReleaseBuffer(vmbuffer);
3857 
3858  /*
3859  * Release the lmgr tuple lock, if we had it.
3860  */
3861  if (have_tuple_lock)
3862  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3863 
3864  pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
3865 
3866  /*
3867  * If heaptup is a private copy, release it. Don't forget to copy t_self
3868  * back to the caller's image, too.
3869  */
3870  if (heaptup != newtup)
3871  {
3872  newtup->t_self = heaptup->t_self;
3873  heap_freetuple(heaptup);
3874  }
3875 
3876  /*
3877  * If it is a HOT update, the update may still need to update summarized
3878  * indexes, lest we fail to update those summaries and get incorrect
3879  * results (for example, minmax bounds of the block may change with this
3880  * update).
3881  */
3882  if (use_hot_update)
3883  {
3884  if (summarized_update)
3885  *update_indexes = TU_Summarizing;
3886  else
3887  *update_indexes = TU_None;
3888  }
3889  else
3890  *update_indexes = TU_All;
3891 
 /* Free the replica identity tuple only when old_key_copied was set by ExtractReplicaIdentity(). */
3892  if (old_key_tuple != NULL && old_key_copied)
3893  heap_freetuple(old_key_tuple);
3894 
 /* Release the attribute bitmaps built at the top of the routine. */
3895  bms_free(hot_attrs);
3896  bms_free(sum_attrs);
3897  bms_free(key_attrs);
3898  bms_free(id_attrs);
3899  bms_free(modified_attrs);
3900  bms_free(interesting_attrs);
3901 
3902  return TM_Ok;
3903 }
3904 
3905 /*
3906  * Check if the specified attribute's values are the same. Subroutine for
3907  * HeapDetermineColumnsInfo.
3908  */
3909 static bool
3910 heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
3911  bool isnull1, bool isnull2)
3912 {
3913  Form_pg_attribute att;
3914 
3915  /*
3916  * If one value is NULL and other is not, then they are certainly not
3917  * equal
3918  */
3919  if (isnull1 != isnull2)
3920  return false;
3921 
3922  /*
3923  * If both are NULL, they can be considered equal.
3924  */
3925  if (isnull1)
3926  return true;
3927 
3928  /*
3929  * We do simple binary comparison of the two datums. This may be overly
3930  * strict because there can be multiple binary representations for the
3931  * same logical value. But we should be OK as long as there are no false
3932  * positives. Using a type-specific equality operator is messy because
3933  * there could be multiple notions of equality in different operator
3934  * classes; furthermore, we cannot safely invoke user-defined functions
3935  * while holding exclusive buffer lock.
3936  */
3937  if (attrnum <= 0)
3938  {
3939  /* The only allowed system columns are OIDs, so do this */
3940  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3941  }
3942  else
3943  {
3944  Assert(attrnum <= tupdesc->natts);
3945  att = TupleDescAttr(tupdesc, attrnum - 1);
3946  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3947  }
3948 }
3949 
3950 /*
3951  * Check which columns are being updated.
3952  *
3953  * Given an updated tuple, determine (and return into the output bitmapset),
3954  * from those listed as interesting, the set of columns that changed.
3955  *
3956  * has_external indicates if any of the unmodified attributes (from those
3957  * listed as interesting) of the old tuple is a member of external_cols and is
3958  * stored externally.
      *
      * The returned set uses the same member numbering as interesting_cols (the
      * raw attidx values are what get added to it), and it is still NULL when no
      * column was reported as changed.  In the lines visible here *has_external
      * is only ever set to true, never cleared, so callers presumably initialize
      * it themselves -- TODO confirm.
      *
      * NOTE(review): the embedded listing numbers jump from 3960 to 3962 and from
      * 3974 to 3976.  The line that would carry this function's name and its
      * "relation" parameter, and the line declaring "attrnum", are not present
      * in this copy of the text; restore them from the real source file.
3959  */
3960 static Bitmapset *
3962  Bitmapset *interesting_cols,
3963  Bitmapset *external_cols,
3964  HeapTuple oldtup, HeapTuple newtup,
3965  bool *has_external)
3966 {
3967  int attidx;
3968  Bitmapset *modified = NULL;
3969  TupleDesc tupdesc = RelationGetDescr(relation);
3970 
     /*
      * Visit each member of interesting_cols; the loop stops as soon as
      * bms_next_member() returns a negative value.
      */
3971  attidx = -1;
3972  while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
3973  {
3974  /* attidx is zero-based, attrnum is the normal attribute number */
3976  Datum value1,
3977  value2;
3978  bool isnull1,
3979  isnull2;
3980 
3981  /*
3982  * If it's a whole-tuple reference, say "not equal". It's not really
3983  * worth supporting this case, since it could only succeed after a
3984  * no-op update, which is hardly a case worth optimizing for.
3985  */
3986  if (attrnum == 0)
3987  {
3988  modified = bms_add_member(modified, attidx);
3989  continue;
3990  }
3991 
3992  /*
3993  * Likewise, automatically say "not equal" for any system attribute
3994  * other than tableOID; we cannot expect these to be consistent in a
3995  * HOT chain, or even to be set correctly yet in the new tuple.
3996  */
3997  if (attrnum < 0)
3998  {
3999  if (attrnum != TableOidAttributeNumber)
4000  {
4001  modified = bms_add_member(modified, attidx);
4002  continue;
4003  }
4004  }
     /* tableOID and all user columns fall through to the value comparison */
4005 
4006  /*
4007  * Extract the corresponding values. XXX this is pretty inefficient
4008  * if there are many indexed columns. Should we do a single
4009  * heap_deform_tuple call on each tuple, instead? But that doesn't
4010  * work for system columns ...
4011  */
4012  value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4013  value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4014 
     /* A differing value marks the column as modified; nothing more to check */
4015  if (!heap_attr_equals(tupdesc, attrnum, value1,
4016  value2, isnull1, isnull2))
4017  {
4018  modified = bms_add_member(modified, attidx);
4019  continue;
4020  }
4021 
4022  /*
4023  * No need to check attributes that can't be stored externally. Note
4024  * that system attributes can't be stored externally.
      *
      * Only unchanged, non-null user columns with attlen == -1 get past
      * this test.
4025  */
4026  if (attrnum < 0 || isnull1 ||
4027  TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1)
4028  continue;
4029 
4030  /*
4031  * Check if the old tuple's attribute is stored externally and is a
4032  * member of external_cols.
4033  */
4034  if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) &&
4035  bms_is_member(attidx, external_cols))
4036  *has_external = true;
4037  }
4038 
4039  return modified;
4040 }
4041 
4042 /*
4043  * simple_heap_update - replace a tuple
4044  *
4045  * This routine may be used to update a tuple when concurrent updates of
4046  * the target tuple are not expected (for example, because we have a lock
4047  * on the relation associated with the tuple). Any failure is reported
4048  * via ereport().
      *
      * The work is delegated to heap_update(), called with "wait for commit"
      * set to true.  relation, otid and tup are passed straight through, and
      * update_indexes is handed to heap_update() as its output argument.  Every
      * result other than TM_Ok is turned into an elog(ERROR, ...) call; the
      * locally declared tmfd and lockmode are filled by heap_update() but not
      * examined afterwards.
      *
      * NOTE(review): the embedded listing numbers jump from 4050 to 4052 and from
      * 4058 to 4060.  The line carrying the function name and its leading
      * parameters, and one line of heap_update() arguments, are not present in
      * this copy of the text; restore them from the real source file.
4049  */
4050 void
4052  TU_UpdateIndexes *update_indexes)
4053 {
4054  TM_Result result;
4055  TM_FailureData tmfd;
4056  LockTupleMode lockmode;
4057 
4058  result = heap_update(relation, otid, tup,
4060  true /* wait for commit */ ,
4061  &tmfd, &lockmode, update_indexes);
     /* Translate the outcome: only TM_Ok is accepted silently */
4062  switch (result)
4063  {
4064  case TM_SelfModified:
4065  /* Tuple was already updated in current command? */
4066  elog(ERROR, "tuple already updated by self");
4067  break;
4068 
4069  case TM_Ok:
4070  /* done successfully */
4071  break;
4072 
4073  case TM_Updated:
4074  elog(ERROR, "tuple concurrently updated");
4075  break;
4076 
4077  case TM_Deleted:
4078  elog(ERROR, "tuple concurrently deleted");
4079  break;
4080 
4081  default:
     /* any other TM_Result value is unexpected here */
4082  elog(ERROR, "unrecognized heap_update status: %u", result);
4083  break;
4084  }
4085 }
4086 
4087 
4088 /*
4089  * Return the MultiXactStatus corresponding to the given tuple lock mode.
      *
      * The value is read from tupleLockExtraInfo[mode]: the "updstatus" field
      * when is_update is true, the "lockstatus" field otherwise.  A stored value
      * of -1 is treated as an invalid mode/is_update combination and is reported
      * through elog(ERROR, ...); any other value is cast to MultiXactStatus and
      * returned.
      *
      * NOTE(review): the embedded listing numbers jump from 4091 to 4093.  The
      * line carrying the function name and the parameters used below as "mode"
      * and "is_update" is not present in this copy of the text; restore it from
      * the real source file.
4090  */
4091 static MultiXactStatus
4093 {
4094  int retval;
4095 
4096  if (is_update)
4097  retval = tupleLockExtraInfo[mode].updstatus;
4098  else
4099  retval = tupleLockExtraInfo[mode].lockstatus;
4100 
     /* -1 is the table's marker for "no status for this combination" */
4101  if (retval == -1)
4102  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4103  is_update ? "true" : "false");
4104 
4105  return (MultiXactStatus) retval;
4106 }
4107 
4108 /*
4109  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4110  *
4111  * Note that this acquires a buffer pin, which the caller must release.
4112  *
4113  * Input parameters:
4114  * relation: relation containing tuple (caller must hold suitable lock)
4115  * tid: TID of tuple to lock
4116  * cid: current command ID (used for visibility test, and stored into
4117  * tuple's cmax if lock is successful)
4118  * mode: indicates if shared or exclusive tuple lock is desired
4119  * wait_policy: what to do if tuple lock is not available
4120  * follow_updates: if true, follow the update chain to also lock descendant
4121  * tuples.
4122  *
4123  * Output parameters:
4124  * *tuple: all fields filled in
4125  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4126  * *tmfd: filled in failure cases (see below)
4127  *
4128  * Function results are the same as the ones for table_tuple_lock().
4129  *
4130  * In the failure cases other than TM_Invisible, the routine fills
4131  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4132  * if necessary), and t_cmax (the last only for TM_SelfModified,
4133  * since we cannot obtain cmax from a combo CID generated by another
4134  * transaction).
4135  * See comments for struct TM_FailureData for additional info.
4136  *
4137  * See README.tuplock for a thorough explanation of this mechanism.
      *
      * Control flow as visible in this listing: HeapTupleSatisfiesUpdate() is
      * evaluated (and re-evaluated after every "start over") at label l3;
      * label "failed" fills *tmfd for results other than TM_Ok; label
      * "out_locked" drops the buffer content lock; label "out_unlocked"
      * releases the visibility map pin and, if it was taken, the heavyweight
      * tuple lock.  None of the visible lines releases the pin on *buffer.
      *
      * NOTE(review): this copy of the text is missing a number of source lines.
      * The embedded listing numbers skip 4140, 4173, 4296, 4359, 4365, 4370,
      * 4405, 4412, 4433, 4435, 4445, 4449, 4470, 4473, 4475, 4478, 4495, 4520,
      * 4526, 4540, 4546, 4551, 4586, 4606, 4612, 4617, 4625, 4652-4653, 4700,
      * 4715, 4725-4726, 4743 and 4759.  Several statements below are therefore
      * visibly truncated: the function name line is gone, "status" is used
      * without a visible declaration, some conditions have unbalanced
      * parentheses, and END_CRIT_SECTION() has no visible counterpart.  Restore
      * the missing lines from the real source file before relying on this text.
4138  */
4139 TM_Result
4141  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4142  bool follow_updates,
4143  Buffer *buffer, TM_FailureData *tmfd)
4144 {
4145  TM_Result result;
4146  ItemPointer tid = &(tuple->t_self);
4147  ItemId lp;
4148  Page page;
4149  Buffer vmbuffer = InvalidBuffer;
4150  BlockNumber block;
4151  TransactionId xid,
4152  xmax;
4153  uint16 old_infomask,
4154  new_infomask,
4155  new_infomask2;
4156  bool first_time = true;
4157  bool skip_tuple_lock = false;
4158  bool have_tuple_lock = false;
4159  bool cleared_all_frozen = false;
4160 
4161  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4162  block = ItemPointerGetBlockNumber(tid);
4163 
4164  /*
4165  * Before locking the buffer, pin the visibility map page if it appears to
4166  * be necessary. Since we haven't got the lock yet, someone else might be
4167  * in the middle of changing this, so we'll need to recheck after we have
4168  * the lock.
4169  */
4170  if (PageIsAllVisible(BufferGetPage(*buffer)))
4171  visibilitymap_pin(relation, block, &vmbuffer);
4172 
     /*
      * NOTE(review): listing line 4173 is absent here; presumably it takes the
      * buffer content lock, since the code below reads the page and later
      * unlocks the buffer -- confirm against the real source.
      */
4174 
4175  page = BufferGetPage(*buffer);
4176  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4177  Assert(ItemIdIsNormal(lp));
4178 
4179  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4180  tuple->t_len = ItemIdGetLength(lp);
4181  tuple->t_tableOid = RelationGetRelid(relation);
4182 
     /* Restart point: every "start over" below jumps back to this label */
4183 l3:
4184  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4185 
4186  if (result == TM_Invisible)
4187  {
4188  /*
4189  * This is possible, but only when locking a tuple for ON CONFLICT
4190  * UPDATE. We return this value here rather than throwing an error in
4191  * order to give that case the opportunity to throw a more specific
4192  * error.
4193  */
4194  result = TM_Invisible;
4195  goto out_locked;
4196  }
4197  else if (result == TM_BeingModified ||
4198  result == TM_Updated ||
4199  result == TM_Deleted)
4200  {
4201  TransactionId xwait;
4202  uint16 infomask;
4203  uint16 infomask2;
4204  bool require_sleep;
4205  ItemPointerData t_ctid;
4206 
4207  /* must copy state data before unlocking buffer */
4208  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4209  infomask = tuple->t_data->t_infomask;
4210  infomask2 = tuple->t_data->t_infomask2;
4211  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4212 
4213  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4214 
4215  /*
4216  * If any subtransaction of the current top transaction already holds
4217  * a lock as strong as or stronger than what we're requesting, we
4218  * effectively hold the desired lock already. We *must* succeed
4219  * without trying to take the tuple lock, else we will deadlock
4220  * against anyone wanting to acquire a stronger lock.
4221  *
4222  * Note we only do this the first time we loop on the HTSU result;
4223  * there is no point in testing in subsequent passes, because
4224  * evidently our own transaction cannot have acquired a new lock after
4225  * the first time we checked.
4226  */
4227  if (first_time)
4228  {
4229  first_time = false;
4230 
4231  if (infomask & HEAP_XMAX_IS_MULTI)
4232  {
4233  int i;
4234  int nmembers;
4235  MultiXactMember *members;
4236 
4237  /*
4238  * We don't need to allow old multixacts here; if that had
4239  * been the case, HeapTupleSatisfiesUpdate would have returned
4240  * MayBeUpdated and we wouldn't be here.
4241  */
4242  nmembers =
4243  GetMultiXactIdMembers(xwait, &members, false,
4244  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4245 
4246  for (i = 0; i < nmembers; i++)
4247  {
4248  /* only consider members of our own transaction */
4249  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4250  continue;
4251 
4252  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4253  {
4254  pfree(members);
4255  result = TM_Ok;
4256  goto out_unlocked;
4257  }
4258  else
4259  {
4260  /*
4261  * Disable acquisition of the heavyweight tuple lock.
4262  * Otherwise, when promoting a weaker lock, we might
4263  * deadlock with another locker that has acquired the
4264  * heavyweight tuple lock and is waiting for our
4265  * transaction to finish.
4266  *
4267  * Note that in this case we still need to wait for
4268  * the multixact if required, to avoid acquiring
4269  * conflicting locks.
4270  */
4271  skip_tuple_lock = true;
4272  }
4273  }
4274 
4275  if (members)
4276  pfree(members);
4277  }
4278  else if (TransactionIdIsCurrentTransactionId(xwait))
4279  {
4280  switch (mode)
4281  {
4282  case LockTupleKeyShare:
4283  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4284  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4285  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4286  result = TM_Ok;
4287  goto out_unlocked;
4288  case LockTupleShare:
4289  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4290  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4291  {
4292  result = TM_Ok;
4293  goto out_unlocked;
4294  }
4295  break;
     /*
      * NOTE(review): listing line 4296 is absent here; a case label
      * is evidently missing between the break above and the test
      * below -- confirm against the real source.
      */
4297  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4298  {
4299  result = TM_Ok;
4300  goto out_unlocked;
4301  }
4302  break;
4303  case LockTupleExclusive:
4304  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4305  infomask2 & HEAP_KEYS_UPDATED)
4306  {
4307  result = TM_Ok;
4308  goto out_unlocked;
4309  }
4310  break;
4311  }
4312  }
4313  }
4314 
4315  /*
4316  * Initially assume that we will have to wait for the locking
4317  * transaction(s) to finish. We check various cases below in which
4318  * this can be turned off.
4319  */
4320  require_sleep = true;
4321  if (mode == LockTupleKeyShare)
4322  {
4323  /*
4324  * If we're requesting KeyShare, and there's no update present, we
4325  * don't need to wait. Even if there is an update, we can still
4326  * continue if the key hasn't been modified.
4327  *
4328  * However, if there are updates, we need to walk the update chain
4329  * to mark future versions of the row as locked, too. That way,
4330  * if somebody deletes that future version, we're protected
4331  * against the key going away. This locking of future versions
4332  * could block momentarily, if a concurrent transaction is
4333  * deleting a key; or it could return a value to the effect that
4334  * the transaction deleting the key has already committed. So we
4335  * do this before re-locking the buffer; otherwise this would be
4336  * prone to deadlocks.
4337  *
4338  * Note that the TID we're locking was grabbed before we unlocked
4339  * the buffer. For it to change while we're not looking, the
4340  * other properties we're testing for below after re-locking the
4341  * buffer would also change, in which case we would restart this
4342  * loop above.
4343  */
4344  if (!(infomask2 & HEAP_KEYS_UPDATED))
4345  {
4346  bool updated;
4347 
4348  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4349 
4350  /*
4351  * If there are updates, follow the update chain; bail out if
4352  * that cannot be done.
4353  */
4354  if (follow_updates && updated)
4355  {
4356  TM_Result res;
4357 
4358  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4360  mode);
4361  if (res != TM_Ok)
4362  {
4363  result = res;
4364  /* recovery code expects to have buffer lock held */
     /*
      * NOTE(review): listing line 4365 is absent here; presumably
      * it re-acquires the buffer lock, as the comment above
      * implies.  The same gap follows each later occurrence of
      * that comment -- confirm against the real source.
      */
4366  goto failed;
4367  }
4368  }
4369 
4371 
4372  /*
4373  * Make sure it's still an appropriate lock, else start over.
4374  * Also, if it wasn't updated before we released the lock, but
4375  * is updated now, we start over too; the reason is that we
4376  * now need to follow the update chain to lock the new
4377  * versions.
4378  */
4379  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4380  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4381  !updated))
4382  goto l3;
4383 
4384  /* Things look okay, so we can skip sleeping */
4385  require_sleep = false;
4386 
4387  /*
4388  * Note we allow Xmax to change here; other updaters/lockers
4389  * could have modified it before we grabbed the buffer lock.
4390  * However, this is not a problem, because with the recheck we
4391  * just did we ensure that they still don't conflict with the
4392  * lock we want.
4393  */
4394  }
4395  }
4396  else if (mode == LockTupleShare)
4397  {
4398  /*
4399  * If we're requesting Share, we can similarly avoid sleeping if
4400  * there's no update and no exclusive lock present.
4401  */
4402  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4403  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4404  {
4406 
4407  /*
4408  * Make sure it's still an appropriate lock, else start over.
4409  * See above about allowing xmax to change.
4410  */
4411  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4413  goto l3;
4414  require_sleep = false;
4415  }
4416  }
4417  else if (mode == LockTupleNoKeyExclusive)
4418  {
4419  /*
4420  * If we're requesting NoKeyExclusive, we might also be able to
4421  * avoid sleeping; just ensure that there is no conflicting lock
4422  * already acquired.
4423  */
4424  if (infomask & HEAP_XMAX_IS_MULTI)
4425  {
4426  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4427  mode, NULL))
4428  {
4429  /*
4430  * No conflict, but if the xmax changed under us in the
4431  * meantime, start over.
4432  */
4434  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4436  xwait))
4437  goto l3;
4438 
4439  /* otherwise, we're good */
4440  require_sleep = false;
4441  }
4442  }
4443  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4444  {
4446 
4447  /* if the xmax changed in the meantime, start over */
4448  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4450  xwait))
4451  goto l3;
4452  /* otherwise, we're good */
4453  require_sleep = false;
4454  }
4455  }
4456 
4457  /*
4458  * As a check independent from those above, we can also avoid sleeping
4459  * if the current transaction is the sole locker of the tuple. Note
4460  * that the strength of the lock already held is irrelevant; this is
4461  * not about recording the lock in Xmax (which will be done regardless
4462  * of this optimization, below). Also, note that the cases where we
4463  * hold a lock stronger than we are requesting are already handled
4464  * above by not doing anything.
4465  *
4466  * Note we only deal with the non-multixact case here; MultiXactIdWait
4467  * is well equipped to deal with this situation on its own.
4468  */
4469  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4471  {
4472  /* ... but if the xmax changed in the meantime, start over */
4474  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4476  xwait))
4477  goto l3;
4479  require_sleep = false;
4480  }
4481 
4482  /*
4483  * Time to sleep on the other transaction/multixact, if necessary.
4484  *
4485  * If the other transaction is an update/delete that's already
4486  * committed, then sleeping cannot possibly do any good: if we're
4487  * required to sleep, get out to raise an error instead.
4488  *
4489  * By here, we either have already acquired the buffer exclusive lock,
4490  * or we must wait for the locking transaction or multixact; so below
4491  * we ensure that we grab buffer lock after the sleep.
4492  */
4493  if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4494  {
4496  goto failed;
4497  }
4498  else if (require_sleep)
4499  {
4500  /*
4501  * Acquire tuple lock to establish our priority for the tuple, or
4502  * die trying. LockTuple will release us when we are next-in-line
4503  * for the tuple. We must do this even if we are share-locking,
4504  * but not if we already have a weaker lock on the tuple.
4505  *
4506  * If we are forced to "start over" below, we keep the tuple lock;
4507  * this arranges that we stay at the head of the line while
4508  * rechecking tuple state.
4509  */
4510  if (!skip_tuple_lock &&
4511  !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4512  &have_tuple_lock))
4513  {
4514  /*
4515  * This can only happen if wait_policy is Skip and the lock
4516  * couldn't be obtained.
4517  */
4518  result = TM_WouldBlock;
4519  /* recovery code expects to have buffer lock held */
4521  goto failed;
4522  }
4523 
4524  if (infomask & HEAP_XMAX_IS_MULTI)
4525  {
     /*
      * NOTE(review): listing line 4526 is absent here; "status", used
      * below, has no visible declaration -- confirm against the real
      * source.
      */
4527 
4528  /* We only ever lock tuples, never update them */
4529  if (status >= MultiXactStatusNoKeyUpdate)
4530  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4531 
4532  /* wait for multixact to end, or die trying */
4533  switch (wait_policy)
4534  {
4535  case LockWaitBlock:
4536  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4537  relation, &tuple->t_self, XLTW_Lock, NULL);
4538  break;
4539  case LockWaitSkip:
4541  status, infomask, relation,
4542  NULL))
4543  {
4544  result = TM_WouldBlock;
4545  /* recovery code expects to have buffer lock held */
4547  goto failed;
4548  }
4549  break;
4550  case LockWaitError:
4552  status, infomask, relation,
4553  NULL))
4554  ereport(ERROR,
4555  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4556  errmsg("could not obtain lock on row in relation \"%s\"",
4557  RelationGetRelationName(relation))));
4558 
4559  break;
4560  }
4561 
4562  /*
4563  * Of course, the multixact might not be done here: if we're
4564  * requesting a light lock mode, other transactions with light
4565  * locks could still be alive, as well as locks owned by our
4566  * own xact or other subxacts of this backend. We need to
4567  * preserve the surviving MultiXact members. Note that it
4568  * isn't absolutely necessary in the latter case, but doing so
4569  * is simpler.
4570  */
4571  }
4572  else
4573  {
4574  /* wait for regular transaction to end, or die trying */
4575  switch (wait_policy)
4576  {
4577  case LockWaitBlock:
4578  XactLockTableWait(xwait, relation, &tuple->t_self,
4579  XLTW_Lock);
4580  break;
4581  case LockWaitSkip:
4582  if (!ConditionalXactLockTableWait(xwait))
4583  {
4584  result = TM_WouldBlock;
4585  /* recovery code expects to have buffer lock held */
4587  goto failed;
4588  }
4589  break;
4590  case LockWaitError:
4591  if (!ConditionalXactLockTableWait(xwait))
4592  ereport(ERROR,
4593  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4594  errmsg("could not obtain lock on row in relation \"%s\"",
4595  RelationGetRelationName(relation))));
4596  break;
4597  }
4598  }
4599 
4600  /* if there are updates, follow the update chain */
4601  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4602  {
4603  TM_Result res;
4604 
4605  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4607  mode);
4608  if (res != TM_Ok)
4609  {
4610  result = res;
4611  /* recovery code expects to have buffer lock held */
4613  goto failed;
4614  }
4615  }
4616 
4618 
4619  /*
4620  * xwait is done, but if xwait had just locked the tuple then some
4621  * other xact could update this tuple before we get to this point.
4622  * Check for xmax change, and start over if so.
4623  */
4624  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4626  xwait))
4627  goto l3;
4628 
4629  if (!(infomask & HEAP_XMAX_IS_MULTI))
4630  {
4631  /*
4632  * Otherwise check if it committed or aborted. Note we cannot
4633  * be here if the tuple was only locked by somebody who didn't
4634  * conflict with us; that would have been handled above. So
4635  * that transaction must necessarily be gone by now. But
4636  * don't check for this in the multixact case, because some
4637  * locker transactions might still be running.
4638  */
4639  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4640  }
4641  }
4642 
4643  /* By here, we're certain that we hold buffer exclusive lock again */
4644 
4645  /*
4646  * We may lock if previous xmax aborted, or if it committed but only
4647  * locked the tuple without updating it; or if we didn't have to wait
4648  * at all for whatever reason.
4649  */
4650  if (!require_sleep ||
4651  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4654  result = TM_Ok;
4655  else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
4656  result = TM_Updated;
4657  else
4658  result = TM_Deleted;
4659  }
4660 
     /*
      * Common exit for unsuccessful outcomes: when result is not TM_Ok, fill
      * *tmfd from the tuple header and leave through out_locked.
      */
4661 failed:
4662  if (result != TM_Ok)
4663  {
4664  Assert(result == TM_SelfModified || result == TM_Updated ||
4665  result == TM_Deleted || result == TM_WouldBlock);
4666 
4667  /*
4668  * When locking a tuple under LockWaitSkip semantics and we fail with
4669  * TM_WouldBlock above, it's possible for concurrent transactions to
4670  * release the lock and set HEAP_XMAX_INVALID in the meantime. So
4671  * this assert is slightly different from the equivalent one in
4672  * heap_delete and heap_update.
4673  */
4674  Assert((result == TM_WouldBlock) ||
4675  !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4676  Assert(result != TM_Updated ||
4677  !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4678  tmfd->ctid = tuple->t_data->t_ctid;
4679  tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4680  if (result == TM_SelfModified)
4681  tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4682  else
4683  tmfd->cmax = InvalidCommandId;
4684  goto out_locked;
4685  }
4686 
4687  /*
4688  * If we didn't pin the visibility map page and the page has become all
4689  * visible while we were busy locking the buffer, or during some
4690  * subsequent window during which we had it unlocked, we'll have to unlock
4691  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4692  * unfortunate, especially since we'll now have to recheck whether the
4693  * tuple has been locked or updated under us, but hopefully it won't
4694  * happen very often.
4695  */
4696  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4697  {
4698  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4699  visibilitymap_pin(relation, block, &vmbuffer);
4701  goto l3;
4702  }
4703 
4704  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4705  old_infomask = tuple->t_data->t_infomask;
4706 
4707  /*
4708  * If this is the first possibly-multixact-able operation in the current
4709  * transaction, set my per-backend OldestMemberMXactId setting. We can be
4710  * certain that the transaction will never become a member of any older
4711  * MultiXactIds than that. (We have to do this even if we end up just
4712  * using our own TransactionId below, since some other backend could
4713  * incorporate our XID into a MultiXact immediately afterwards.)
4714  */
4716 
4717  /*
4718  * Compute the new xmax and infomask to store into the tuple. Note we do
4719  * not modify the tuple just yet, because that would leave it in the wrong
4720  * state if multixact.c elogs.
4721  */
4722  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4723  GetCurrentTransactionId(), mode, false,
4724  &xid, &new_infomask, &new_infomask2);
4725 
     /*
      * NOTE(review): listing lines 4725-4726 are absent here;
      * END_CRIT_SECTION() appears further down without a visible matching
      * start -- confirm against the real source.
      */
4727 
4728  /*
4729  * Store transaction information of xact locking the tuple.
4730  *
4731  * Note: Cmax is meaningless in this context, so don't set it; this avoids
4732  * possibly generating a useless combo CID. Moreover, if we're locking a
4733  * previously updated tuple, it's important to preserve the Cmax.
4734  *
4735  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4736  * we would break the HOT chain.
4737  */
4738  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4739  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4740  tuple->t_data->t_infomask |= new_infomask;
4741  tuple->t_data->t_infomask2 |= new_infomask2;
4742  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4744  HeapTupleHeaderSetXmax(tuple->t_data, xid);
4745 
4746  /*
4747  * Make sure there is no forward chain link in t_ctid. Note that in the
4748  * cases where the tuple has been updated, we must not overwrite t_ctid,
4749  * because it was set by the updater. Moreover, if the tuple has been
4750  * updated, we need to follow the update chain to lock the new versions of
4751  * the tuple as well.
4752  */
4753  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4754  tuple->t_data->t_ctid = *tid;
4755 
4756  /* Clear only the all-frozen bit on visibility map if needed */
4757  if (PageIsAllVisible(page) &&
4758  visibilitymap_clear(relation, block, vmbuffer,
4760  cleared_all_frozen = true;
4761 
4762 
4763  MarkBufferDirty(*buffer);
4764 
4765  /*
4766  * XLOG stuff. You might think that we don't need an XLOG record because
4767  * there is no state change worth restoring after a crash. You would be
4768  * wrong however: we have just written either a TransactionId or a
4769  * MultiXactId that may never have been seen on disk before, and we need
4770  * to make sure that there are XLOG entries covering those ID numbers.
4771  * Else the same IDs might be re-used after a crash, which would be
4772  * disastrous if this page made it to disk before the crash. Essentially
4773  * we have to enforce the WAL log-before-data rule even in this case.
4774  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4775  * entries for everything anyway.)
4776  */
4777  if (RelationNeedsWAL(relation))
4778  {
4779  xl_heap_lock xlrec;
4780  XLogRecPtr recptr;
4781 
4782  XLogBeginInsert();
4783  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4784 
4785  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4786  xlrec.xmax = xid;
4787  xlrec.infobits_set = compute_infobits(new_infomask,
4788  tuple->t_data->t_infomask2);
4789  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4790  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4791 
4792  /* we don't decode row locks atm, so no need to log the origin */
4793 
4794  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4795 
4796  PageSetLSN(page, recptr);
4797  }
4798 
4799  END_CRIT_SECTION();
4800 
4801  result = TM_Ok;
4802 
     /* Reached with the buffer content lock still held: drop it */
4803 out_locked:
4804  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4805 
     /* Reached directly by the early-success paths taken after the buffer was unlocked */
4806 out_unlocked:
4807  if (BufferIsValid(vmbuffer))
4808  ReleaseBuffer(vmbuffer);
4809 
4810  /*
4811  * Don't update the visibility map here. Locking a tuple doesn't change
4812  * visibility info.
4813  */
4814 
4815  /*
4816  * Now that we have successfully marked the tuple as locked, we can
4817  * release the lmgr tuple lock, if we had it.
4818  */
4819  if (have_tuple_lock)
4820  UnlockTupleTuplock(relation, tid, mode);
4821 
4822  return result;
4823 }
4824 
4825 /*
4826  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4827  * its normal, Xmax-based tuple lock.
4828  *
4829  * have_tuple_lock is an input and output parameter: on input, it indicates
4830  * whether the lock has previously been acquired (and this function does
4831  * nothing in that case). If this function returns success, have_tuple_lock
4832  * has been flipped to true.
4833  *
4834  * Returns false if it was unable to obtain the lock; this can only happen if
4835  * wait_policy is Skip.
      *
      * With wait_policy Block the lock is requested through LockTupleTuplock();
      * with Skip and Error it is requested through
      * ConditionalLockTupleTuplock().  A failed attempt under Error is reported
      * through ereport(ERROR) with ERRCODE_LOCK_NOT_AVAILABLE.  On the
      * "return false" path *have_tuple_lock is left untouched.
      *
      * NOTE(review): the embedded listing numbers jump from 4837 to 4839.  The
      * line carrying the function name and the parameters used below as
      * "relation", "tid" and "mode" is not present in this copy of the text;
      * restore it from the real source file.
4836  */
4837 static bool
4839  LockWaitPolicy wait_policy, bool *have_tuple_lock)
4840 {
     /* Already held from an earlier pass: nothing to do */
4841  if (*have_tuple_lock)
4842  return true;
4843 
4844  switch (wait_policy)
4845  {
4846  case LockWaitBlock:
4847  LockTupleTuplock(relation, tid, mode);
4848  break;
4849 
4850  case LockWaitSkip:
4851  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4852  return false;
4853  break;
4854 
4855  case LockWaitError:
4856  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4857  ereport(ERROR,
4858  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4859  errmsg("could not obtain lock on row in relation \"%s\"",
4860  RelationGetRelationName(relation))));
4861  break;
4862  }
     /* Every path that reaches this point is recorded as holding the lock */
4863  *have_tuple_lock = true;
4864 
4865  return true;
4866 }
4867 
4868 /*
4869  * Given an original set of Xmax and infomask, and a transaction (identified by
4870  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4871  * corresponding infomasks to use on the tuple.
4872  *
4873  * Note that this might have side effects such as creating a new MultiXactId.
4874  *
4875  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4876  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4877  * but it was not running anymore. There is a race condition, which is that the
4878  * MultiXactId may have finished since then, but that uncommon case is handled
4879  * either here, or within MultiXactIdExpand.
4880  *
4881  * There is a similar race condition possible when the old xmax was a regular
4882  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4883  * window, but it's still possible to end up creating an unnecessary
4884  * MultiXactId. Fortunately this is harmless.
      *
      * The outputs are stored through result_xmax, result_infomask and
      * result_infomask2 as the last step; nothing is returned directly.  The
      * "goto l5" jumps below restart the whole decision after adjusting the
      * local copy of old_infomask (and, in one case, mode).
      *
      * NOTE(review): this listing skips its own lines 4887, 4897, 4931 and 4985
      * inside this function.  Presumably 4887 carries the function name plus the
      * "xmax" and "old_infomask" parameters used below, 4931 is the case label
      * for the branch that sets HEAP_XMAX_EXCL_LOCK without HEAP_KEYS_UPDATED,
      * and 4985 is the middle of the condition that ends in "old_infomask)))".
      * Confirm against the upstream file before relying on this text.
4885  */
4886 static void
4888  uint16 old_infomask2, TransactionId add_to_xmax,
4889  LockTupleMode mode, bool is_update,
4890  TransactionId *result_xmax, uint16 *result_infomask,
4891  uint16 *result_infomask2)
4892 {
4893  TransactionId new_xmax;
4894  uint16 new_infomask,
4895  new_infomask2;
4896 
4898 
4899 l5:
4900  new_infomask = 0;
4901  new_infomask2 = 0;
4902  if (old_infomask & HEAP_XMAX_INVALID)
4903  {
4904  /*
4905  * No previous locker; we just insert our own TransactionId.
4906  *
4907  * Note that it's critical that this case be the first one checked,
4908  * because there are several blocks below that come back to this one
4909  * to implement certain optimizations; old_infomask might contain
4910  * other dirty bits in those cases, but we don't really care.
4911  */
4912  if (is_update)
4913  {
4914  new_xmax = add_to_xmax;
4915  if (mode == LockTupleExclusive)
4916  new_infomask2 |= HEAP_KEYS_UPDATED;
4917  }
4918  else
4919  {
4920  new_infomask |= HEAP_XMAX_LOCK_ONLY;
4921  switch (mode)
4922  {
4923  case LockTupleKeyShare:
4924  new_xmax = add_to_xmax;
4925  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4926  break;
4927  case LockTupleShare:
4928  new_xmax = add_to_xmax;
4929  new_infomask |= HEAP_XMAX_SHR_LOCK;
4930  break;
4932  new_xmax = add_to_xmax;
4933  new_infomask |= HEAP_XMAX_EXCL_LOCK;
4934  break;
4935  case LockTupleExclusive:
4936  new_xmax = add_to_xmax;
4937  new_infomask |= HEAP_XMAX_EXCL_LOCK;
4938  new_infomask2 |= HEAP_KEYS_UPDATED;
4939  break;
4940  default:
4941  new_xmax = InvalidTransactionId; /* silence compiler */
4942  elog(ERROR, "invalid lock mode");
4943  }
4944  }
4945  }
4946  else if (old_infomask & HEAP_XMAX_IS_MULTI)
4947  {
4948  MultiXactStatus new_status;
4949 
4950  /*
4951  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4952  * cross-check.
4953  */
4954  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4955 
4956  /*
4957  * A multixact together with LOCK_ONLY set but neither lock bit set
4958  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4959  * anymore. This check is critical for databases upgraded by
4960  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4961  * that such multis are never passed.
4962  */
4963  if (HEAP_LOCKED_UPGRADED(old_infomask))
4964  {
4965  old_infomask &= ~HEAP_XMAX_IS_MULTI;
4966  old_infomask |= HEAP_XMAX_INVALID;
4967  goto l5;
4968  }
4969 
4970  /*
4971  * If the XMAX is already a MultiXactId, then we need to expand it to
4972  * include add_to_xmax; but if all the members were lockers and are
4973  * all gone, we can do away with the IS_MULTI bit and just set
4974  * add_to_xmax as the only locker/updater. If all lockers are gone
4975  * and we have an updater that aborted, we can also do without a
4976  * multi.
4977  *
4978  * The cost of doing GetMultiXactIdMembers would be paid by
4979  * MultiXactIdExpand if we weren't to do this, so this check is not
4980  * incurring extra work anyhow.
4981  */
4982  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4983  {
4984  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4986  old_infomask)))
4987  {
4988  /*
4989  * Reset these bits and restart; otherwise fall through to
4990  * create a new multi below.
4991  */
4992  old_infomask &= ~HEAP_XMAX_IS_MULTI;
4993  old_infomask |= HEAP_XMAX_INVALID;
4994  goto l5;
4995  }
4996  }
4997 
4998  new_status = get_mxact_status_for_lock(mode, is_update);
4999 
5000  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5001  new_status);
5002  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5003  }
5004  else if (old_infomask & HEAP_XMAX_COMMITTED)
5005  {
5006  /*
5007  * It's a committed update, so we need to preserve it as updater of
5008  * the tuple.
5009  */
5010  MultiXactStatus status;
5011  MultiXactStatus new_status;
5012 
5013  if (old_infomask2 & HEAP_KEYS_UPDATED)
5014  status = MultiXactStatusUpdate;
5015  else
5016  status = MultiXactStatusNoKeyUpdate;
5017 
5018  new_status = get_mxact_status_for_lock(mode, is_update);
5019 
5020  /*
5021  * Since it's not running, it's obviously impossible for the old
5022  * updater to be identical to the current one, so we need not check
5023  * for that case as we do in the block above.
5024  */
5025  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5026  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5027  }
5028  else if (TransactionIdIsInProgress(xmax))
5029  {
5030  /*
5031  * If the XMAX is a valid, in-progress TransactionId, then we need to
5032  * create a new MultiXactId that includes both the old locker or
5033  * updater and our own TransactionId.
5034  */
5035  MultiXactStatus new_status;
5036  MultiXactStatus old_status;
5037  LockTupleMode old_mode;
5038 
5039  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5040  {
5041  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5042  old_status = MultiXactStatusForKeyShare;
5043  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5044  old_status = MultiXactStatusForShare;
5045  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5046  {
5047  if (old_infomask2 & HEAP_KEYS_UPDATED)
5048  old_status = MultiXactStatusForUpdate;
5049  else
5050  old_status = MultiXactStatusForNoKeyUpdate;
5051  }
5052  else
5053  {
5054  /*
5055  * LOCK_ONLY can be present alone only when a page has been
5056  * upgraded by pg_upgrade. But in that case,
5057  * TransactionIdIsInProgress() should have returned false. We
5058  * assume it's no longer locked in this case.
5059  */
5060  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5061  old_infomask |= HEAP_XMAX_INVALID;
5062  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5063  goto l5;
5064  }
5065  }
5066  else
5067  {
5068  /* it's an update, but which kind? */
5069  if (old_infomask2 & HEAP_KEYS_UPDATED)
5070  old_status = MultiXactStatusUpdate;
5071  else
5072  old_status = MultiXactStatusNoKeyUpdate;
5073  }
5074 
5075  old_mode = TUPLOCK_from_mxstatus(old_status);
5076 
5077  /*
5078  * If the lock to be acquired is for the same TransactionId as the
5079  * existing lock, there's an optimization possible: consider only the
5080  * strongest of both locks as the only one present, and restart.
5081  */
5082  if (xmax == add_to_xmax)
5083  {
5084  /*
5085  * Note that it's not possible for the original tuple to be
5086  * updated: we wouldn't be here because the tuple would have been
5087  * invisible and we wouldn't try to update it. As a subtlety,
5088  * this code can also run when traversing an update chain to lock
5089  * future versions of a tuple. But we wouldn't be here either,
5090  * because the add_to_xmax would be different from the original
5091  * updater.
5092  */
5093  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5094 
5095  /* acquire the strongest of both */
5096  if (mode < old_mode)
5097  mode = old_mode;
5098  /* mustn't touch is_update */
5099 
5100  old_infomask |= HEAP_XMAX_INVALID;
5101  goto l5;
5102  }
5103 
5104  /* otherwise, just fall back to creating a new multixact */
5105  new_status = get_mxact_status_for_lock(mode, is_update);
5106  new_xmax = MultiXactIdCreate(xmax, old_status,
5107  add_to_xmax, new_status);
5108  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5109  }
5110  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5111  TransactionIdDidCommit(xmax))
5112  {
5113  /*
5114  * It's a committed update, so we must preserve it as updater of the
5115  * tuple.
5116  */
5117  MultiXactStatus status;
5118  MultiXactStatus new_status;
5119 
5120  if (old_infomask2 & HEAP_KEYS_UPDATED)
5121  status = MultiXactStatusUpdate;
5122  else
5123  status = MultiXactStatusNoKeyUpdate;
5124 
5125  new_status = get_mxact_status_for_lock(mode, is_update);
5126 
5127  /*
5128  * Since it's not running, it's obviously impossible for the old
5129  * updater to be identical to the current one, so we need not check
5130  * for that case as we do in the block above.
5131  */
5132  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5133  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5134  }
5135  else
5136  {
5137  /*
5138  * Can get here iff the locking/updating transaction was running when
5139  * the infomask was extracted from the tuple, but finished before
5140  * TransactionIdIsInProgress got to run. Deal with it as if there was
5141  * no locker at all in the first place.
5142  */
5143  old_infomask |= HEAP_XMAX_INVALID;
5144  goto l5;
5145  }
5146 
      /* Hand the computed values back through the output parameters. */
5147  *result_infomask = new_infomask;
5148  *result_infomask2 = new_infomask2;
5149  *result_xmax = new_xmax;
5150 }
5151 
5152 /*
5153  * Subroutine for heap_lock_updated_tuple_rec.
5154  *
5155  * Given a hypothetical multixact status held by the transaction identified
5156  * with the given xid, does the current transaction need to wait, fail, or can
5157  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5158  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5159  * returned. If the lock is already held by the current transaction, return
5160  * TM_SelfModified. In case of a conflict with another transaction, a
5161  * different HeapTupleSatisfiesUpdate return code is returned.
5162  *
5163  * The held status is said to be hypothetical because it might correspond to a
5164  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5165  * way for simplicity of API.
      *
      * Note that *needwait is reset to false on entry, and that TM_Ok is also
      * the return value when *needwait has been set to true; callers must
      * therefore look at *needwait as well as the result.
      *
      * NOTE(review): this listing skips its own lines 5168-5169, 5182, 5199 and
      * 5233 inside this function.  Presumably 5168-5169 carry the function name
      * and the "status", "xid", "mode" and "tup" parameters used below, 5182 is
      * the "if" test guarding the TM_SelfModified branch, and 5199 and 5233 are
      * the opening halves of the two lock-conflict tests whose second argument
      * is LOCKMODE_from_mxstatus(wantedstatus).  Confirm against the upstream
      * file before relying on this text.
5166  */
5167 static TM_Result
5170  bool *needwait)
5171 {
5172  MultiXactStatus wantedstatus;
5173 
5174  *needwait = false;
5175  wantedstatus = get_mxact_status_for_lock(mode, false);
5176 
5177  /*
5178  * Note: we *must* check TransactionIdIsInProgress before
5179  * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5180  * for an explanation.
5181  */
5183  {
5184  /*
5185  * The tuple has already been locked by our own transaction. This is
5186  * very rare but can happen if multiple transactions are trying to
5187  * lock an ancient version of the same tuple.
5188  */
5189  return TM_SelfModified;
5190  }
5191  else if (TransactionIdIsInProgress(xid))
5192  {
5193  /*
5194  * If the locking transaction is running, what we do depends on
5195  * whether the lock modes conflict: if they do, then we must wait for
5196  * it to finish; otherwise we can fall through to lock this tuple
5197  * version without waiting.
5198  */
5200  LOCKMODE_from_mxstatus(wantedstatus)))
5201  {
5202  *needwait = true;
5203  }
5204 
5205  /*
5206  * If we set needwait above, then this value doesn't matter;
5207  * otherwise, this value signals to caller that it's okay to proceed.
5208  */
5209  return TM_Ok;
5210  }
5211  else if (TransactionIdDidAbort(xid))
5212  return TM_Ok;
5213  else if (TransactionIdDidCommit(xid))
5214  {
5215  /*
5216  * The other transaction committed. If it was only a locker, then the
5217  * lock is completely gone now and we can return success; but if it
5218  * was an update, then what we do depends on whether the two lock
5219  * modes conflict. If they conflict, then we must report error to
5220  * caller. But if they don't, we can fall through to allow the current
5221  * transaction to lock the tuple.
5222  *
5223  * Note: the reason we worry about ISUPDATE here is because as soon as
5224  * a transaction ends, all its locks are gone and meaningless, and
5225  * thus we can ignore them; whereas its updates persist. In the
5226  * TransactionIdIsInProgress case, above, we don't need to check
5227  * because we know the lock is still "alive" and thus a conflict needs
5228  * always be checked.
5229  */
5230  if (!ISUPDATE_from_mxstatus(status))
5231  return TM_Ok;
5232 
5234  LOCKMODE_from_mxstatus(wantedstatus)))
5235  {
5236  /*
      * Conflict: report TM_Updated when t_ctid points at a different
      * tuple than t_self, TM_Deleted when it points at itself.
      */
5237  if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5238  return TM_Updated;
5239  else
5240  return TM_Deleted;
5241  }
5242 
5243  return TM_Ok;
5244  }
5245 
5246  /* Not in progress, not aborted, not committed -- must have crashed */
5247  return TM_Ok;
5248 }
5249 
5250 
5251 /*
5252  * Recursive part of heap_lock_updated_tuple
5253  *
5254  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5255  * xid with the given mode; if this tuple is updated, recurse to lock the new
5256  * version as well.
      *
      * As far as the visible lines show, the "recursion" is carried out by the
      * for (;;) loop below, which advances tupid to t_ctid on each pass rather
      * than calling this function again.  The result is TM_Ok unless
      * test_lockmode_for_conflict() reports something else for some version.
      *
      * NOTE(review): this listing skips many of its own lines inside this
      * function: 5259-5260, 5301, 5309, 5317, 5332, 5334, 5342, 5354, 5391,
      * 5421, 5424, 5456, 5494, 5496, 5511, 5513, 5516, 5521, 5525, 5535, 5555,
      * 5557, 5566 and 5572.  Among other things the function name and its
      * parameters, several "if" conditions, the statement belonging to the
      * "else" at listing line 5455, and presumably the buffer lock/unlock and
      * START_CRIT_SECTION() calls are therefore not shown.  Do not treat the
      * text below as complete; confirm against the upstream file.
5257  */
5258 static TM_Result
5261 {
5262  TM_Result result;
5263  ItemPointerData tupid;
5264  HeapTupleData mytup;
5265  Buffer buf;
5266  uint16 new_infomask,
5267  new_infomask2,
5268  old_infomask,
5269  old_infomask2;
5270  TransactionId xmax,
5271  new_xmax;
5272  TransactionId priorXmax = InvalidTransactionId;
5273  bool cleared_all_frozen = false;
5274  bool pinned_desired_page;
5275  Buffer vmbuffer = InvalidBuffer;
5276  BlockNumber block;
5277 
5278  ItemPointerCopy(tid, &tupid);
5279 
5280  for (;;)
5281  {
5282  new_infomask = 0;
5283  new_xmax = InvalidTransactionId;
5284  block = ItemPointerGetBlockNumber(&tupid);
5285  ItemPointerCopy(&tupid, &(mytup.t_self));
5286 
5287  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5288  {
5289  /*
5290  * If we fail to find the updated version of the tuple, it's
5291  * because it was vacuumed/pruned away after its creator
5292  * transaction aborted. So behave as if we got to the end of the
5293  * chain, and there's no further tuple to lock: return success to
5294  * caller.
5295  */
5296  result = TM_Ok;
5297  goto out_unlocked;
5298  }
5299 
5300 l4:
5302 
5303  /*
5304  * Before locking the buffer, pin the visibility map page if it
5305  * appears to be necessary. Since we haven't got the lock yet,
5306  * someone else might be in the middle of changing this, so we'll need
5307  * to recheck after we have the lock.
5308  */
5310  {
5311  visibilitymap_pin(rel, block, &vmbuffer);
5312  pinned_desired_page = true;
5313  }
5314  else
5315  pinned_desired_page = false;
5316 
5318 
5319  /*
5320  * If we didn't pin the visibility map page and the page has become
5321  * all visible while we were busy locking the buffer, we'll have to
5322  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5323  * That's a bit unfortunate, but hopefully shouldn't happen often.
5324  *
5325  * Note: in some paths through this function, we will reach here
5326  * holding a pin on a vm page that may or may not be the one matching
5327  * this page. If this page isn't all-visible, we won't use the vm
5328  * page, but we hold onto such a pin till the end of the function.
5329  */
5330  if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5331  {
5333  visibilitymap_pin(rel, block, &vmbuffer);
5335  }
5336 
5337  /*
5338  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5339  * end of the chain, we're done, so return success.
5340  */
5341  if (TransactionIdIsValid(priorXmax) &&
5343  priorXmax))
5344  {
5345  result = TM_Ok;
5346  goto out_locked;
5347  }
5348 
5349  /*
5350  * Also check Xmin: if this tuple was created by an aborted
5351  * (sub)transaction, then we already locked the last live one in the
5352  * chain, thus we're done, so return success.
5353  */
5355  {
5356  result = TM_Ok;
5357  goto out_locked;
5358  }
5359 
5360  old_infomask = mytup.t_data->t_infomask;
5361  old_infomask2 = mytup.t_data->t_infomask2;
5362  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5363 
5364  /*
5365  * If this tuple version has been updated or locked by some concurrent
5366  * transaction(s), what we do depends on whether our lock mode
5367  * conflicts with what those other transactions hold, and also on the
5368  * status of them.
5369  */
5370  if (!(old_infomask & HEAP_XMAX_INVALID))
5371  {
5372  TransactionId rawxmax;
5373  bool needwait;
5374 
5375  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5376  if (old_infomask & HEAP_XMAX_IS_MULTI)
5377  {
5378  int nmembers;
5379  int i;
5380  MultiXactMember *members;
5381 
5382  /*
5383  * We don't need a test for pg_upgrade'd tuples: this is only
5384  * applied to tuples after the first in an update chain. Said
5385  * first tuple in the chain may well be locked-in-9.2-and-
5386  * pg_upgraded, but that one was already locked by our caller,
5387  * not us; and any subsequent ones cannot be because our
5388  * caller must necessarily have obtained a snapshot later than
5389  * the pg_upgrade itself.
5390  */
5392 
5393  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5394  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5395  for (i = 0; i < nmembers; i++)
5396  {
5397  result = test_lockmode_for_conflict(members[i].status,
5398  members[i].xid,
5399  mode,
5400  &mytup,
5401  &needwait);
5402 
5403  /*
5404  * If the tuple was already locked by ourselves in a
5405  * previous iteration of this (say heap_lock_tuple was
5406  * forced to restart the locking loop because of a change
5407  * in xmax), then we hold the lock already on this tuple
5408  * version and we don't need to do anything; and this is
5409  * not an error condition either. We just need to skip
5410  * this tuple and continue locking the next version in the
5411  * update chain.
5412  */
5413  if (result == TM_SelfModified)
5414  {
5415  pfree(members);
5416  goto next;
5417  }
5418 
5419  if (needwait)
5420  {
      /* After waiting, re-examine this same tuple from label l4. */
5422  XactLockTableWait(members[i].xid, rel,
5423  &mytup.t_self,
5425  pfree(members);
5426  goto l4;
5427  }
5428  if (result != TM_Ok)
5429  {
5430  pfree(members);
5431  goto out_locked;
5432  }
5433  }
5434  if (members)
5435  pfree(members);
5436  }
5437  else
5438  {
5439  MultiXactStatus status;
5440 
5441  /*
5442  * For a non-multi Xmax, we first need to compute the
5443  * corresponding MultiXactStatus by using the infomask bits.
5444  */
5445  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5446  {
5447  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5448  status = MultiXactStatusForKeyShare;
5449  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5450  status = MultiXactStatusForShare;
5451  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5452  {
5453  if (old_infomask2 & HEAP_KEYS_UPDATED)
5454  status = MultiXactStatusForUpdate;
5455  else
5457  }
5458  else
5459  {
5460  /*
5461  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5462  * as share-locked in the old cluster) shouldn't be
5463  * seen in the middle of an update chain.
5464  */
5465  elog(ERROR, "invalid lock status in tuple");
5466  }
5467  }
5468  else
5469  {
5470  /* it's an update, but which kind? */
5471  if (old_infomask2 & HEAP_KEYS_UPDATED)
5472  status = MultiXactStatusUpdate;
5473  else
5474  status = MultiXactStatusNoKeyUpdate;
5475  }
5476 
5477  result = test_lockmode_for_conflict(status, rawxmax, mode,
5478  &mytup, &needwait);
5479 
5480  /*
5481  * If the tuple was already locked by ourselves in a previous
5482  * iteration of this (say heap_lock_tuple was forced to
5483  * restart the locking loop because of a change in xmax), then
5484  * we hold the lock already on this tuple version and we don't
5485  * need to do anything; and this is not an error condition
5486  * either. We just need to skip this tuple and continue
5487  * locking the next version in the update chain.
5488  */
5489  if (result == TM_SelfModified)
5490  goto next;
5491 
5492  if (needwait)
5493  {
      /* After waiting, re-examine this same tuple from label l4. */
5495  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5497  goto l4;
5498  }
5499  if (result != TM_Ok)
5500  {
5501  goto out_locked;
5502  }
5503  }
5504  }
5505 
5506  /* compute the new Xmax and infomask values for the tuple ... */
5507  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5508  xid, mode, false,
5509  &new_xmax, &new_infomask, &new_infomask2);
5510 
5512  visibilitymap_clear(rel, block, vmbuffer,
5514  cleared_all_frozen = true;
5515 
5517 
5518  /* ... and set them */
5519  HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5520  mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5522  mytup.t_data->t_infomask |= new_infomask;
5523  mytup.t_data->t_infomask2 |= new_infomask2;
5524 
5526 
5527  /* XLOG stuff */
5528  if (RelationNeedsWAL(rel))
5529  {
5530  xl_heap_lock_updated xlrec;
5531  XLogRecPtr recptr;
5532  Page page = BufferGetPage(buf);
5533 
5534  XLogBeginInsert();
5536 
5537  xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5538  xlrec.xmax = new_xmax;
5539  xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5540  xlrec.flags =
5541  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5542 
5543  XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5544 
5545  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5546 
5547  PageSetLSN(page, recptr);
5548  }
5549 
5550  END_CRIT_SECTION();
5551 
5552 next:
5553  /* if we find the end of update chain, we're done. */
5554  if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5556  ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5558  {
5559  result = TM_Ok;
5560  goto out_locked;
5561  }
5562 
5563  /* tail recursion */
5564  priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5565  ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5567  }
5568 
5569  result = TM_Ok;
5570 
5571 out_locked:
5573 
5574 out_unlocked:
      /* Drop the visibility map pin, if one was ever taken. */
5575  if (vmbuffer != InvalidBuffer)
5576  ReleaseBuffer(vmbuffer);
5577 
5578  return result;
5579 }
5580 
5581 /*
5582  * heap_lock_updated_tuple
5583  * Follow update chain when locking an updated tuple, acquiring locks (row
5584  * marks) on the updated versions.
5585  *
5586  * The initial tuple is assumed to be already locked.
5587  *
5588  * This function doesn't check visibility; it just unconditionally marks the
5589  * tuple(s) as locked. If any tuple in the updated chain is being deleted
5590  * concurrently (or updated with the key being modified), sleep until the
5591  * transaction doing it is finished.
5592  *
5593  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5594  * when we have to wait for other transactions to release them, as opposed to
5595  * what heap_lock_tuple does. The reason is that having more than one
5596  * transaction walking the chain is probably uncommon enough that risk of
5597  * starvation is not likely: one of the preconditions for being here is that
5598  * the snapshot in use predates the update that created this tuple (because we
5599  * started at an earlier version of the tuple), but at the same time such a
5600  * transaction cannot be using repeatable read or serializable isolation
5601  * levels, because that would lead to a serializability failure.
      *
      * Returns whatever heap_lock_updated_tuple_rec() returns when there is a
      * chain to follow, and TM_Ok when there is nothing to lock.
      *
      * NOTE(review): this listing skips its own lines 5604-5605, 5611 and 5623
      * inside this function.  Presumably 5604-5605 carry the function name and
      * the "rel", "tuple", "ctid", "xid" and "mode" parameters used below, 5611
      * is the first half of the "if" condition, and 5623 is the call that the
      * OldestMemberMXactId comment describes.  Confirm against the upstream
      * file before relying on this text.
5602  */
5603 static TM_Result
5606 {
5607  /*
5608  * If the tuple has not been updated, or has moved into another partition
5609  * (effectively a delete) stop here.
5610  */
5612  !ItemPointerEquals(&tuple->t_self, ctid))
5613  {
5614  /*
5615  * If this is the first possibly-multixact-able operation in the
5616  * current transaction, set my per-backend OldestMemberMXactId
5617  * setting. We can be certain that the transaction will never become a
5618  * member of any older MultiXactIds than that. (We have to do this
5619  * even if we end up just using our own TransactionId below, since
5620  * some other backend could incorporate our XID into a MultiXact
5621  * immediately afterwards.)
5622  */
5624 
5625  return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5626  }
5627 
5628  /* nothing to lock */
5629  return TM_Ok;
5630 }
5631 
5632 /*
5633  * heap_finish_speculative - mark speculative insertion as successful
5634  *
5635  * To successfully finish a speculative insertion we have to clear the
5636  * speculative token from the tuple. To do so the t_ctid field, which will
5637  * contain a speculative token value, is modified in place to point to the
5638  * tuple itself, which is characteristic of a newly inserted ordinary tuple.
5639  *
5640  * NB: It is not ok to commit without either finishing or aborting a
5641  * speculative insertion. We could treat speculative tuples of committed
5642  * transactions implicitly as completed, but then we would have to be prepared
5643  * to deal with speculative tokens on committed tuples. That wouldn't be
5644  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5645  * but clearing the token at completion isn't very expensive either.
5646  * An explicit confirmation WAL record also makes logical decoding simpler.
      *
      * NOTE(review): this listing skips its own lines 5649, 5658, 5671, 5673 and
      * 5694 inside this function.  Presumably 5649 carries the function name and
      * the "relation" and "tid" parameters used below.  END_CRIT_SECTION() and
      * UnlockReleaseBuffer() appear below without visible counterparts, so the
      * matching START_CRIT_SECTION() and buffer-lock calls are presumably among
      * the missing lines.  Confirm against the upstream file.
5647  */
5648 void
5650 {
5651  Buffer buffer;
5652  Page page;
5653  OffsetNumber offnum;
5654  ItemId lp = NULL;
5655  HeapTupleHeader htup;
5656 
5657  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5659  page = (Page) BufferGetPage(buffer);
5660 
      /*
      * lp stays NULL when offnum lies beyond the page's last line pointer;
      * the check below tests the offset first, so a NULL lp is never passed
      * to ItemIdIsNormal().
      */
5661  offnum = ItemPointerGetOffsetNumber(tid);
5662  if (PageGetMaxOffsetNumber(page) >= offnum)
5663  lp = PageGetItemId(page, offnum);
5664 
5665  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5666  elog(ERROR, "invalid lp");
5667 
5668  htup = (HeapTupleHeader) PageGetItem(page, lp);
5669 
5670  /* NO EREPORT(ERROR) from here till changes are logged */
5672 
5674 
5675  MarkBufferDirty(buffer);
5676 
5677  /*
5678  * Replace the speculative insertion token with a real t_ctid, pointing to
5679  * itself like it does on regular tuples.
5680  */
5681  htup->t_ctid = *tid;
5682 
5683  /* XLOG stuff */
5684  if (RelationNeedsWAL(relation))
5685  {
5686  xl_heap_confirm xlrec;
5687  XLogRecPtr recptr;
5688 
5689  xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5690 
5691  XLogBeginInsert();
5692 
5693  /* We want the same filtering on this as on a plain insert */
5695 
5696  XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5697  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5698 
5699  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5700 
5701  PageSetLSN(page, recptr);
5702  }
5703 
5704  END_CRIT_SECTION();
5705 
5706  UnlockReleaseBuffer(buffer);
5707 }
5708 
5709 /*
5710  * heap_abort_speculative - kill a speculatively inserted tuple
5711  *
5712  * Marks a tuple that was speculatively inserted in the same command as dead,
5713  * by setting its xmin as invalid. That makes it immediately appear as dead
5714  * to all transactions, including our own. In particular, it makes
5715  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5716  * inserting a duplicate key value won't unnecessarily wait for our whole
5717  * transaction to finish (it'll just wait for our speculative insertion to
5718  * finish).
5719  *
5720  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5721  * that arise due to a mutual dependency that is not user visible. By
5722  * definition, unprincipled deadlocks cannot be prevented by the user
5723  * reordering lock acquisition in client code, because the implementation level
5724  * lock acquisitions are not under the user's direct control. If speculative
5725  * inserters did not take this precaution, then under high concurrency they
5726  * could deadlock with each other, which would not be acceptable.
5727  *
5728  * This is somewhat redundant with heap_delete, but we prefer to have a
5729  * dedicated routine with stripped down requirements. Note that this is also
5730  * used to delete the TOAST tuples created during speculative insertion.
5731  *
5732  * This routine does not affect logical decoding as it only looks at
5733  * confirmation records.
      *
      * NOTE(review): this listing skips its own lines 5736, 5738, 5752, 5776,
      * 5784, 5796, 5804-5805, 5812, 5831 and 5833 inside this function.
      * Presumably 5736 carries the function name and the "relation" and "tid"
      * parameters, and 5738 declares the "xid" variable used below; the
      * statements that actually store xmax and invalidate xmin, which the
      * comments at listing lines 5803 and 5808 announce, are also not shown.
      * Confirm against the upstream file before relying on this text.
5734  */
5735 void
5737 {
5739  ItemId lp;
5740  HeapTupleData tp;
5741  Page page;
5742  BlockNumber block;
5743  Buffer buffer;
5744  TransactionId prune_xid;
5745 
5746  Assert(ItemPointerIsValid(tid));
5747 
5748  block = ItemPointerGetBlockNumber(tid);
5749  buffer = ReadBuffer(relation, block);
5750  page = BufferGetPage(buffer);
5751 
5753 
5754  /*
5755  * Page can't be all visible, we just inserted into it, and are still
5756  * running.
5757  */
5758  Assert(!PageIsAllVisible(page));
5759 
5760  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5761  Assert(ItemIdIsNormal(lp));
5762 
5763  tp.t_tableOid = RelationGetRelid(relation);
5764  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5765  tp.t_len = ItemIdGetLength(lp);
5766  tp.t_self = *tid;
5767 
5768  /*
5769  * Sanity check that the tuple really is a speculatively inserted tuple,
5770  * inserted by us.
5771  */
5772  if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5773  elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5774  if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5775  elog(ERROR, "attempted to kill a non-speculative tuple");
5777 
5778  /*
5779  * No need to check for serializable conflicts here. There is never a
5780  * need for a combo CID, either. No need to extract replica identity, or
5781  * do anything special with infomask bits.
5782  */
5783 
5785 
5786  /*
5787  * The tuple will become DEAD immediately. Flag that this page is a
5788  * candidate for pruning by passing TransactionXmin to PageSetPrunable().
5789  * While not immediately prunable, it is the oldest xid we can cheaply
5790  * determine that's safe against wraparound / being older than the table's
5791  * relfrozenxid. To defend against the unlikely case of a new relation
5792  * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
5793  * if so (vacuum can't subsequently move relfrozenxid to beyond
5794  * TransactionXmin, so there's no race here).
5795  */
5797  if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
5798  prune_xid = relation->rd_rel->relfrozenxid;
5799  else
5800  prune_xid = TransactionXmin;
5801  PageSetPrunable(page, prune_xid);
5802 
5803  /* store transaction information of xact deleting the tuple */
5806 
5807  /*
5808  * Set the tuple header xmin to InvalidTransactionId. This makes the
5809  * tuple immediately invisible to everyone. (In particular, to any
5810  * transactions waiting on the speculative token, woken up later.)
5811  */
5813 
5814  /* Clear the speculative insertion token too */
5815  tp.t_data->t_ctid = tp.t_self;
5816 
5817  MarkBufferDirty(buffer);
5818 
5819  /*
5820  * XLOG stuff
5821  *
5822  * The WAL records generated here match heap_delete(). The same recovery
5823  * routines are used.
5824  */
5825  if (RelationNeedsWAL(relation))
5826  {
5827  xl_heap_delete xlrec;
5828  XLogRecPtr recptr;
5829 
5830  xlrec.flags = XLH_DELETE_IS_SUPER;
5832  tp.t_data->t_infomask2);
5834  xlrec.xmax = xid;
5835 
5836  XLogBeginInsert();
5837  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5838  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5839 
5840  /* No replica identity & replication origin logged */
5841 
5842  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5843 
5844  PageSetLSN(page, recptr);
5845  }
5846 
5847  END_CRIT_SECTION();
5848 
      /* The content lock is dropped here; the pin is kept until ReleaseBuffer(). */
5849  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5850 
5851  if (HeapTupleHasExternal(&tp))
5852  {
5853  Assert(!IsToastRelation(relation));
5854  heap_toast_delete(relation, &tp, true);
5855  }
5856 
5857  /*
5858  * Never need to mark tuple for invalidation, since catalogs don't support
5859  * speculative insertion
5860  */
5861 
5862  /* Now we can release the buffer */
5863  ReleaseBuffer(buffer);
5864 
5865  /* count deletion, as we counted the insertion too */
5866  pgstat_count_heap_delete(relation);
5867 }
5868 
5869 /*
5870  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5871  *
5872  * Overwriting violates both MVCC and transactional safety, so the uses
5873  * of this function in Postgres are extremely limited. Nonetheless we
5874  * find some places to use it.
5875  *
5876  * The tuple cannot change size, and therefore it's reasonable to assume
5877  * that its null bitmap (if any) doesn't change either. So we just
5878  * overwrite the data portion of the tuple without touching the null
5879  * bitmap or any of the header fields.
5880  *
5881  * tuple is an in-memory tuple structure containing the data to be written
5882  * over the target tuple. Also, tuple->t_self identifies the target tuple.
5883  *
5884  * Note that the tuple updated here had better not come directly from the
5885  * syscache if the relation has a toast relation as this tuple could
5886  * include toast values that have been expanded, causing a failure here.
5887  */
5888 void
     /*
      * NOTE(review): listing number 5889 is absent from this extraction. It
      * presumably carried the function name and parameter list (the body below
      * uses "relation" and "tuple") -- confirm against the upstream file.
      */
5890 {
5891  Buffer buffer;
5892  Page page;
5893  OffsetNumber offnum;
      /* Stays NULL when offnum lies beyond the page's last line pointer */
5894  ItemId lp = NULL;
5895  HeapTupleHeader htup;
      /* Lengths of the data portions only, i.e. with the header (t_hoff) excluded */
5896  uint32 oldlen;
5897  uint32 newlen;
5898 
5899  /*
5900  * For now, we don't allow parallel updates. Unlike a regular update,
5901  * this should never create a combo CID, so it might be possible to relax
5902  * this restriction, but not without more thought and testing. It's not
5903  * clear that it would be useful, anyway.
5904  */
5905  if (IsInParallelMode())
5906  ereport(ERROR,
5907  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5908  errmsg("cannot update tuples during a parallel operation")));
5909 
      /* Read the heap block named by the target tuple's TID */
5910  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
      /*
       * NOTE(review): listing number 5911 is absent here. Presumably it takes
       * the buffer lock that UnlockReleaseBuffer() drops further down --
       * confirm against the upstream file.
       */
5912  page = (Page) BufferGetPage(buffer);
5913 
5914  offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
      /* Only fetch the line pointer when offnum is within the page's range */
5915  if (PageGetMaxOffsetNumber(page) >= offnum)
5916  lp = PageGetItemId(page, offnum);
5917 
      /*
       * The range test is evaluated first, so a still-NULL lp is never handed
       * to ItemIdIsNormal.
       */
5918  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5919  elog(ERROR, "invalid lp");
5920 
5921  htup = (HeapTupleHeader) PageGetItem(page, lp);
5922 
      /*
       * Both the data length and the header size of the replacement must match
       * the tuple that is currently on the page.
       */
5923  oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5924  newlen = tuple->t_len - tuple->t_data->t_hoff;
5925  if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5926  elog(ERROR, "wrong tuple length");
5927 
5928  /* NO EREPORT(ERROR) from here till changes are logged */
      /*
       * NOTE(review): listing number 5929 is absent here. Presumably it opens
       * the critical section that END_CRIT_SECTION() closes below -- confirm.
       */
5930 
      /* Overwrite only the bytes that follow the on-page tuple header */
5931  memcpy((char *) htup + htup->t_hoff,
5932  (char *) tuple->t_data + tuple->t_data->t_hoff,
5933  newlen);
5934 
5935  MarkBufferDirty(buffer);
5936 
5937  /* XLOG stuff */
5938  if (RelationNeedsWAL(relation))
5939  {
5940  xl_heap_inplace xlrec;
5941  XLogRecPtr recptr;
5942 
5943  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5944 
5945  XLogBeginInsert();
5946  XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5947 
      /* Register the buffer, then the just-written data portion of the tuple */
5948  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5949  XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5950 
5951  /* inplace updates aren't decoded atm, don't log the origin */
5952 
5953  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5954 
5955  PageSetLSN(page, recptr);
5956  }
5957 
5958  END_CRIT_SECTION();
5959 
5960  UnlockReleaseBuffer(buffer);
5961 
5962  /*
5963  * Send out shared cache inval if necessary. Note that because we only
5964  * pass the new version of the tuple, this mustn't be used for any
5965  * operations that could change catcache lookup keys. But we aren't
5966  * bothering with index updates either, so that's true a fortiori.
5967  */
      /*
       * NOTE(review): listing number 5968 is absent here. Given the "if
       * necessary" wording above, it presumably holds a condition guarding the
       * call below -- confirm against the upstream file.
       */
5969  CacheInvalidateHeapTuple(relation, tuple, NULL);
5970 }
5971 
/*
 * Bits that FreezeMultiXactId reports through its "flags" output argument.
 * Each value is a distinct bit; callers test them with "&".
 */
/* keep the existing Xmax, nothing to do */
5972 #define FRM_NOOP 0x0001
/* mark Xmax as InvalidTransactionId and set the XMAX_INVALID flag */
5973 #define FRM_INVALIDATE_XMAX 0x0002
/* the return value is a single update Xid to set as xmax */
5974 #define FRM_RETURN_IS_XID 0x0004
/* the return value is a new MultiXactId to set as xmax */
5975 #define FRM_RETURN_IS_MULTI 0x0008
/* Xmax can be marked as HEAP_XMAX_COMMITTED */
5976 #define FRM_MARK_COMMITTED 0x0010
5977 
5978 /*
5979  * FreezeMultiXactId
5980  * Determine what to do during freezing when a tuple is marked by a
5981  * MultiXactId.
5982  *
5983  * "flags" is an output value; it's used to tell caller what to do on return.
5984  * "pagefrz" is an input/output value, used to manage page level freezing.
5985  *
5986  * Possible values that we can set in "flags":
5987  * FRM_NOOP
5988  * don't do anything -- keep existing Xmax
5989  * FRM_INVALIDATE_XMAX
5990  * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
5991  * FRM_RETURN_IS_XID
5992  * The Xid return value is a single update Xid to set as xmax.
5993  * FRM_MARK_COMMITTED
5994  * Xmax can be marked as HEAP_XMAX_COMMITTED
5995  * FRM_RETURN_IS_MULTI
5996  * The return value is a new MultiXactId to set as new Xmax.
5997  * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
5998  *
5999  * Caller delegates control of page freezing to us. In practice we always
6000  * force freezing of caller's page unless FRM_NOOP processing is indicated.
6001  * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
6002  * can never be left behind. We freely choose when and how to process each
6003  * Multi, without ever violating the cutoff postconditions for freezing.
6004  *
6005  * It's useful to remove Multis on a proactive timeline (relative to freezing
6006  * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also
6007  * be cheaper in the short run, for us, since we too can avoid SLRU buffer
6008  * misses through eager processing.
6009  *
6010  * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
6011  * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
6012  * This can usually be put off, which is usually enough to avoid it altogether.
6013  * Allocating new multis during VACUUM should be avoided on general principle;
6014  * only VACUUM can advance relminmxid, so allocating new Multis here comes with
6015  * its own special risks.
6016  *
6017  * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
6018  * using heap_tuple_should_freeze when we haven't forced page-level freezing.
6019  *
6020  * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
6021  * have already forced page-level freezing, since that might incur the same
6022  * SLRU buffer misses that we specifically intended to avoid by freezing.
6023  */
6024 static TransactionId
     /*
      * NOTE(review): listing number 6025 is absent from this extraction. It
      * presumably carried the function name and its leading parameters (the
      * body below uses "multi" and "t_infomask") -- confirm against the
      * upstream file.
      */
6026  const struct VacuumCutoffs *cutoffs, uint16 *flags,
6027  HeapPageFreeze *pagefrz)
6028 {
6029  TransactionId newxmax;
6030  MultiXactMember *members;
6031  int nmembers;
6032  bool need_replace;
6033  int nnewmembers;
6034  MultiXactMember *newmembers;
6035  bool has_lockers;
6036  TransactionId update_xid;
6037  bool update_committed;
6038  TransactionId FreezePageRelfrozenXid;
6039 
      /* Start with no result bits; every return path below sets exactly one action */
6040  *flags = 0;
6041 
6042  /* We should only be called in Multis */
6043  Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6044 
6045  if (!MultiXactIdIsValid(multi) ||
6046  HEAP_LOCKED_UPGRADED(t_infomask))
6047  {
6048  *flags |= FRM_INVALIDATE_XMAX;
6049  pagefrz->freeze_required = true;
6050  return InvalidTransactionId;
6051  }
6052  else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6053  ereport(ERROR,
      /*
       * NOTE(review): listing number 6054 is absent here, and the same holds
       * for the line right after every other "ereport(ERROR," in this function
       * (6070, 6085, 6098, 6219, 6243, 6282). Each call closes one parenthesis
       * more than it visibly opens, so the lost line presumably holds an
       * "(errcode(...)," argument -- confirm against the upstream file.
       */
6055  errmsg_internal("found multixact %u from before relminmxid %u",
6056  multi, cutoffs->relminmxid)));
6057  else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6058  {
6059  TransactionId update_xact;
6060 
6061  /*
6062  * This old multi cannot possibly have members still running, but
6063  * verify just in case. If it was a locker only, it can be removed
6064  * without any further consideration; but if it contained an update,
6065  * we might need to preserve it.
6066  */
6067  if (MultiXactIdIsRunning(multi,
6068  HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6069  ereport(ERROR,
      /* NOTE(review): listing number 6070 is absent -- see the note at the first ereport above */
6071  errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6072  multi, cutoffs->OldestMxact)));
6073 
6074  if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6075  {
6076  *flags |= FRM_INVALIDATE_XMAX;
6077  pagefrz->freeze_required = true;
6078  return InvalidTransactionId;
6079  }
6080 
6081  /* replace multi with single XID for its updater? */
6082  update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6083  if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid))
6084  ereport(ERROR,
      /* NOTE(review): listing number 6085 is absent -- see the note at the first ereport above */
6086  errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6087  multi, update_xact,
6088  cutoffs->relfrozenxid)));
6089  else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6090  {
6091  /*
6092  * Updater XID has to have aborted (otherwise the tuple would have
6093  * been pruned away instead, since updater XID is < OldestXmin).
6094  * Just remove xmax.
6095  */
6096  if (TransactionIdDidCommit(update_xact))
6097  ereport(ERROR,
      /* NOTE(review): listing number 6098 is absent -- see the note at the first ereport above */
6099  errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6100  multi, update_xact,
6101  cutoffs->OldestXmin)));
6102  *flags |= FRM_INVALIDATE_XMAX;
6103  pagefrz->freeze_required = true;
6104  return InvalidTransactionId;
6105  }
6106 
6107  /* Have to keep updater XID as new xmax */
6108  *flags |= FRM_RETURN_IS_XID;
6109  pagefrz->freeze_required = true;
6110  return update_xact;
6111  }
6112 
6113  /*
6114  * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6115  * need to walk the whole members array to figure out what to do, if
6116  * anything.
6117  */
6118  nmembers =
6119  GetMultiXactIdMembers(multi, &members, false,
6120  HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6121  if (nmembers <= 0)
6122  {
6123  /* Nothing worth keeping */
6124  *flags |= FRM_INVALIDATE_XMAX;
6125  pagefrz->freeze_required = true;
6126  return InvalidTransactionId;
6127  }
6128 
6129  /*
6130  * The FRM_NOOP case is the only case where we might need to ratchet back
6131  * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6132  * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6133  * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6134  * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6135  * trackers managed by VACUUM being ratcheted back by xmax to the degree
6136  * required to make it safe to leave xmax undisturbed, independent of
6137  * whether or not page freezing is triggered somewhere else.
6138  *
6139  * Our policy is to force freezing in every case other than FRM_NOOP,
6140  * which obviates the need to maintain either set of trackers, anywhere.
6141  * Every other case will reliably execute a freeze plan for xmax that
6142  * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6143  * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6144  * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6145  * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6146  */
6147  need_replace = false;
      /*
       * Work on a local copy of the tracker; it is written back to pagefrz
       * only on the FRM_NOOP path below.
       */
6148  FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6149  for (int i = 0; i < nmembers; i++)
6150  {
6151  TransactionId xid = members[i].xid;
6152 
6153  Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6154 
6155  if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6156  {
6157  /* Can't violate the FreezeLimit postcondition */
6158  need_replace = true;
6159  break;
6160  }
      /* Track the oldest member XID seen so far */
6161  if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6162  FreezePageRelfrozenXid = xid;
6163  }
6164 
6165  /* Can't violate the MultiXactCutoff postcondition, either */
6166  if (!need_replace)
6167  need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
6168 
6169  if (!need_replace)
6170  {
6171  /*
6172  * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6173  * both together to make it safe to retain this particular multi after
6174  * freezing its page
6175  */
6176  *flags |= FRM_NOOP;
6177  pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6178  if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6179  pagefrz->FreezePageRelminMxid = multi;
6180  pfree(members);
      /* Note that freeze_required is left untouched on this path */
6181  return multi;
6182  }
6183 
6184  /*
6185  * Do a more thorough second pass over the multi to figure out which
6186  * member XIDs actually need to be kept. Checking the precise status of
6187  * individual members might even show that we don't need to keep anything.
6188  * That is quite possible even though the Multi must be >= OldestMxact,
6189  * since our second pass only keeps member XIDs when it's truly necessary;
6190  * even member XIDs >= OldestXmin often won't be kept by second pass.
6191  */
6192  nnewmembers = 0;
      /* Sized for the worst case in which every original member is kept */
6193  newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6194  has_lockers = false;
6195  update_xid = InvalidTransactionId;
6196  update_committed = false;
6197 
6198  /*
6199  * Determine whether to keep each member xid, or to ignore it instead
6200  */
6201  for (int i = 0; i < nmembers; i++)
6202  {
6203  TransactionId xid = members[i].xid;
6204  MultiXactStatus mstatus = members[i].status;
6205 
6206  Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6207 
6208  if (!ISUPDATE_from_mxstatus(mstatus))
6209  {
6210  /*
6211  * Locker XID (not updater XID). We only keep lockers that are
6212  * still running.
6213  */
      /*
       * NOTE(review): listing numbers 6214-6215 are absent here. Given the
       * comment above and the error text below, they presumably hold the
       * "is this locker still running?" condition that opens the following
       * block -- confirm against the upstream file.
       */
6216  {
6217  if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6218  ereport(ERROR,
      /* NOTE(review): listing number 6219 is absent -- see the note at the first ereport above */
6220  errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6221  multi, xid,
6222  cutoffs->OldestXmin)));
6223  newmembers[nnewmembers++] = members[i];
6224  has_lockers = true;
6225  }
6226 
6227  continue;
6228  }
6229 
6230  /*
6231  * Updater XID (not locker XID). Should we keep it?
6232  *
6233  * Since the tuple wasn't totally removed when vacuum pruned, the
6234  * update Xid cannot possibly be older than OldestXmin cutoff unless
6235  * the updater XID aborted. If the updater transaction is known
6236  * aborted or crashed then it's okay to ignore it, otherwise not.
6237  *
6238  * In any case the Multi should never contain two updaters, whatever
6239  * their individual commit status. Check for that first, in passing.
6240  */
6241  if (TransactionIdIsValid(update_xid))
6242  ereport(ERROR,
      /* NOTE(review): listing number 6243 is absent -- see the note at the first ereport above */
6244  errmsg_internal("multixact %u has two or more updating members",
6245  multi),
6246  errdetail_internal("First updater XID=%u second updater XID=%u.",
6247  update_xid, xid)));
6248 
6249  /*
6250  * As with all tuple visibility routines, it's critical to test
6251  * TransactionIdIsInProgress before TransactionIdDidCommit, because of
6252  * race conditions explained in detail in heapam_visibility.c.
6253  */
      /*
       * NOTE(review): listing numbers 6254-6255 are absent here. Given the
       * comment above and the "else if"/"else" arms below, they presumably
       * hold the leading "if" that tests whether the updater is still in
       * progress -- confirm against the upstream file.
       */
6256  update_xid = xid;
6257  else if (TransactionIdDidCommit(xid))
6258  {
6259  /*
6260  * The transaction committed, so we can tell caller to set
6261  * HEAP_XMAX_COMMITTED. (We can only do this because we know the
6262  * transaction is not running.)
6263  */
6264  update_committed = true;
6265  update_xid = xid;
6266  }
6267  else
6268  {
6269  /*
6270  * Not in progress, not committed -- must be aborted or crashed;
6271  * we can ignore it.
6272  */
6273  continue;
6274  }
6275 
6276  /*
6277  * We determined that updater must be kept -- add it to pending new
6278  * members list
6279  */
6280  if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6281  ereport(ERROR,
      /* NOTE(review): listing number 6282 is absent -- see the note at the first ereport above */
6283  errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6284  multi, xid, cutoffs->OldestXmin)));
6285  newmembers[nnewmembers++] = members[i];
6286  }
6287 
      /* The original member array is no longer needed; survivors live in newmembers */
6288  pfree(members);
6289 
6290  /*
6291  * Determine what to do with caller's multi based on information gathered
6292  * during our second pass
6293  */
6294  if (nnewmembers == 0)
6295  {
6296  /* Nothing worth keeping */
6297  *flags |= FRM_INVALIDATE_XMAX;
6298  newxmax = InvalidTransactionId;
6299  }
6300  else if (TransactionIdIsValid(update_xid) && !has_lockers)
6301  {
6302  /*
6303  * If there's a single member and it's an update, pass it back alone
6304  * without creating a new Multi. (XXX we could do this when there's a
6305  * single remaining locker, too, but that would complicate the API too
6306  * much; moreover, the case with the single updater is more
6307  * interesting, because those are longer-lived.)
6308  */
      /* No lockers were kept and two updaters raise an error, so one member remains */
6309  Assert(nnewmembers == 1);
6310  *flags |= FRM_RETURN_IS_XID;
6311  if (update_committed)
6312  *flags |= FRM_MARK_COMMITTED;
6313  newxmax = update_xid;
6314  }
6315  else
6316  {
6317  /*
6318  * Create a new multixact with the surviving members of the previous
6319  * one, to set as new Xmax in the tuple
6320  */
6321  newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6322  *flags |= FRM_RETURN_IS_MULTI;
6323  }
6324 
6325  pfree(newmembers);
6326 
      /* Every outcome of the second pass forces freezing of the page */
6327  pagefrz->freeze_required = true;
6328  return newxmax;
6329 }
6330 
6331 /*
6332  * heap_prepare_freeze_tuple
6333  *
6334  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6335  * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
6336  * setup enough state (in the *frz output argument) to enable caller to
6337  * process this tuple as part of freezing its page, and return true. Return
6338  * false if nothing can be changed about the tuple right now.
6339  *
6340  * Also sets *totally_frozen to true if the tuple will be totally frozen once
6341  * caller executes returned freeze plan (or if the tuple was already totally
6342  * frozen by an earlier VACUUM). This indicates that there are no remaining
6343  * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
6344  *
6345  * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
6346  * tuple that we returned true for, and call heap_freeze_execute_prepared to
6347  * execute freezing. Caller must initialize pagefrz fields for page as a
6348  * whole before first call here for each heap page.
6349  *
6350  * VACUUM caller decides on whether or not to freeze the page as a whole.
6351  * We'll often prepare freeze plans for a page that caller just discards.
6352  * However, VACUUM doesn't always get to make a choice; it must freeze when
6353  * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
6354  * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
6355  * that VACUUM always follows that rule.
6356  *
6357  * We sometimes force freezing of xmax MultiXactId values long before it is
6358  * strictly necessary to do so just to ensure the FreezeLimit postcondition.
6359  * It's worth processing MultiXactIds proactively when it is cheap to do so,
6360  * and it's convenient to make that happen by piggy-backing it on the "force
6361  * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
6362  * because it is expensive right now (though only when it's still possible to
6363  * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
6364  *
6365  * It is assumed that the caller has checked the tuple with
6366  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6367  * (else we should be removing the tuple, not freezing it).
6368  *
6369  * NB: This function has side effects: it might allocate a new MultiXactId.
6370  * It will be set as tuple's new xmax when our *frz output is processed within
6371  * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
6372  * then caller had better have an exclusive lock on it already.
6373  */
6374 bool
     /*
      * NOTE(review): listing number 6375 is absent from this extraction. It
      * presumably carried the function name and its first parameter (the body
      * below uses "tuple") -- confirm against the upstream file.
      */
6376  const struct VacuumCutoffs *cutoffs,
6377  HeapPageFreeze *pagefrz,
6378  HeapTupleFreeze *frz, bool *totally_frozen)
6379 {
6380  bool xmin_already_frozen = false,
6381  xmax_already_frozen = false;
6382  bool freeze_xmin = false,
6383  replace_xvac = false,
6384  replace_xmax = false,
6385  freeze_xmax = false;
6386  TransactionId xid;
6387 
      /*
       * Seed the freeze plan with the tuple's current xmax and infomask bits;
       * the steps below only modify the parts that need to change.
       */
6388  frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6389  frz->t_infomask2 = tuple->t_infomask2;
6390  frz->t_infomask = tuple->t_infomask;
6391  frz->frzflags = 0;
6392  frz->checkflags = 0;
6393 
6394  /*
6395  * Process xmin, while keeping track of whether it's already frozen, or
6396  * will become frozen iff our freeze plan is executed by caller (could be
6397  * neither).
6398  */
6399  xid = HeapTupleHeaderGetXmin(tuple);
6400  if (!TransactionIdIsNormal(xid))
6401  xmin_already_frozen = true;
6402  else
6403  {
6404  if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
6405  ereport(ERROR,
      /*
       * NOTE(review): listing number 6406 is absent here, and the same holds
       * for the line right after every other "ereport(ERROR," in this function
       * (6545, 6568). Each call closes one parenthesis more than it visibly
       * opens, so the lost line presumably holds an "(errcode(...),"
       * argument -- confirm against the upstream file.
       */
6407  errmsg_internal("found xmin %u from before relfrozenxid %u",
6408  xid, cutoffs->relfrozenxid)));
6409 
6410  /* Will set freeze_xmin flags in freeze plan below */
6411  freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
6412 
6413  /* Verify that xmin committed if and when freeze plan is executed */
6414  if (freeze_xmin)
      /*
       * NOTE(review): listing number 6415, the body of the "if" above, is
       * absent. Given the comment, it presumably records in frz->checkflags
       * that xmin must be verified at execution time -- confirm.
       */
6416  }
6417 
6418  /*
6419  * Old-style VACUUM FULL is gone, but we have to process xvac for as long
6420  * as we support having MOVED_OFF/MOVED_IN tuples in the database
6421  */
6422  xid = HeapTupleHeaderGetXvac(tuple);
6423  if (TransactionIdIsNormal(xid))
6424  {
      /*
       * NOTE(review): listing number 6425 is absent here; its content cannot
       * be determined from this listing.
       */
6426  Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
6427 
6428  /*
6429  * For Xvac, we always freeze proactively. This allows totally_frozen
6430  * tracking to ignore xvac.
6431  */
6432  replace_xvac = pagefrz->freeze_required = true;
6433 
6434  /* Will set replace_xvac flags in freeze plan below */
6435  }
6436 
6437  /* Now process xmax */
6438  xid = frz->xmax;
6439  if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6440  {
6441  /* Raw xmax is a MultiXactId */
6442  TransactionId newxmax;
6443  uint16 flags;
6444 
6445  /*
6446  * We will either remove xmax completely (in the "freeze_xmax" path),
6447  * process xmax by replacing it (in the "replace_xmax" path), or
6448  * perform no-op xmax processing. The only constraint is that the
6449  * FreezeLimit/MultiXactCutoff postcondition must never be violated.
6450  */
6451  newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
6452  &flags, pagefrz);
6453 
6454  if (flags & FRM_NOOP)
6455  {
6456  /*
6457  * xmax is a MultiXactId, and nothing about it changes for now.
6458  * This is the only case where 'freeze_required' won't have been
6459  * set for us by FreezeMultiXactId, as well as the only case where
6460  * neither freeze_xmax nor replace_xmax are set (given a multi).
6461  *
6462  * This is a no-op, but the call to FreezeMultiXactId might have
6463  * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
6464  * for us (the "freeze page" variants, specifically). That'll
6465  * make it safe for our caller to freeze the page later on, while
6466  * leaving this particular xmax undisturbed.
6467  *
6468  * FreezeMultiXactId is _not_ responsible for the "no freeze"
6469  * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
6470  * job. A call to heap_tuple_should_freeze for this same tuple
6471  * will take place below if 'freeze_required' isn't set already.
6472  * (This repeats work from FreezeMultiXactId, but allows "no
6473  * freeze" tracker maintenance to happen in only one place.)
6474  */
6475  Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff));
6476  Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
6477  }
6478  else if (flags & FRM_RETURN_IS_XID)
6479  {
6480  /*
6481  * xmax will become an updater Xid (original MultiXact's updater
6482  * member Xid will be carried forward as a simple Xid in Xmax).
6483  */
6484  Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin));
6485 
6486  /*
6487  * NB -- some of these transformations are only valid because we
6488  * know the return Xid is a tuple updater (i.e. not merely a
6489  * locker.) Also note that the only reason we don't explicitly
6490  * worry about HEAP_KEYS_UPDATED is because it lives in
6491  * t_infomask2 rather than t_infomask.
6492  */
6493  frz->t_infomask &= ~HEAP_XMAX_BITS;
6494  frz->xmax = newxmax;
6495  if (flags & FRM_MARK_COMMITTED)
      /*
       * NOTE(review): listing number 6496, the body of the "if" above, is
       * absent. As extracted, the "if" appears to govern the assignment to
       * replace_xmax below, which is almost certainly not the intent; the
       * lost line presumably sets the committed hint bit in frz->t_infomask
       * and replace_xmax is set unconditionally -- confirm.
       */
6497  replace_xmax = true;
6498  }
6499  else if (flags & FRM_RETURN_IS_MULTI)
6500  {
6501  uint16 newbits;
6502  uint16 newbits2;
6503 
6504  /*
6505  * xmax is an old MultiXactId that we have to replace with a new
6506  * MultiXactId, to carry forward two or more original member XIDs.
6507  */
6508  Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact));
6509 
6510  /*
6511  * We can't use GetMultiXactIdHintBits directly on the new multi
6512  * here; that routine initializes the masks to all zeroes, which
6513  * would lose other bits we need. Doing it this way ensures all
6514  * unrelated bits remain untouched.
6515  */
6516  frz->t_infomask &= ~HEAP_XMAX_BITS;
6517  frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6518  GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6519  frz->t_infomask |= newbits;
6520  frz->t_infomask2 |= newbits2;
6521  frz->xmax = newxmax;
6522  replace_xmax = true;
6523  }
6524  else
6525  {
6526  /*
6527  * Freeze plan for tuple "freezes xmax" in the strictest sense:
6528  * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
6529  */
6530  Assert(flags & FRM_INVALIDATE_XMAX);
6531  Assert(!TransactionIdIsValid(newxmax));
6532 
6533  /* Will set freeze_xmax flags in freeze plan below */
6534  freeze_xmax = true;
6535  }
6536 
6537  /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
6538  Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
6539  }
6540  else if (TransactionIdIsNormal(xid))
6541  {
6542  /* Raw xmax is normal XID */
6543  if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
6544  ereport(ERROR,
      /* NOTE(review): listing number 6545 is absent -- see the note at the first ereport above */
6546  errmsg_internal("found xmax %u from before relfrozenxid %u",
6547  xid, cutoffs->relfrozenxid)));
6548 
6549  /* Will set freeze_xmax flags in freeze plan below */
6550  freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
6551 
6552  /*
6553  * Verify that xmax aborted if and when freeze plan is executed,
6554  * provided it's from an update. (A lock-only xmax can be removed
6555  * independent of this, since the lock is released at xact end.)
6556  */
6557  if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
      /*
       * NOTE(review): listing number 6558, the body of the "if" above, is
       * absent. Given the comment, it presumably records in frz->checkflags
       * that xmax must be verified at execution time -- confirm.
       */
6559  }
6560  else if (!TransactionIdIsValid(xid))
6561  {
6562  /* Raw xmax is InvalidTransactionId XID */
6563  Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
6564  xmax_already_frozen = true;
6565  }
6566  else
6567  ereport(ERROR,
      /* NOTE(review): listing number 6568 is absent -- see the note at the first ereport above */
6569  errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
6570  xid, tuple->t_infomask)));
6571 
      /* Translate the decisions made above into the freeze plan stored in *frz */
6572  if (freeze_xmin)
6573  {
6574  Assert(!xmin_already_frozen);
6575 
6576  frz->t_infomask |= HEAP_XMIN_FROZEN;
6577  }
6578  if (replace_xvac)
6579  {
6580  /*
6581  * If a MOVED_OFF tuple is not dead, the xvac transaction must have
6582  * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
6583  * transaction succeeded.
6584  */
6585  Assert(pagefrz->freeze_required);
6586  if (tuple->t_infomask & HEAP_MOVED_OFF)
6587  frz->frzflags |= XLH_INVALID_XVAC;
6588  else
6589  frz->frzflags |= XLH_FREEZE_XVAC;
6590  }
6591  if (replace_xmax)
6592  {
6593  Assert(!xmax_already_frozen && !freeze_xmax);
6594  Assert(pagefrz->freeze_required);
6595 
6596  /* Already set replace_xmax flags in freeze plan earlier */
6597  }
6598  if (freeze_xmax)
6599  {
6600  Assert(!xmax_already_frozen && !replace_xmax);
6601 
6602  frz->xmax = InvalidTransactionId;
6603 
6604  /*
6605  * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6606  * LOCKED. Normalize to INVALID just to be sure no one gets confused.
6607  * Also get rid of the HEAP_KEYS_UPDATED bit.
6608  */
6609  frz->t_infomask &= ~HEAP_XMAX_BITS;
6610  frz->t_infomask |= HEAP_XMAX_INVALID;
6611  frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6612  frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6613  }
6614 
6615  /*
6616  * Determine if this tuple is already totally frozen, or will become
6617  * totally frozen (provided caller executes freeze plans for the page)
6618  */
6619  *totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
6620  (freeze_xmax || xmax_already_frozen));
6621 
      /* Skipped when freezing is already mandatory or the tuple is already fully frozen */
6622  if (!pagefrz->freeze_required && !(xmin_already_frozen &&
6623  xmax_already_frozen))
6624  {
6625  /*
6626  * So far no previous tuple from the page made freezing mandatory.
6627  * Does this tuple force caller to freeze the entire page?
6628  */
6629  pagefrz->freeze_required =
6630  heap_tuple_should_freeze(tuple, cutoffs,
6631  &pagefrz->NoFreezePageRelfrozenXid,
6632  &pagefrz->NoFreezePageRelminMxid);
6633  }
6634 
6635  /* Tell caller if this tuple has a usable freeze plan set in *frz */
6636  return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax;
6637 }
6638 
6639 /*
6640  * heap_execute_freeze_tuple
6641  * Execute the prepared freezing of a tuple with caller's freeze plan.
6642  *
6643  * Caller is responsible for ensuring that no other backend can access the
6644  * storage underlying this tuple, either by holding an exclusive lock on the
6645  * buffer containing it (which is what lazy VACUUM does), or by having it be
6646  * in private storage (which is what CLUSTER and friends do).
6647  */
6648 static inline void
     /*
      * NOTE(review): listing number 6649 is absent from this extraction. It
      * presumably carried the function name and parameter list (the body below
      * uses "tuple" and "frz") -- confirm against the upstream file.
      */
6650 {
      /* Install the xmax chosen by the freeze plan */
6651  HeapTupleHeaderSetXmax(tuple, frz->xmax);
6652 
6653  if (frz->frzflags & XLH_FREEZE_XVAC)
      /*
       * NOTE(review): listing number 6654, the body of the "if" above, is
       * absent; presumably it rewrites the tuple's xvac for the "freeze"
       * case -- confirm against the upstream file.
       */
6655 
6656  if (frz->frzflags & XLH_INVALID_XVAC)
      /*
       * NOTE(review): listing number 6657, the body of the "if" above, is
       * absent. As extracted, the "if" appears to govern the t_infomask
       * assignment below, which is presumably unconditional -- confirm.
       */
6658 
      /* Copy the planned infomask bits into the tuple header */
6659  tuple->t_infomask = frz->t_infomask;
6660  tuple->t_infomask2 = frz->t_infomask2;
6661 }
6662 
6663 /*
6664  * heap_freeze_execute_prepared
6665  *
6666  * Executes freezing of one or more heap tuples on a page on behalf of caller.
6667  * Caller passes an array of tuple plans from heap_prepare_freeze_tuple.
6668  * Caller must set 'offset' in each plan for us. Note that we destructively
6669  * sort caller's tuples array in-place, so caller had better be done with it.
6670  *
6671  * WAL-logs the changes so that VACUUM can advance the rel's relfrozenxid
6672  * later on without any risk of unsafe pg_xact lookups, even following a hard
6673  * crash (or when querying from a standby). We represent freezing by setting
6674  * infomask bits in tuple headers, but this shouldn't be thought of as a hint.
6675  * See section on buffer access rules in src/backend/storage/buffer/README.
6676  */
6677 void
     /*
      * NOTE(review): listing number 6678 is absent from this extraction. It
      * presumably carried the function name and its leading parameters (the
      * body below uses "rel" and "buffer") -- confirm against the upstream file.
      */
6679  TransactionId snapshotConflictHorizon,
6680  HeapTupleFreeze *tuples, int ntuples)
6681 {
6682  Page page = BufferGetPage(buffer);
6683 
6684  Assert(ntuples > 0);
6685 
6686  /*
6687  * Perform xmin/xmax XID status sanity checks before critical section.
6688  *
6689  * heap_prepare_freeze_tuple doesn't perform these checks directly because
6690  * pg_xact lookups are relatively expensive. They shouldn't be repeated
6691  * by successive VACUUMs that each decide against freezing the same page.
6692  */
6693  for (int i = 0; i < ntuples; i++)
6694  {
6695  HeapTupleFreeze *frz = tuples + i;
6696  ItemId itemid = PageGetItemId(page, frz->offset);
6697  HeapTupleHeader htup;
6698 
6699  htup = (HeapTupleHeader) PageGetItem(page, itemid);
6700 
6701  /* Deliberately avoid relying on tuple hint bits here */
      /*
       * NOTE(review): listing numbers 6702, 6704 and 6706 are absent around
       * the block below. 6702 presumably holds the condition that opens the
       * block and 6704 the declaration of "xmin" used in it; the content of
       * 6706 cannot be determined from this listing -- confirm.
       */
6703  {
6705 
6707  if (unlikely(!TransactionIdDidCommit(xmin)))
6708  ereport(ERROR,
      /*
       * NOTE(review): listing number 6709 is absent here (and 6726 in the
       * ereport further down). Each call closes one parenthesis more than it
       * visibly opens, so the lost line presumably holds an "(errcode(...),"
       * argument -- confirm against the upstream file.
       */
6710  errmsg_internal("uncommitted xmin %u needs to be frozen",
6711  xmin)));
6712  }
6713 
6714  /*
6715  * TransactionIdDidAbort won't work reliably in the presence of XIDs
6716  * left behind by transactions that were in progress during a crash,
6717  * so we can only check that xmax didn't commit
6718  */
      /*
       * NOTE(review): listing numbers 6719, 6721 and 6723 are absent around
       * the block below. 6719 presumably holds the condition that opens the
       * block and 6721 the declaration of "xmax" used in it; the content of
       * 6723 cannot be determined from this listing -- confirm.
       */
6720  {
6722 
6724  if (unlikely(TransactionIdDidCommit(xmax)))
6725  ereport(ERROR,
      /* NOTE(review): listing number 6726 is absent -- see the note at the first ereport above */
6727  errmsg_internal("cannot freeze committed xmax %u",
6728  xmax)));
6729  }
6730  }
6731 
      /*
       * NOTE(review): listing number 6732 is absent here. Since the checks
       * above are described as running "before critical section", it
       * presumably opens the critical section that END_CRIT_SECTION() closes
       * at the end of the function -- confirm.
       */
6733 
      /* Apply each prepared freeze plan to its tuple on the page */
6734  for (int i = 0; i < ntuples; i++)
6735  {
6736  HeapTupleFreeze *frz = tuples + i;
6737  ItemId itemid = PageGetItemId(page, frz->offset);
6738  HeapTupleHeader htup;
6739 
6740  htup = (HeapTupleHeader) PageGetItem(page, itemid);
6741  heap_execute_freeze_tuple(htup, frz);
6742  }
6743 
6744  MarkBufferDirty(buffer);
6745 
6746  /* Now WAL-log freezing if necessary */
6747  if (RelationNeedsWAL(rel))
6748  {
      /*
       * NOTE(review): listing numbers 6749-6750 are absent here. They
       * presumably declare the "plans" and "offsets" arrays used below --
       * confirm against the upstream file.
       */
6751  int nplans;
6752  xl_heap_freeze_page xlrec;
6753  XLogRecPtr recptr;
6754 
6755  /* Prepare deduplicated representation for use in WAL record */
6756  nplans = heap_log_freeze_plan(tuples, ntuples, plans, offsets);
6757 
6758  xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
      /*
       * NOTE(review): listing number 6759 is absent here; presumably it sets
       * another field of xlrec -- confirm against the upstream file.
       */
6760  xlrec.nplans = nplans;
6761 
6762  XLogBeginInsert();
6763  XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
6764 
6765  /*
6766  * The freeze plan array and offset array are not actually in the
6767  * buffer, but pretend that they are. When XLogInsert stores the
6768  * whole buffer, the arrays need not be stored too.
6769  */
6770  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
      /* nplans deduplicated plans, followed by one page offset per frozen tuple */
6771  XLogRegisterBufData(0, (char *) plans,
6772  nplans * sizeof(xl_heap_freeze_plan));
6773  XLogRegisterBufData(0, (char *) offsets,
6774  ntuples * sizeof(OffsetNumber));
6775 
6776  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
6777 
6778  PageSetLSN(page, recptr);
6779  }
6780 
6781  END_CRIT_SECTION();
6782 }
6783 
6784 /*
6785  * Comparator used to deduplicate XLOG_HEAP2_FREEZE_PAGE freeze plans
6786  */
6787 static int
6788 heap_log_freeze_cmp(const void *arg1, const void *arg2)
6789 {
6790  HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1;
6791  HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2;
6792 
6793  if (frz1->xmax < frz2->xmax)
6794  return -1;
6795  else if (frz1->xmax > frz2->xmax)
6796  return 1;
6797 
6798  if (frz1->t_infomask2 < frz2->t_infomask2)
6799  return -1;
6800  else if (frz1->t_infomask2 > frz2->t_infomask2)
6801  return 1;
6802 
6803  if (frz1->t_infomask < frz2->t_infomask)
6804  return -1;
6805  else if (frz1->t_infomask > frz2->t_infomask)
6806  return 1;
6807 
6808  if (frz1->frzflags < frz2->frzflags)
6809  return -1;
6810  else if (frz1->frzflags > frz2->frzflags)
6811  return 1;
6812 
6813  /*
6814  * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
6815  * (So the tuples will share a single canonical freeze plan.)
6816  *
6817  * We tiebreak on page offset number to keep each freeze plan's page
6818  * offset number array individually sorted. (Unnecessary, but be tidy.)
6819  */
6820  if (frz1->offset < frz2->offset)
6821  return -1;
6822  else if (frz1->offset > frz2->offset)
6823  return 1;
6824 
6825  Assert(false);
6826  return 0;
6827 }
6828 
6829 /*
6830  * Compare fields that describe actions required to freeze tuple with caller's
6831  * open plan. If everything matches then the frz tuple plan is equivalent to
6832  * caller's plan.
6833  */
6834 static inline bool
6836 {
6837  if (plan->xmax == frz->xmax &&
6838  plan->t_infomask2 == frz->t_infomask2 &&
6839  plan->t_infomask == frz->t_infomask &&
6840  plan->frzflags == frz->frzflags)
6841  return true;
6842 
6843  /* Caller must call heap_log_freeze_new_plan again for frz */
6844  return false;
6845 }
6846 
6847 /*
6848  * Start new plan initialized using tuple-level actions. At least one tuple
6849  * will have steps required to freeze described by caller's plan during REDO.
6850  */
6851 static inline void
6853 {
6854  plan->xmax = frz->xmax;
6855  plan->t_infomask2 = frz->t_infomask2;
6856  plan->t_infomask = frz->t_infomask;
6857  plan->frzflags = frz->frzflags;
6858  plan->ntuples = 1; /* for now */
6859 }
6860 
6861 /*
6862  * Deduplicate tuple-based freeze plans so that each distinct set of
6863  * processing steps is only stored once in XLOG_HEAP2_FREEZE_PAGE records.
6864  * Called during original execution of freezing (for logged relations).
6865  *
6866  * Return value is number of plans set in *plans_out for caller. Also writes
6867  * an array of offset numbers into *offsets_out output argument for caller
6868  * (actually there is one array per freeze plan, but that's not of immediate
6869  * concern to our caller).
6870  */
6871 static int
6873  xl_heap_freeze_plan *plans_out,
6874  OffsetNumber *offsets_out)
6875 {
6876  int nplans = 0;
6877 
6878  /* Sort tuple-based freeze plans in the order required to deduplicate */
6879  qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
6880 
6881  for (int i = 0; i < ntuples; i++)
6882  {
6883  HeapTupleFreeze *frz = tuples + i;
6884 
6885  if (i == 0)
6886  {
6887  /* New canonical freeze plan starting with first tup */
6888  heap_log_freeze_new_plan(plans_out, frz);
6889  nplans++;
6890  }
6891  else if (heap_log_freeze_eq(plans_out, frz))
6892  {
6893  /* tup matches open canonical plan -- include tup in it */
6894  Assert(offsets_out[i - 1] < frz->offset);
6895  plans_out->ntuples++;
6896  }
6897  else
6898  {
6899  /* Tup doesn't match current plan -- done with it now */
6900  plans_out++;
6901 
6902  /* New canonical freeze plan starting with this tup */
6903  heap_log_freeze_new_plan(plans_out, frz);
6904  nplans++;
6905  }
6906 
6907  /*
6908  * Save page offset number in dedicated buffer in passing.
6909  *
6910  * REDO routine relies on the record's offset numbers array grouping
6911  * offset numbers by freeze plan. The sort order within each grouping
6912  * is ascending offset number order, just to keep things tidy.
6913  */
6914  offsets_out[i] = frz->offset;
6915  }
6916 
6917  Assert(nplans > 0 && nplans <= ntuples);
6918 
6919  return nplans;
6920 }
6921 
6922 /*
6923  * heap_freeze_tuple
6924  * Freeze tuple in place, without WAL logging.
6925  *
6926  * Useful for callers like CLUSTER that perform their own WAL logging.
6927  */
6928 bool
6930  TransactionId relfrozenxid, TransactionId relminmxid,
6931  TransactionId FreezeLimit, TransactionId MultiXactCutoff)
6932 {
6933  HeapTupleFreeze frz;
6934  bool do_freeze;
6935  bool totally_frozen;
6936  struct VacuumCutoffs cutoffs;
6937  HeapPageFreeze pagefrz;
6938 
6939  cutoffs.relfrozenxid = relfrozenxid;
6940  cutoffs.relminmxid = relminmxid;
6941  cutoffs.OldestXmin = FreezeLimit;
6942  cutoffs.OldestMxact = MultiXactCutoff;
6943  cutoffs.FreezeLimit = FreezeLimit;
6944  cutoffs.MultiXactCutoff = MultiXactCutoff;
6945 
6946  pagefrz.freeze_required = true;
6947  pagefrz.FreezePageRelfrozenXid = FreezeLimit;
6948  pagefrz.FreezePageRelminMxid = MultiXactCutoff;
6949  pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
6950  pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
6951 
6952  do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
6953  &pagefrz, &frz, &totally_frozen);
6954 
6955  /*
6956  * Note that because this is not a WAL-logged operation, we don't need to
6957  * fill in the offset in the freeze record.
6958  */
6959 
6960  if (do_freeze)
6961  heap_execute_freeze_tuple(tuple, &frz);
6962  return do_freeze;
6963 }
6964 
6965 /*
6966  * For a given MultiXactId, return the hint bits that should be set in the
6967  * tuple's infomask.
6968  *
6969  * Normally this should be called for a multixact that was just created, and
6970  * so is on our local cache, so the GetMembers call is fast.
6971  */
6972 static void
6974  uint16 *new_infomask2)
6975 {
6976  int nmembers;
6977  MultiXactMember *members;
6978  int i;
6979  uint16 bits = HEAP_XMAX_IS_MULTI;
6980  uint16 bits2 = 0;
6981  bool has_update = false;
6982  LockTupleMode strongest = LockTupleKeyShare;
6983 
6984  /*
6985  * We only use this in multis we just created, so they cannot be values
6986  * pre-pg_upgrade.
6987  */
6988  nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6989 
6990  for (i = 0; i < nmembers; i++)
6991  {
6993 
6994  /*
6995  * Remember the strongest lock mode held by any member of the
6996  * multixact.
6997  */
6998  mode = TUPLOCK_from_mxstatus(members[i].status);
6999  if (mode > strongest)
7000  strongest = mode;
7001 
7002  /* See what other bits we need */
7003  switch (members[i].status)
7004  {
7008  break;
7009 
7011  bits2 |= HEAP_KEYS_UPDATED;
7012  break;
7013 
7015  has_update = true;
7016  break;
7017 
7018  case MultiXactStatusUpdate:
7019  bits2 |= HEAP_KEYS_UPDATED;
7020  has_update = true;
7021  break;
7022  }
7023  }
7024 
7025  if (strongest == LockTupleExclusive ||
7026  strongest == LockTupleNoKeyExclusive)
7027  bits |= HEAP_XMAX_EXCL_LOCK;
7028  else if (strongest == LockTupleShare)
7029  bits |= HEAP_XMAX_SHR_LOCK;
7030  else if (strongest == LockTupleKeyShare)
7031  bits |= HEAP_XMAX_KEYSHR_LOCK;
7032 
7033  if (!has_update)
7034  bits |= HEAP_XMAX_LOCK_ONLY;
7035 
7036  if (nmembers > 0)
7037  pfree(members);
7038 
7039  *new_infomask = bits;
7040  *new_infomask2 = bits2;
7041 }
7042 
7043 /*
7044  * MultiXactIdGetUpdateXid
7045  *
7046  * Given a multixact Xmax and corresponding infomask, which does not have the
7047  * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7048  * transaction.
7049  *
7050  * Caller is expected to check the status of the updating transaction, if
7051  * necessary.
7052  */
7053 static TransactionId
7055 {
7056  TransactionId update_xact = InvalidTransactionId;
7057  MultiXactMember *members;
7058  int nmembers;
7059 
7060  Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7061  Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7062 
7063  /*
7064  * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7065  * pre-pg_upgrade.
7066  */
7067  nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7068 
7069  if (nmembers > 0)
7070  {
7071  int i;
7072 
7073  for (i = 0; i < nmembers; i++)
7074  {
7075  /* Ignore lockers */
7076  if (!ISUPDATE_from_mxstatus(members[i].status))
7077  continue;
7078 
7079  /* there can be at most one updater */
7080  Assert(update_xact == InvalidTransactionId);
7081  update_xact = members[i].xid;
7082 #ifndef USE_ASSERT_CHECKING
7083 
7084  /*
7085  * in an assert-enabled build, walk the whole array to ensure
7086  * there's no other updater.
7087  */
7088  break;
7089 #endif
7090  }
7091 
7092  pfree(members);
7093  }
7094 
7095  return update_xact;
7096 }
7097 
7098 /*
7099  * HeapTupleGetUpdateXid
7100  * As above, but use a HeapTupleHeader
7101  *
7102  * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7103  * checking the hint bits.
7104  */
7107 {
7109  tuple->t_infomask);
7110 }
7111 
7112 /*
7113  * Does the given multixact conflict with the current transaction grabbing a
7114  * tuple lock of the given strength?
7115  *
7116  * The passed infomask pairs up with the given multixact in the tuple header.
7117  *
7118  * If current_is_member is not NULL, it is set to 'true' if the current
7119  * transaction is a member of the given multixact.
7120  */
7121 static bool
7123  LockTupleMode lockmode, bool *current_is_member)
7124 {
7125  int nmembers;
7126  MultiXactMember *members;
7127  bool result = false;
7128  LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7129 
7130  if (HEAP_LOCKED_UPGRADED(infomask))
7131  return false;
7132 
7133  nmembers = GetMultiXactIdMembers(multi, &members, false,
7134  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7135  if (nmembers >= 0)
7136  {
7137  int i;
7138 
7139  for (i = 0; i < nmembers; i++)
7140  {
7141  TransactionId memxid;
7142  LOCKMODE memlockmode;
7143 
7144  if (result && (current_is_member == NULL || *current_is_member))
7145  break;
7146 
7147  memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7148 
7149  /* ignore members from current xact (but track their presence) */
7150  memxid = members[i].xid;
7152  {
7153  if (current_is_member != NULL)
7154  *current_is_member = true;
7155  continue;
7156  }
7157  else if (result)
7158  continue;
7159 
7160  /* ignore members that don't conflict with the lock we want */
7161  if (!DoLockModesConflict(memlockmode, wanted))
7162  continue;
7163 
7164  if (ISUPDATE_from_mxstatus(members[i].status))
7165  {
7166  /* ignore aborted updaters */
7167  if (TransactionIdDidAbort(memxid))
7168  continue;
7169  }
7170  else
7171  {
7172  /* ignore lockers-only that are no longer in progress */
7173  if (!TransactionIdIsInProgress(memxid))
7174  continue;
7175  }
7176 
7177  /*
7178  * Whatever remains are either live lockers that conflict with our
7179  * wanted lock, and updaters that are not aborted. Those conflict
7180  * with what we want. Set up to return true, but keep going to
7181  * look for the current transaction among the multixact members,
7182  * if needed.
7183  */
7184  result = true;
7185  }
7186  pfree(members);
7187  }
7188 
7189  return result;
7190 }
7191 
7192 /*
7193  * Do_MultiXactIdWait
7194  * Actual implementation for the two functions below.
7195  *
7196  * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7197  * needed to ensure we only sleep on conflicting members, and the infomask is
7198  * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7199  * indicates whether to use conditional lock acquisition, to allow callers to
7200  * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7201  * context information for error messages. 'remaining', if not NULL, receives
7202  * the number of members that are still running, including any (non-aborted)
7203  * subtransactions of our own transaction.
7204  *
7205  * We do this by sleeping on each member using XactLockTableWait. Any
7206  * members that belong to the current backend are *not* waited for, however;
7207  * this would not merely be useless but would lead to Assert failure inside
7208  * XactLockTableWait. By the time this returns, it is certain that all
7209  * transactions *of other backends* that were members of the MultiXactId
7210  * that conflict with the requested status are dead (and no new ones can have
7211  * been added, since it is not legal to add members to an existing
7212  * MultiXactId).
7213  *
7214  * But by the time we finish sleeping, someone else may have changed the Xmax
7215  * of the containing tuple, so the caller needs to iterate on us somehow.
7216  *
7217  * Note that in case we return false, the number of remaining members is
7218  * not to be trusted.
7219  */
7220 static bool
7222  uint16 infomask, bool nowait,
7223  Relation rel, ItemPointer ctid, XLTW_Oper oper,
7224  int *remaining)
7225 {
7226  bool result = true;
7227  MultiXactMember *members;
7228  int nmembers;
7229  int remain = 0;
7230 
7231  /* for pre-pg_upgrade tuples, no need to sleep at all */
7232  nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7233  GetMultiXactIdMembers(multi, &members, false,
7234  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7235 
7236  if (nmembers >= 0)
7237  {
7238  int i;
7239 
7240  for (i = 0; i < nmembers; i++)
7241  {
7242  TransactionId memxid = members[i].xid;
7243  MultiXactStatus memstatus = members[i].status;
7244 
7246  {
7247  remain++;
7248  continue;
7249  }
7250 
7252  LOCKMODE_from_mxstatus(status)))
7253  {
7254  if (remaining && TransactionIdIsInProgress(memxid))
7255  remain++;
7256  continue;
7257  }
7258 
7259  /*
7260  * This member conflicts with our multi, so we have to sleep (or
7261  * return failure, if asked to avoid waiting.)
7262  *
7263  * Note that we don't set up an error context callback ourselves,
7264  * but instead we pass the info down to XactLockTableWait. This
7265  * might seem a bit wasteful because the context is set up and
7266  * tore down for each member of the multixact, but in reality it
7267  * should be barely noticeable, and it avoids duplicate code.
7268  */
7269  if (nowait)
7270  {
7271  result = ConditionalXactLockTableWait(memxid);
7272  if (!result)
7273  break;
7274  }
7275  else
7276  XactLockTableWait(memxid, rel, ctid, oper);
7277  }
7278 
7279  pfree(members);
7280  }
7281 
7282  if (remaining)
7283  *remaining = remain;
7284 
7285  return result;
7286 }
7287 
7288 /*
7289  * MultiXactIdWait
7290  * Sleep on a MultiXactId.
7291  *
7292  * By the time we finish sleeping, someone else may have changed the Xmax
7293  * of the containing tuple, so the caller needs to iterate on us somehow.
7294  *
7295  * We return (in *remaining, if not NULL) the number of members that are still
7296  * running, including any (non-aborted) subtransactions of our own transaction.
7297  */
7298 static void
7300  Relation rel, ItemPointer ctid, XLTW_Oper oper,
7301  int *remaining)
7302 {
7303  (void) Do_MultiXactIdWait(multi, status, infomask, false,
7304  rel, ctid, oper, remaining);
7305 }
7306 
7307 /*
7308  * ConditionalMultiXactIdWait
7309  * As above, but only lock if we can get the lock without blocking.
7310  *
7311  * By the time we finish sleeping, someone else may have changed the Xmax
7312  * of the containing tuple, so the caller needs to iterate on us somehow.
7313  *
7314  * If the multixact is now all gone, return true. Returns false if some
7315  * transactions might still be running.
7316  *
7317  * We return (in *remaining, if not NULL) the number of members that are still
7318  * running, including any (non-aborted) subtransactions of our own transaction.
7319  */
7320 static bool
7322  uint16 infomask, Relation rel, int *remaining)
7323 {
7324  return Do_MultiXactIdWait(multi, status, infomask, true,
7325  rel, NULL, XLTW_None, remaining);
7326 }
7327 
7328 /*
7329  * heap_tuple_needs_eventual_freeze
7330  *
7331  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7332  * will eventually require freezing (if tuple isn't removed by pruning first).
7333  */
7334 bool
7336 {
7337  TransactionId xid;
7338 
7339  /*
7340  * If xmin is a normal transaction ID, this tuple is definitely not
7341  * frozen.
7342  */
7343  xid = HeapTupleHeaderGetXmin(tuple);
7344  if (TransactionIdIsNormal(xid))
7345  return true;
7346 
7347  /*
7348  * If xmax is a valid xact or multixact, this tuple is also not frozen.
7349  */
7350  if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7351  {
7352  MultiXactId multi;
7353 
7354  multi = HeapTupleHeaderGetRawXmax(tuple);
7355  if (MultiXactIdIsValid(multi))
7356  return true;
7357  }
7358  else
7359  {
7360  xid = HeapTupleHeaderGetRawXmax(tuple);
7361  if (TransactionIdIsNormal(xid))
7362  return true;
7363  }
7364 
7365  if (tuple->t_infomask & HEAP_MOVED)
7366  {
7367  xid = HeapTupleHeaderGetXvac(tuple);
7368  if (TransactionIdIsNormal(xid))
7369  return true;
7370  }
7371 
7372  return false;
7373 }
7374 
7375 /*
7376  * heap_tuple_should_freeze
7377  *
7378  * Return value indicates if heap_prepare_freeze_tuple sibling function would
7379  * (or should) force freezing of the heap page that contains caller's tuple.
7380  * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
7381  * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
7382  *
7383  * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
7384  * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
7385  * Our working assumption is that caller won't decide to freeze this tuple.
7386  * It's up to caller to only ratchet back its own top-level trackers after the
7387  * point that it fully commits to not freezing the tuple/page in question.
7388  */
7389 bool
7391  const struct VacuumCutoffs *cutoffs,
7392  TransactionId *NoFreezePageRelfrozenXid,
7393  MultiXactId *NoFreezePageRelminMxid)
7394 {
7395  TransactionId xid;
7396  MultiXactId multi;
7397  bool freeze = false;
7398 
7399  /* First deal with xmin */
7400  xid = HeapTupleHeaderGetXmin(tuple);
7401  if (TransactionIdIsNormal(xid))
7402  {
7404  if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7405  *NoFreezePageRelfrozenXid = xid;
7406  if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7407  freeze = true;
7408  }
7409 
7410  /* Now deal with xmax */
7411  xid = InvalidTransactionId;
7412  multi = InvalidMultiXactId;
7413  if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7414  multi = HeapTupleHeaderGetRawXmax(tuple);
7415  else
7416  xid = HeapTupleHeaderGetRawXmax(tuple);
7417 
7418  if (TransactionIdIsNormal(xid))
7419  {
7421  /* xmax is a non-permanent XID */
7422  if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7423  *NoFreezePageRelfrozenXid = xid;
7424  if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7425  freeze = true;
7426  }
7427  else if (!MultiXactIdIsValid(multi))
7428  {
7429  /* xmax is a permanent XID or invalid MultiXactId/XID */
7430  }
7431  else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7432  {
7433  /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7434  if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7435  *NoFreezePageRelminMxid = multi;
7436  /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7437  freeze = true;
7438  }
7439  else
7440  {
7441  /* xmax is a MultiXactId that may have an updater XID */
7442  MultiXactMember *members;
7443  int nmembers;
7444 
7445  Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi));
7446  if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7447  *NoFreezePageRelminMxid = multi;
7448  if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
7449  freeze = true;
7450 
7451  /* need to check whether any member of the mxact is old */
7452  nmembers = GetMultiXactIdMembers(multi, &members, false,
7454 
7455  for (int i = 0; i < nmembers; i++)
7456  {
7457  xid = members[i].xid;
7459  if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7460  *NoFreezePageRelfrozenXid = xid;
7461  if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7462  freeze = true;
7463  }
7464  if (nmembers > 0)
7465  pfree(members);
7466  }
7467 
7468  if (tuple->t_infomask & HEAP_MOVED)
7469  {
7470  xid = HeapTupleHeaderGetXvac(tuple);
7471  if (TransactionIdIsNormal(xid))
7472  {
7474  if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7475  *NoFreezePageRelfrozenXid = xid;
7476  /* heap_prepare_freeze_tuple forces xvac freezing */
7477  freeze = true;
7478  }
7479  }
7480 
7481  return freeze;
7482 }
7483 
7484 /*
7485  * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
7486  * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
7487  * that caller is in the process of physically removing, e.g. via HOT pruning
7488  * or index deletion.
7489  *
7490  * Caller must initialize its value to InvalidTransactionId, which is
7491  * generally interpreted as "definitely no need for a recovery conflict".
7492  * Final value must reflect all heap tuples that caller will physically remove
7493  * (or remove TID references to) via its ongoing pruning/deletion operation.
7494  * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
7495  * caller's WAL record) by REDO routine when it replays caller's operation.
7496  */
7497 void
7499  TransactionId *snapshotConflictHorizon)
7500 {
7501  TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7503  TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7504 
7505  if (tuple->t_infomask & HEAP_MOVED)
7506  {
7507  if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
7508  *snapshotConflictHorizon = xvac;
7509  }
7510 
7511  /*
7512  * Ignore tuples inserted by an aborted transaction or if the tuple was
7513  * updated/deleted by the inserting transaction.
7514  *
7515  * Look for a committed hint bit, or if no xmin bit is set, check clog.
7516  */
7517  if (HeapTupleHeaderXminCommitted(tuple) ||
7519  {
7520  if (xmax != xmin &&
7521  TransactionIdFollows(xmax, *snapshotConflictHorizon))
7522  *snapshotConflictHorizon = xmax;
7523  }
7524 }
7525 
7526 #ifdef USE_PREFETCH
7527 /*
7528  * Helper function for heap_index_delete_tuples. Issues prefetch requests for
7529  * prefetch_count buffers. The prefetch_state keeps track of all the buffers
7530  * we can prefetch, and which have already been prefetched; each call to this
7531  * function picks up where the previous call left off.
7532  *
7533  * Note: we expect the deltids array to be sorted in an order that groups TIDs
7534  * by heap block, with all TIDs for each block appearing together in exactly
7535  * one group.
7536  */
7537 static void
7538 index_delete_prefetch_buffer(Relation rel,
7539  IndexDeletePrefetchState *prefetch_state,
7540  int prefetch_count)
7541 {
7542  BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
7543  int count = 0;
7544  int i;
7545  int ndeltids = prefetch_state->ndeltids;
7546  TM_IndexDelete *deltids = prefetch_state->deltids;
7547 
7548  for (i = prefetch_state->next_item;
7549  i < ndeltids && count < prefetch_count;
7550  i++)
7551  {
7552  ItemPointer htid = &deltids[i].tid;
7553 
7554  if (cur_hblkno == InvalidBlockNumber ||
7555  ItemPointerGetBlockNumber(htid) != cur_hblkno)
7556  {
7557  cur_hblkno = ItemPointerGetBlockNumber(htid);
7558  PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
7559  count++;
7560  }
7561  }
7562 
7563  /*
7564  * Save the prefetch position so that next time we can continue from that
7565  * position.
7566  */
7567  prefetch_state->next_item = i;
7568  prefetch_state->cur_hblkno = cur_hblkno;
7569 }
7570 #endif
7571 
7572 /*
7573  * Helper function for heap_index_delete_tuples. Checks for index corruption
7574  * involving an invalid TID in index AM caller's index page.
7575  *
7576  * This is an ideal place for these checks. The index AM must hold a buffer
7577  * lock on the index page containing the TIDs we examine here, so we don't
7578  * have to worry about concurrent VACUUMs at all. We can be sure that the
7579  * index is corrupt when htid points directly to an LP_UNUSED item or
7580  * heap-only tuple, which is not the case during standard index scans.
7581  */
7582 static inline void
7584  Page page, OffsetNumber maxoff,
7585  ItemPointer htid, TM_IndexStatus *istatus)
7586 {
7587  OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid);
7588  ItemId iid;
7589 
7591 
7592  if (unlikely(indexpagehoffnum > maxoff))
7593  ereport(ERROR,
7594  (errcode(ERRCODE_INDEX_CORRUPTED),
7595  errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
7597  indexpagehoffnum,
7598  istatus->idxoffnum, delstate->iblknum,
7599  RelationGetRelationName(delstate->irel))));
7600 
7601  iid = PageGetItemId(page, indexpagehoffnum);
7602  if (unlikely(!ItemIdIsUsed(iid)))
7603  ereport(ERROR,
7604  (errcode(ERRCODE_INDEX_CORRUPTED),
7605  errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
7607  indexpagehoffnum,
7608  istatus->idxoffnum, delstate->iblknum,
7609  RelationGetRelationName(delstate->irel))));
7610 
7611  if (ItemIdHasStorage(iid))
7612  {
7613  HeapTupleHeader htup;
7614 
7615  Assert(ItemIdIsNormal(iid));
7616  htup = (HeapTupleHeader) PageGetItem(page, iid);
7617 
7619  ereport(ERROR,
7620  (errcode(ERRCODE_INDEX_CORRUPTED),
7621  errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
7623  indexpagehoffnum,
7624  istatus->idxoffnum, delstate->iblknum,
7625  RelationGetRelationName(delstate->irel))));
7626  }
7627 }
7628 
7629 /*
7630  * heapam implementation of tableam's index_delete_tuples interface.
7631  *
7632  * This helper function is called by index AMs during index tuple deletion.
7633  * See tableam header comments for an explanation of the interface implemented
7634  * here and a general theory of operation. Note that each call here is either
7635  * a simple index deletion call, or a bottom-up index deletion call.
7636  *
7637  * It's possible for this to generate a fair amount of I/O, since we may be
7638  * deleting hundreds of tuples from a single index block. To amortize that
7639  * cost to some degree, this uses prefetching and combines repeat accesses to
7640  * the same heap block.
7641  */
7644 {
7645  /* Initial assumption is that earlier pruning took care of conflict */
7646  TransactionId snapshotConflictHorizon = InvalidTransactionId;
7649  Page page = NULL;
7651  TransactionId priorXmax;
7652 #ifdef USE_PREFETCH
7653  IndexDeletePrefetchState prefetch_state;
7654  int prefetch_distance;
7655 #endif
7656  SnapshotData SnapshotNonVacuumable;
7657  int finalndeltids = 0,
7658  nblocksaccessed = 0;
7659 
7660  /* State that's only used in bottom-up index deletion case */
7661  int nblocksfavorable = 0;
7662  int curtargetfreespace = delstate->bottomupfreespace,
7663  lastfreespace = 0,
7664  actualfreespace = 0;
7665  bool bottomup_final_block = false;
7666 
7667  InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
7668 
7669  /* Sort caller's deltids array by TID for further processing */
7670  index_delete_sort(delstate);
7671 
7672  /*
7673  * Bottom-up case: resort deltids array in an order attuned to where the
7674  * greatest number of promising TIDs are to be found, and determine how
7675  * many blocks from the start of sorted array should be considered
7676  * favorable. This will also shrink the deltids array in order to
7677  * eliminate completely unfavorable blocks up front.
7678  */
7679  if (delstate->bottomup)
7680  nblocksfavorable = bottomup_sort_and_shrink(delstate);
7681 
7682 #ifdef USE_PREFETCH
7683  /* Initialize prefetch state. */
7684  prefetch_state.cur_hblkno = InvalidBlockNumber;
7685  prefetch_state.next_item = 0;
7686  prefetch_state.ndeltids = delstate->ndeltids;
7687  prefetch_state.deltids = delstate->deltids;
7688 
7689  /*
7690  * Determine the prefetch distance that we will attempt to maintain.
7691  *
7692  * Since the caller holds a buffer lock somewhere in rel, we'd better make
7693  * sure that isn't a catalog relation before we call code that does
7694  * syscache lookups, to avoid risk of deadlock.
7695  */
7696  if (IsCatalogRelation(rel))
7697  prefetch_distance = maintenance_io_concurrency;
7698  else
7699  prefetch_distance =
7701 
7702  /* Cap initial prefetch distance for bottom-up deletion caller */
7703  if (delstate->bottomup)
7704  {
7705  Assert(nblocksfavorable >= 1);
7706  Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
7707  prefetch_distance = Min(prefetch_distance, nblocksfavorable);
7708  }
7709 
7710  /* Start prefetching. */
7711  index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7712 #endif
7713 
7714  /* Iterate over deltids, determine which to delete, check their horizon */
7715  Assert(delstate->ndeltids > 0);
7716  for (int i = 0; i < delstate->ndeltids; i++)
7717  {
7718  TM_IndexDelete *ideltid = &delstate->deltids[i];
7719  TM_IndexStatus *istatus = delstate->status + ideltid->id;
7720  ItemPointer htid = &ideltid->tid;
7721  OffsetNumber offnum;
7722 
7723  /*
7724  * Read buffer, and perform required extra steps each time a new block
7725  * is encountered. Avoid refetching if it's the same block as the one
7726  * from the last htid.
7727  */
7728  if (blkno == InvalidBlockNumber ||
7729  ItemPointerGetBlockNumber(htid) != blkno)
7730  {
7731  /*
7732  * Consider giving up early for bottom-up index deletion caller
7733  * first. (Only prefetch next-next block afterwards, when it
7734  * becomes clear that we're at least going to access the next
7735  * block in line.)
7736  *
7737  * Sometimes the first block frees so much space for bottom-up
7738  * caller that the deletion process can end without accessing any
7739  * more blocks. It is usually necessary to access 2 or 3 blocks
7740  * per bottom-up deletion operation, though.
7741  */
7742  if (delstate->bottomup)
7743  {
7744  /*
7745  * We often allow caller to delete a few additional items
7746  * whose entries we reached after the point that space target
7747  * from caller was satisfied. The cost of accessing the page
7748  * was already paid at that point, so it made sense to finish
7749  * it off. When that happened, we finalize everything here
7750  * (by finishing off the whole bottom-up deletion operation
7751  * without needlessly paying the cost of accessing any more
7752  * blocks).
7753  */
7754  if (bottomup_final_block)
7755  break;
7756 
7757  /*
7758  * Give up when we didn't enable our caller to free any
7759  * additional space as a result of processing the page that we
7760  * just finished up with. This rule is the main way in which
7761  * we keep the cost of bottom-up deletion under control.
7762  */
7763  if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
7764  break;
7765  lastfreespace = actualfreespace; /* for next time */
7766 
7767  /*
7768  * Deletion operation (which is bottom-up) will definitely
7769  * access the next block in line. Prepare for that now.
7770  *
7771  * Decay target free space so that we don't hang on for too
7772  * long with a marginal case. (Space target is only truly
7773  * helpful when it allows us to recognize that we don't need
7774  * to access more than 1 or 2 blocks to satisfy caller due to
7775  * agreeable workload characteristics.)
7776  *
7777  * We are a bit more patient when we encounter contiguous
7778  * blocks, though: these are treated as favorable blocks. The
7779  * decay process is only applied when the next block in line
7780  * is not a favorable/contiguous block. This is not an
7781  * exception to the general rule; we still insist on finding
7782  * at least one deletable item per block accessed. See
7783  * bottomup_nblocksfavorable() for full details of the theory
7784  * behind favorable blocks and heap block locality in general.
7785  *
7786  * Note: The first block in line is always treated as a
7787  * favorable block, so the earliest possible point that the
7788  * decay can be applied is just before we access the second
7789  * block in line. The Assert() verifies this for us.
7790  */
7791  Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
7792  if (nblocksfavorable > 0)
7793  nblocksfavorable--;
7794  else
7795  curtargetfreespace /= 2;
7796  }
7797 
7798  /* release old buffer */
7799  if (BufferIsValid(buf))
7801 
7802  blkno = ItemPointerGetBlockNumber(htid);
7803  buf = ReadBuffer(rel, blkno);
7804  nblocksaccessed++;
7805  Assert(!delstate->bottomup ||
7806  nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
7807 
7808 #ifdef USE_PREFETCH
7809 
7810  /*
7811  * To maintain the prefetch distance, prefetch one more page for
7812  * each page we read.
7813  */
7814  index_delete_prefetch_buffer(rel, &prefetch_state, 1);
7815 #endif
7816 
7818 
7819  page = BufferGetPage(buf);
7820  maxoff = PageGetMaxOffsetNumber(page);
7821  }
7822 
7823  /*
7824  * In passing, detect index corruption involving an index page with a
7825  * TID that points to a location in the heap that couldn't possibly be
7826  * correct. We only do this with actual TIDs from caller's index page
7827  * (not items reached by traversing through a HOT chain).
7828  */
7829  index_delete_check_htid(delstate, page, maxoff, htid, istatus);
7830 
7831  if (istatus->knowndeletable)
7832  Assert(!delstate->bottomup && !istatus->promising);
7833  else
7834  {
7835  ItemPointerData tmp = *htid;
7836  HeapTupleData heapTuple;
7837 
7838  /* Are any tuples from this HOT chain non-vacuumable? */
7839  if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
7840  &heapTuple, NULL, true))
7841  continue; /* can't delete entry */
7842 
7843  /* Caller will delete, since whole HOT chain is vacuumable */
7844  istatus->knowndeletable = true;
7845 
7846  /* Maintain index free space info for bottom-up deletion case */
7847  if (delstate->bottomup)
7848  {
7849  Assert(istatus->freespace > 0);
7850  actualfreespace += istatus->freespace;
7851  if (actualfreespace >= curtargetfreespace)
7852  bottomup_final_block = true;
7853  }
7854  }
7855 
7856  /*
7857  * Maintain snapshotConflictHorizon value for deletion operation as a
7858  * whole by advancing current value using heap tuple headers. This is
7859  * loosely based on the logic for pruning a HOT chain.
7860  */
7861  offnum = ItemPointerGetOffsetNumber(htid);
7862  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
7863  for (;;)
7864  {
7865  ItemId lp;
7866  HeapTupleHeader htup;
7867 
7868  /* Sanity check (pure paranoia) */
7869  if (offnum < FirstOffsetNumber)
7870  break;
7871 
7872  /*
7873  * An offset past the end of page's line pointer array is possible
7874  * when the array was truncated
7875  */
7876  if (offnum > maxoff)
7877  break;
7878 
7879  lp = PageGetItemId(page, offnum);
7880  if (ItemIdIsRedirected(lp))
7881  {
7882  offnum = ItemIdGetRedirect(lp);
7883  continue;
7884  }
7885 
7886  /*
7887  * We'll often encounter LP_DEAD line pointers (especially with an
7888  * entry marked knowndeletable by our caller up front). No heap
7889  * tuple headers get examined for an htid that leads us to an
7890  * LP_DEAD item. This is okay because the earlier pruning
7891  * operation that made the line pointer LP_DEAD in the first place
7892  * must have considered the original tuple header as part of
7893  * generating its own snapshotConflictHorizon value.
7894  *
7895  * Relying on XLOG_HEAP2_PRUNE records like this is the same
7896  * strategy that index vacuuming uses in all cases. Index VACUUM
7897  * WAL records don't even have a snapshotConflictHorizon field of
7898  * their own for this reason.
7899  */
7900  if (!ItemIdIsNormal(lp))
7901  break;
7902 
7903  htup = (HeapTupleHeader) PageGetItem(page, lp);
7904 
7905  /*
7906  * Check the tuple XMIN against prior XMAX, if any
7907  */
7908  if (TransactionIdIsValid(priorXmax) &&
7909  !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
7910  break;
7911 
7913  &snapshotConflictHorizon);
7914 
7915  /*
7916  * If the tuple is not HOT-updated, then we are at the end of this
7917  * HOT-chain. No need to visit later tuples from the same update
7918  * chain (they get their own index entries) -- just move on to
7919  * next htid from index AM caller.
7920  */
7921  if (!HeapTupleHeaderIsHotUpdated(htup))
7922  break;
7923 
7924  /* Advance to next HOT chain member */
7925  Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
7926  offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
7927  priorXmax = HeapTupleHeaderGetUpdateXid(htup);
7928  }
7929 
7930  /* Enable further/final shrinking of deltids for caller */
7931  finalndeltids = i + 1;
7932  }
7933 
7935 
7936  /*
7937  * Shrink deltids array to exclude non-deletable entries at the end. This
7938  * is not just a minor optimization. Final deltids array size might be
7939  * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
7940  * ndeltids being zero in all cases with zero total deletable entries.
7941  */
7942  Assert(finalndeltids > 0 || delstate->bottomup);
7943  delstate->ndeltids = finalndeltids;
7944 
7945  return snapshotConflictHorizon;
7946 }
7947 
7948 /*
7949  * Specialized inlineable comparison function for index_delete_sort()
7950  */
7951 static inline int
7953 {
7954  ItemPointer tid1 = &deltid1->tid;
7955  ItemPointer tid2 = &deltid2->tid;
7956 
7957  {
7960 
7961  if (blk1 != blk2)
7962  return (blk1 < blk2) ? -1 : 1;
7963  }
7964  {
7967 
7968  if (pos1 != pos2)
7969  return (pos1 < pos2) ? -1 : 1;
7970  }
7971 
7972  Assert(false);
7973 
7974  return 0;
7975 }
7976 
7977 /*
7978  * Sort deltids array from delstate by TID. This prepares it for further
7979  * processing by heap_index_delete_tuples().
7980  *
7981  * This operation becomes a noticeable consumer of CPU cycles with some
7982  * workloads, so we go to the trouble of specialization/micro optimization.
7983  * We use shellsort for this because it's easy to specialize, compiles to
7984  * relatively few instructions, and is adaptive to presorted inputs/subsets
7985  * (which are typical here).
7986  */
7987 static void
7989 {
7990  TM_IndexDelete *deltids = delstate->deltids;
7991  int ndeltids = delstate->ndeltids;
7992  int low = 0;
7993 
7994  /*
7995  * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
7996  *
7997  * This implementation is fast with array sizes up to ~4500. This covers
7998  * all supported BLCKSZ values.
7999  */
8000  const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8001 
8002  /* Think carefully before changing anything here -- keep swaps cheap */
8003  StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8004  "element size exceeds 8 bytes");
8005 
8006  for (int g = 0; g < lengthof(gaps); g++)
8007  {
8008  for (int hi = gaps[g], i = low + hi; i < ndeltids; i++)
8009  {
8010  TM_IndexDelete d = deltids[i];
8011  int j = i;
8012 
8013  while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8014  {
8015  deltids[j] = deltids[j - hi];
8016  j -= hi;
8017  }
8018  deltids[j] = d;
8019  }
8020  }
8021 }
8022 
8023 /*
8024  * Returns how many blocks should be considered favorable/contiguous for a
8025  * bottom-up index deletion pass. This is a number of heap blocks that starts
8026  * from and includes the first block in line.
8027  *
8028  * There is always at least one favorable block during bottom-up index
8029  * deletion. In the worst case (i.e. with totally random heap blocks) the
8030  * first block in line (the only favorable block) can be thought of as a
8031  * degenerate array of contiguous blocks that consists of a single block.
8032  * heap_index_delete_tuples() will expect this.
8033  *
8034  * Caller passes blockgroups, a description of the final order that deltids
8035  * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
8036  * processing. Note that deltids need not actually be sorted just yet (caller
8037  * only passes deltids to us so that we can interpret blockgroups).
8038  *
8039  * You might guess that the existence of contiguous blocks cannot matter much,
8040  * since in general the main factor that determines which blocks we visit is
8041  * the number of promising TIDs, which is a fixed hint from the index AM.
8042  * We're not really targeting the general case, though -- the actual goal is
8043  * to adapt our behavior to a wide variety of naturally occurring conditions.
8044  * The effects of most of the heuristics we apply are only noticeable in the
8045  * aggregate, over time and across many _related_ bottom-up index deletion
8046  * passes.
8047  *
8048  * Deeming certain blocks favorable allows heapam to recognize and adapt to
8049  * workloads where heap blocks visited during bottom-up index deletion can be
8050  * accessed contiguously, in the sense that each newly visited block is the
8051  * neighbor of the block that bottom-up deletion just finished processing (or
8052  * close enough to it). It will likely be cheaper to access more favorable
8053  * blocks sooner rather than later (e.g. in this pass, not across a series of
8054  * related bottom-up passes). Either way it is probably only a matter of time
8055  * (or a matter of further correlated version churn) before all blocks that
8056  * appear together as a single large batch of favorable blocks get accessed by
8057  * _some_ bottom-up pass. Large batches of favorable blocks tend to either
8058  * appear almost constantly or not even once (it all depends on per-index
8059  * workload characteristics).
8060  *
8061  * Note that the blockgroups sort order applies a power-of-two bucketing
8062  * scheme that creates opportunities for contiguous groups of blocks to get
8063  * batched together, at least with workloads that are naturally amenable to
8064  * being driven by heap block locality. This doesn't just enhance the spatial
8065  * locality of bottom-up heap block processing in the obvious way. It also
8066  * enables temporal locality of access, since sorting by heap block number
8067  * naturally tends to make the bottom-up processing order deterministic.
8068  *
8069  * Consider the following example to get a sense of how temporal locality
8070  * might matter: There is a heap relation with several indexes, each of which
8071  * is low to medium cardinality. It is subject to constant non-HOT updates.
8072  * The updates are skewed (in one part of the primary key, perhaps). None of
8073  * the indexes are logically modified by the UPDATE statements (if they were
8074  * then bottom-up index deletion would not be triggered in the first place).
8075  * Naturally, each new round of index tuples (for each heap tuple that gets a
8076  * heap_update() call) will have the same heap TID in each and every index.
8077  * Since these indexes are low cardinality and never get logically modified,
8078  * heapam processing during bottom-up deletion passes will access heap blocks
8079  * in approximately sequential order. Temporal locality of access occurs due
8080  * to bottom-up deletion passes behaving very similarly across each of the
8081  * indexes at any given moment. This keeps the number of buffer misses needed
8082  * to visit heap blocks to a minimum.
8083  */
8084 static int
8085 bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
8086  TM_IndexDelete *deltids)
8087 {
8088  int64 lastblock = -1;
8089  int nblocksfavorable = 0;
8090 
8091  Assert(nblockgroups >= 1);
8092  Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
8093 
8094  /*
8095  * We tolerate heap blocks that will be accessed only slightly out of
8096  * physical order. Small blips occur when a pair of almost-contiguous
8097  * blocks happen to fall into different buckets (perhaps due only to a
8098  * small difference in npromisingtids that the bucketing scheme didn't
8099  * quite manage to ignore). We effectively ignore these blips by applying
8100  * a small tolerance. The precise tolerance we use is a little arbitrary,
8101  * but it works well enough in practice.
8102  */
8103  for (int b = 0; b < nblockgroups; b++)
8104  {
8105  IndexDeleteCounts *group = blockgroups + b;
8106  TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8107  BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
8108 
8109  if (lastblock != -1 &&
8110  ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
8111  (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
8112  break;
8113 
8114  nblocksfavorable++;
8115  lastblock = block;
8116  }
8117 
8118  /* Always indicate that there is at least 1 favorable block */
8119  Assert(nblocksfavorable >= 1);
8120 
8121  return nblocksfavorable;
8122 }
8123 
8124 /*
8125  * qsort comparison function for bottomup_sort_and_shrink()
8126  */
8127 static int
8128 bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
8129 {
8130  const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
8131  const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
8132 
8133  /*
8134  * Most significant field is npromisingtids (which we invert the order of
8135  * so as to sort in desc order).
8136  *
8137  * Caller should have already normalized npromisingtids fields into
8138  * power-of-two values (buckets).
8139  */
8140  if (group1->npromisingtids > group2->npromisingtids)
8141  return -1;
8142  if (group1->npromisingtids < group2->npromisingtids)
8143  return 1;
8144 
8145  /*
8146  * Tiebreak: desc ntids sort order.
8147  *
8148  * We cannot expect power-of-two values for ntids fields. We should
8149  * behave as if they were already rounded up for us instead.
8150  */
8151  if (group1->ntids != group2->ntids)
8152  {
8153  uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids);
8154  uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids);
8155 
8156  if (ntids1 > ntids2)
8157  return -1;
8158  if (ntids1 < ntids2)
8159  return 1;
8160  }
8161 
8162  /*
8163  * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8164  * block in deltids array) order.
8165  *
8166  * This is equivalent to sorting in ascending heap block number order
8167  * (among otherwise equal subsets of the array). This approach allows us
8168  * to avoid accessing the out-of-line TID. (We rely on the assumption
8169  * that the deltids array was sorted in ascending heap TID order when
8170  * these offsets to the first TID from each heap block group were formed.)
8171  */
8172  if (group1->ifirsttid > group2->ifirsttid)
8173  return 1;
8174  if (group1->ifirsttid < group2->ifirsttid)
8175  return -1;
8176 
8177  pg_unreachable();
8178 
8179  return 0;
8180 }
8181 
8182 /*
8183  * heap_index_delete_tuples() helper function for bottom-up deletion callers.
8184  *
8185  * Sorts deltids array in the order needed for useful processing by bottom-up
8186  * deletion. The array should already be sorted in TID order when we're
8187  * called. The sort process groups heap TIDs from deltids into heap block
8188  * groupings. Earlier/more-promising groups/blocks are usually those that are
8189  * known to have the most "promising" TIDs.
8190  *
8191  * Sets new size of deltids array (ndeltids) in state. deltids will only have
8192  * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
8193  * return. This often means that deltids will be shrunk to a small fraction
8194  * of its original size (we eliminate many heap blocks from consideration for
8195  * caller up front).
8196  *
8197  * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
8198  * for a definition and full details.
8199  */
8200 static int
8202 {
8203  IndexDeleteCounts *blockgroups;
8204  TM_IndexDelete *reordereddeltids;
8205  BlockNumber curblock = InvalidBlockNumber;
8206  int nblockgroups = 0;
8207  int ncopied = 0;
8208  int nblocksfavorable = 0;
8209 
8210  Assert(delstate->bottomup);
8211  Assert(delstate->ndeltids > 0);
8212 
8213  /* Calculate per-heap-block count of TIDs */
8214  blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
8215  for (int i = 0; i < delstate->ndeltids; i++)
8216  {
8217  TM_IndexDelete *ideltid = &delstate->deltids[i];
8218  TM_IndexStatus *istatus = delstate->status + ideltid->id;
8219  ItemPointer htid = &ideltid->tid;
8220  bool promising = istatus->promising;
8221 
8222  if (curblock != ItemPointerGetBlockNumber(htid))
8223  {
8224  /* New block group */
8225  nblockgroups++;
8226 
8227  Assert(curblock < ItemPointerGetBlockNumber(htid) ||
8228  !BlockNumberIsValid(curblock));
8229 
8230  curblock = ItemPointerGetBlockNumber(htid);
8231  blockgroups[nblockgroups - 1].ifirsttid = i;
8232  blockgroups[nblockgroups - 1].ntids = 1;
8233  blockgroups[nblockgroups - 1].npromisingtids = 0;
8234  }
8235  else
8236  {
8237  blockgroups[nblockgroups - 1].ntids++;
8238  }
8239 
8240  if (promising)
8241  blockgroups[nblockgroups - 1].npromisingtids++;
8242  }
8243 
8244  /*
8245  * We're about ready to sort block groups to determine the optimal order
8246  * for visiting heap blocks. But before we do, round the number of
8247  * promising tuples for each block group up to the next power-of-two,
8248  * unless it is very low (less than 4), in which case we round up to 4.
8249  * npromisingtids is far too noisy to trust when choosing between a pair
8250  * of block groups that both have very low values.
8251  *
8252  * This scheme divides heap blocks/block groups into buckets. Each bucket
8253  * contains blocks that have _approximately_ the same number of promising
8254  * TIDs as each other. The goal is to ignore relatively small differences
8255  * in the total number of promising entries, so that the whole process can
8256  * give a little weight to heapam factors (like heap block locality)
8257  * instead. This isn't a trade-off, really -- we have nothing to lose. It
8258  * would be foolish to interpret small differences in npromisingtids
8259  * values as anything more than noise.
8260  *
8261  * We tiebreak on nhtids when sorting block group subsets that have the
8262  * same npromisingtids, but this has the same issues as npromisingtids,
8263  * and so nhtids is subject to the same power-of-two bucketing scheme. The
8264  * only reason that we don't fix nhtids in the same way here too is that
8265  * we'll need accurate nhtids values after the sort. We handle nhtids
8266  * bucketization dynamically instead (in the sort comparator).
8267  *
8268  * See bottomup_nblocksfavorable() for a full explanation of when and how
8269  * heap locality/favorable blocks can significantly influence when and how
8270  * heap blocks are accessed.
8271  */
8272  for (int b = 0; b < nblockgroups; b++)
8273  {
8274  IndexDeleteCounts *group = blockgroups + b;
8275 
8276  /* Better off falling back on nhtids with low npromisingtids */
8277  if (group->npromisingtids <= 4)
8278  group->npromisingtids = 4;
8279  else
8280  group->npromisingtids =
8282  }
8283 
8284  /* Sort groups and rearrange caller's deltids array */
8285  qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
8287  reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8288 
8289  nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
8290  /* Determine number of favorable blocks at the start of final deltids */
8291  nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
8292  delstate->deltids);
8293 
8294  for (int b = 0; b < nblockgroups; b++)
8295  {
8296  IndexDeleteCounts *group = blockgroups + b;
8297  TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8298 
8299  memcpy(reordereddeltids + ncopied, firstdtid,
8300  sizeof(TM_IndexDelete) * group->ntids);
8301  ncopied += group->ntids;
8302  }
8303 
8304  /* Copy final grouped and sorted TIDs back into start of caller's array */
8305  memcpy(delstate->deltids, reordereddeltids,
8306  sizeof(TM_IndexDelete) * ncopied);
8307  delstate->ndeltids = ncopied;
8308 
8309  pfree(reordereddeltids);
8310  pfree(blockgroups);
8311 
8312  return nblocksfavorable;
8313 }
8314 
8315 /*
8316  * Perform XLogInsert for a heap-visible operation. 'block' is the block
8317  * being marked all-visible, and vm_buffer is the buffer containing the
8318  * corresponding visibility map block. Both should have already been modified
8319  * and dirtied.
8320  *
8321  * snapshotConflictHorizon comes from the largest xmin on the page being
8322  * marked all-visible. REDO routine uses it to generate recovery conflicts.
8323  *
8324  * If checksums or wal_log_hints are enabled, we may also generate a full-page
8325  * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
8326  * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
8327  * update the heap page's LSN.
8328  */
8329 XLogRecPtr
8330 log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer,
8331  TransactionId snapshotConflictHorizon, uint8 vmflags)
8332 {
8333  xl_heap_visible xlrec;
8334  XLogRecPtr recptr;
8335  uint8 flags;
8336 
8337  Assert(BufferIsValid(heap_buffer));
8338  Assert(BufferIsValid(vm_buffer));
8339 
8340  xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
8341  xlrec.flags = vmflags;
8344  XLogBeginInsert();
8345  XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
8346 
8347  XLogRegisterBuffer(0, vm_buffer, 0);
8348 
8349  flags = REGBUF_STANDARD;
8350  if (!XLogHintBitIsNeeded())
8351  flags |= REGBUF_NO_IMAGE;
8352  XLogRegisterBuffer(1, heap_buffer, flags);
8353 
8354  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
8355 
8356  return recptr;
8357 }
8358 
8359 /*
8360  * Perform XLogInsert for a heap-update operation. Caller must already
8361  * have modified the buffer(s) and marked them dirty.
8362  */
8363 static XLogRecPtr
8365  Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
8366  HeapTuple old_key_tuple,
8367  bool all_visible_cleared, bool new_all_visible_cleared)
8368 {
8369  xl_heap_update xlrec;
8370  xl_heap_header xlhdr;
8371  xl_heap_header xlhdr_idx;
8372  uint8 info;
8373  uint16 prefix_suffix[2];
8374  uint16 prefixlen = 0,
8375  suffixlen = 0;
8376  XLogRecPtr recptr;
8377  Page page = BufferGetPage(newbuf);
8378  bool need_tuple_data = RelationIsLogicallyLogged(reln);
8379  bool init;
8380  int bufflags;
8381 
8382  /* Caller should not call me on a non-WAL-logged relation */
8383  Assert(RelationNeedsWAL(reln));
8384 
8385  XLogBeginInsert();
8386 
8387  if (HeapTupleIsHeapOnly(newtup))
8388  info = XLOG_HEAP_HOT_UPDATE;
8389  else
8390  info = XLOG_HEAP_UPDATE;
8391 
8392  /*
8393  * If the old and new tuple are on the same page, we only need to log the
8394  * parts of the new tuple that were changed. That saves on the amount of
8395  * WAL we need to write. Currently, we just count any unchanged bytes in
8396  * the beginning and end of the tuple. That's quick to check, and
8397  * perfectly covers the common case that only one field is updated.
8398  *
8399  * We could do this even if the old and new tuple are on different pages,
8400  * but only if we don't make a full-page image of the old page, which is
8401  * difficult to know in advance. Also, if the old tuple is corrupt for
8402  * some reason, it would allow the corruption to propagate the new page,
8403  * so it seems best to avoid. Under the general assumption that most
8404  * updates tend to create the new tuple version on the same page, there
8405  * isn't much to be gained by doing this across pages anyway.
8406  *
8407  * Skip this if we're taking a full-page image of the new page, as we
8408  * don't include the new tuple in the WAL record in that case. Also
8409  * disable if wal_level='logical', as logical decoding needs to be able to
8410  * read the new tuple in whole from the WAL record alone.
8411  */
8412  if (oldbuf == newbuf && !need_tuple_data &&
8413  !XLogCheckBufferNeedsBackup(newbuf))
8414  {
8415  char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8416  char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8417  int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8418  int newlen = newtup->t_len - newtup->t_data->t_hoff;
8419 
8420  /* Check for common prefix between old and new tuple */
8421  for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8422  {
8423  if (newp[prefixlen] != oldp[prefixlen])
8424  break;
8425  }
8426 
8427  /*
8428  * Storing the length of the prefix takes 2 bytes, so we need to save
8429  * at least 3 bytes or there's no point.
8430  */
8431  if (prefixlen < 3)
8432  prefixlen = 0;
8433 
8434  /* Same for suffix */
8435  for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
8436  {
8437  if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8438  break;
8439  }
8440  if (suffixlen < 3)
8441  suffixlen = 0;
8442  }
8443 
8444  /* Prepare main WAL data chain */
8445  xlrec.flags = 0;
8446  if (all_visible_cleared)
8448  if (new_all_visible_cleared)
8450  if (prefixlen > 0)
8452  if (suffixlen > 0)
8454  if (need_tuple_data)
8455  {
8457  if (old_key_tuple)
8458  {
8459  if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
8461  else
8463  }
8464  }
8465 
8466  /* If new tuple is the single and first tuple on page... */
8469  {
8470  info |= XLOG_HEAP_INIT_PAGE;
8471  init = true;
8472  }
8473  else
8474  init = false;
8475 
8476  /* Prepare WAL data for the old page */
8477  xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
8478  xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
8480  oldtup->t_data->t_infomask2);
8481 
8482  /* Prepare WAL data for the new page */
8483  xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
8484  xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
8485 
8486  bufflags = REGBUF_STANDARD;
8487  if (init)
8488  bufflags |= REGBUF_WILL_INIT;
8489  if (need_tuple_data)
8490  bufflags |= REGBUF_KEEP_DATA;
8491 
8492  XLogRegisterBuffer(0, newbuf, bufflags);
8493  if (oldbuf != newbuf)
8494  XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
8495 
8496  XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
8497 
8498  /*
8499  * Prepare WAL data for the new tuple.
8500  */
8501  if (prefixlen > 0 || suffixlen > 0)
8502  {
8503  if (prefixlen > 0 && suffixlen > 0)
8504  {
8505  prefix_suffix[0] = prefixlen;
8506  prefix_suffix[1] = suffixlen;
8507  XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
8508  }
8509  else if (prefixlen > 0)
8510  {
8511  XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
8512  }
8513  else
8514  {
8515  XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
8516  }
8517  }
8518 
8519  xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
8520  xlhdr.t_infomask = newtup->t_data->t_infomask;
8521  xlhdr.t_hoff = newtup->t_data->t_hoff;
8522  Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
8523 
8524  /*
8525  * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
8526  *
8527  * The 'data' doesn't include the common prefix or suffix.
8528  */
8529  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
8530  if (prefixlen == 0)
8531  {
8533  ((char *) newtup->t_data) + SizeofHeapTupleHeader,
8534  newtup->t_len - SizeofHeapTupleHeader - suffixlen);
8535  }
8536  else
8537  {
8538  /*
8539  * Have to write the null bitmap and data after the common prefix as
8540  * two separate rdata entries.
8541  */
8542  /* bitmap [+ padding] [+ oid] */
8543  if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
8544  {
8546  ((char *) newtup->t_data) + SizeofHeapTupleHeader,
8547  newtup->t_data->t_hoff - SizeofHeapTupleHeader);
8548  }
8549 
8550  /* data after common prefix */
8552  ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
8553  newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
8554  }
8555 
8556  /* We need to log a tuple identity */
8557  if (need_tuple_data && old_key_tuple)
8558  {
8559  /* don't really need this, but its more comfy to decode */
8560  xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
8561  xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
8562  xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
8563 
8564  XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
8565 
8566  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
8567  XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
8568  old_key_tuple->t_len - SizeofHeapTupleHeader);
8569  }
8570 
8571  /* filtering by origin on a row level is much more efficient */
8573 
8574  recptr = XLogInsert(RM_HEAP_ID, info);
8575 
8576  return recptr;
8577 }
8578 
8579 /*
8580  * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
8581  *
8582  * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
8583  * tuples.
8584  */
8585 static XLogRecPtr
8587 {
8588  xl_heap_new_cid xlrec;
8589 
8590  XLogRecPtr recptr;
8591  HeapTupleHeader hdr = tup->t_data;
8592 
8594  Assert(tup->t_tableOid != InvalidOid);
8595 
8596  xlrec.top_xid = GetTopTransactionId();
8597  xlrec.target_locator = relation->rd_locator;
8598  xlrec.target_tid = tup->t_self;
8599 
8600  /*
8601  * If the tuple got inserted & deleted in the same TX we definitely have a
8602  * combo CID, set cmin and cmax.
8603  */
8604  if (hdr->t_infomask & HEAP_COMBOCID)
8605  {
8606  Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
8608  xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
8609  xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
8611  }
8612  /* No combo CID, so only cmin or cmax can be set by this TX */
8613  else
8614  {
8615  /*
8616  * Tuple inserted.
8617  *
8618  * We need to check for LOCK ONLY because multixacts might be
8619  * transferred to the new tuple in case of FOR KEY SHARE updates in
8620  * which case there will be an xmax, although the tuple just got
8621  * inserted.
8622  */
8623  if (hdr->t_infomask & HEAP_XMAX_INVALID ||
8625  {
8626  xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
8627  xlrec.cmax = InvalidCommandId;
8628  }
8629  /* Tuple from a different tx updated or deleted. */
8630  else
8631  {
8632  xlrec.cmin = InvalidCommandId;
8633  xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
8634  }
8635  xlrec.combocid = InvalidCommandId;
8636  }
8637 
8638  /*
8639  * Note that we don't need to register the buffer here, because this
8640  * operation does not modify the page. The insert/update/delete that
8641  * called us certainly did, but that's WAL-logged separately.
8642  */
8643  XLogBeginInsert();
8644  XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
8645 
8646  /* will be looked at irrespective of origin */
8647 
8648  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
8649 
8650  return recptr;
8651 }
8652 
8653 /*
8654  * Build a heap tuple representing the configured REPLICA IDENTITY to represent
8655  * the old tuple in an UPDATE or DELETE.
8656  *
8657  * Returns NULL if there's no need to log an identity or if there's no suitable
8658  * key defined.
8659  *
8660  * Pass key_required true if any replica identity columns changed value, or if
8661  * any of them have any external data. Delete must always pass true.
8662  *
8663  * *copy is set to true if the returned tuple is a modified copy rather than
8664  * the same tuple that was passed in.
8665  */
8666 static HeapTuple
8667 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
8668  bool *copy)
8669 {
8670  TupleDesc desc = RelationGetDescr(relation);
8671  char replident = relation->rd_rel->relreplident;
8672  Bitmapset *idattrs;
8673  HeapTuple key_tuple;
8674  bool nulls[MaxHeapAttributeNumber];
8676 
8677  *copy = false;
8678 
8679  if (!RelationIsLogicallyLogged(relation))
8680  return NULL;
8681 
8682  if (replident == REPLICA_IDENTITY_NOTHING)
8683  return NULL;
8684 
8685  if (replident == REPLICA_IDENTITY_FULL)
8686  {
8687  /*
8688  * When logging the entire old tuple, it very well could contain
8689  * toasted columns. If so, force them to be inlined.
8690  */
8691  if (HeapTupleHasExternal(tp))
8692  {
8693  *copy = true;
8694  tp = toast_flatten_tuple(tp, desc);
8695  }
8696  return tp;
8697  }
8698 
8699  /* if the key isn't required and we're only logging the key, we're done */
8700  if (!key_required)
8701  return NULL;
8702 
8703  /* find out the replica identity columns */
8704  idattrs = RelationGetIndexAttrBitmap(relation,
8706 
8707  /*
8708  * If there's no defined replica identity columns, treat as !key_required.
8709  * (This case should not be reachable from heap_update, since that should
8710  * calculate key_required accurately. But heap_delete just passes
8711  * constant true for key_required, so we can hit this case in deletes.)
8712  */
8713  if (bms_is_empty(idattrs))
8714  return NULL;
8715 
8716  /*
8717  * Construct a new tuple containing only the replica identity columns,
8718  * with nulls elsewhere. While we're at it, assert that the replica
8719  * identity columns aren't null.
8720  */
8721  heap_deform_tuple(tp, desc, values, nulls);
8722 
8723  for (int i = 0; i < desc->natts; i++)
8724  {
8726  idattrs))
8727  Assert(!nulls[i]);
8728  else
8729  nulls[i] = true;
8730  }
8731 
8732  key_tuple = heap_form_tuple(desc, values, nulls);
8733  *copy = true;
8734 
8735  bms_free(idattrs);
8736 
8737  /*
8738  * If the tuple, which by here only contains indexed columns, still has
8739  * toasted columns, force them to be inlined. This is somewhat unlikely
8740  * since there's limits on the size of indexed columns, so we don't
8741  * duplicate toast_flatten_tuple()s functionality in the above loop over
8742  * the indexed columns, even if it would be more efficient.
8743  */
8744  if (HeapTupleHasExternal(key_tuple))
8745  {
8746  HeapTuple oldtup = key_tuple;
8747 
8748  key_tuple = toast_flatten_tuple(oldtup, desc);
8749  heap_freetuple(oldtup);
8750  }
8751 
8752  return key_tuple;
8753 }
8754 
8755 /*
8756  * Handles XLOG_HEAP2_PRUNE record type.
8757  *
8758  * Acquires a full cleanup lock.
8759  */
8760 static void
8762 {
8763  XLogRecPtr lsn = record->EndRecPtr;
8764  xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record);
8765  Buffer buffer;
8766  RelFileLocator rlocator;
8767  BlockNumber blkno;
8769 
8770  XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
8771 
8772  /*
8773  * We're about to remove tuples. In Hot Standby mode, ensure that there's
8774  * no queries running for which the removed tuples are still visible.
8775  */
8776  if (InHotStandby)
8778  xlrec->isCatalogRel,
8779  rlocator);
8780 
8781  /*
8782  * If we have a full-page image, restore it (using a cleanup lock) and
8783  * we're done.
8784  */
8785  action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
8786  &buffer);
8787  if (action == BLK_NEEDS_REDO)
8788  {
8789  Page page = (Page) BufferGetPage(buffer);
8790  OffsetNumber *end;
8791  OffsetNumber *redirected;
8792  OffsetNumber *nowdead;
8793  OffsetNumber *nowunused;
8794  int nredirected;
8795  int ndead;
8796  int nunused;
8797  Size datalen;
8798 
8799  redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8800 
8801  nredirected = xlrec->nredirected;
8802  ndead = xlrec->ndead;
8803  end = (OffsetNumber *) ((char *) redirected + datalen);
8804  nowdead = redirected + (nredirected * 2);
8805  nowunused = nowdead + ndead;
8806  nunused = (end - nowunused);
8807  Assert(nunused >= 0);
8808 
8809  /* Update all line pointers per the record, and repair fragmentation */
8810  heap_page_prune_execute(buffer,
8811  redirected, nredirected,
8812  nowdead, ndead,
8813  nowunused, nunused);
8814 
8815  /*
8816  * Note: we don't worry about updating the page's prunability hints.
8817  * At worst this will cause an extra prune cycle to occur soon.
8818  */
8819 
8820  PageSetLSN(page, lsn);
8821  MarkBufferDirty(buffer);
8822  }
8823 
8824  if (BufferIsValid(buffer))
8825  {
8826  Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8827 
8828  UnlockReleaseBuffer(buffer);
8829 
8830  /*
8831  * After pruning records from a page, it's useful to update the FSM
8832  * about it, as it may cause the page become target for insertions
8833  * later even if vacuum decides not to visit it (which is possible if
8834  * gets marked all-visible.)
8835  *
8836  * Do this regardless of a full-page image being applied, since the
8837  * FSM data is not in the page anyway.
8838  */
8839  XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
8840  }
8841 }
8842 
8843 /*
8844  * Handles XLOG_HEAP2_VACUUM record type.
8845  *
8846  * Acquires an ordinary exclusive lock only.
8847  */
8848 static void
8850 {
8851  XLogRecPtr lsn = record->EndRecPtr;
8852  xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record);
8853  Buffer buffer;
8854  BlockNumber blkno;
8856 
8857  /*
8858  * If we have a full-page image, restore it (without using a cleanup lock)
8859  * and we're done.
8860  */
8861  action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false,
8862  &buffer);
8863  if (action == BLK_NEEDS_REDO)
8864  {
8865  Page page = (Page) BufferGetPage(buffer);
8866  OffsetNumber *nowunused;
8867  Size datalen;
8868  OffsetNumber *offnum;
8869 
8870  nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8871 
8872  /* Shouldn't be a record unless there's something to do */
8873  Assert(xlrec->nunused > 0);
8874 
8875  /* Update all now-unused line pointers */
8876  offnum = nowunused;
8877  for (int i = 0; i < xlrec->nunused; i++)
8878  {
8879  OffsetNumber off = *offnum++;
8880  ItemId lp = PageGetItemId(page, off);
8881 
8882  Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
8883  ItemIdSetUnused(lp);
8884  }
8885 
8886  /* Attempt to truncate line pointer array now */
8888 
8889  PageSetLSN(page, lsn);
8890  MarkBufferDirty(buffer);
8891  }
8892 
8893  if (BufferIsValid(buffer))
8894  {
8895  Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8896  RelFileLocator rlocator;
8897 
8898  XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
8899 
8900  UnlockReleaseBuffer(buffer);
8901 
8902  /*
8903  * After vacuuming LP_DEAD items from a page, it's useful to update
8904  * the FSM about it, as it may cause the page become target for
8905  * insertions later even if vacuum decides not to visit it (which is
8906  * possible if gets marked all-visible.)
8907  *
8908  * Do this regardless of a full-page image being applied, since the
8909  * FSM data is not in the page anyway.
8910  */
8911  XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
8912  }
8913 }
8914 
8915 /*
8916  * Replay XLOG_HEAP2_VISIBLE record.
8917  *
8918  * The critical integrity requirement here is that we must never end up with
8919  * a situation where the visibility map bit is set, and the page-level
8920  * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
8921  * page modification would fail to clear the visibility map bit.
8922  */
8923 static void
8925 {
8926  XLogRecPtr lsn = record->EndRecPtr;
8927  xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
8928  Buffer vmbuffer = InvalidBuffer;
8929  Buffer buffer;
8930  Page page;
8931  RelFileLocator rlocator;
8932  BlockNumber blkno;
8934 
8935  Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags);
8936 
8937  XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno);
8938 
8939  /*
8940  * If there are any Hot Standby transactions running that have an xmin
8941  * horizon old enough that this page isn't all-visible for them, they
8942  * might incorrectly decide that an index-only scan can skip a heap fetch.
8943  *
8944  * NB: It might be better to throw some kind of "soft" conflict here that
8945  * forces any index-only scan that is in flight to perform heap fetches,
8946  * rather than killing the transaction outright.
8947  */
8948  if (InHotStandby)
8951  rlocator);
8952 
8953  /*
8954  * Read the heap page, if it still exists. If the heap file has dropped or
8955  * truncated later in recovery, we don't need to update the page, but we'd
8956  * better still update the visibility map.
8957  */
8958  action = XLogReadBufferForRedo(record, 1, &buffer);
8959  if (action == BLK_NEEDS_REDO)
8960  {
8961  /*
8962  * We don't bump the LSN of the heap page when setting the visibility
8963  * map bit (unless checksums or wal_hint_bits is enabled, in which
8964  * case we must). This exposes us to torn page hazards, but since
8965  * we're not inspecting the existing page contents in any way, we
8966  * don't care.
8967  */
8968  page = BufferGetPage(buffer);
8969 
8970  PageSetAllVisible(page);
8971 
8972  if (XLogHintBitIsNeeded())
8973  PageSetLSN(page, lsn);
8974 
8975  MarkBufferDirty(buffer);
8976  }
8977  else if (action == BLK_RESTORED)
8978  {
8979  /*
8980  * If heap block was backed up, we already restored it and there's
8981  * nothing more to do. (This can only happen with checksums or
8982  * wal_log_hints enabled.)
8983  */
8984  }
8985 
8986  if (BufferIsValid(buffer))
8987  {
8988  Size space = PageGetFreeSpace(BufferGetPage(buffer));
8989 
8990  UnlockReleaseBuffer(buffer);
8991 
8992  /*
8993  * Since FSM is not WAL-logged and only updated heuristically, it
8994  * easily becomes stale in standbys. If the standby is later promoted
8995  * and runs VACUUM, it will skip updating individual free space
8996  * figures for pages that became all-visible (or all-frozen, depending
8997  * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum
8998  * propagates too optimistic free space values to upper FSM layers;
8999  * later inserters try to use such pages only to find out that they
9000  * are unusable. This can cause long stalls when there are many such
9001  * pages.
9002  *
9003  * Forestall those problems by updating FSM's idea about a page that
9004  * is becoming all-visible or all-frozen.
9005  *
9006  * Do this regardless of a full-page image being applied, since the
9007  * FSM data is not in the page anyway.
9008  */
9009  if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
9010  XLogRecordPageWithFreeSpace(rlocator, blkno, space);
9011  }
9012 
9013  /*
9014  * Even if we skipped the heap page update due to the LSN interlock, it's
9015  * still safe to update the visibility map. Any WAL record that clears
9016  * the visibility map bit does so before checking the page LSN, so any
9017  * bits that need to be cleared will still be cleared.
9018  */
9019  if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
9020  &vmbuffer) == BLK_NEEDS_REDO)
9021  {
9022  Page vmpage = BufferGetPage(vmbuffer);
9023  Relation reln;
9024  uint8 vmbits;
9025 
9026  /* initialize the page if it was read as zeros */
9027  if (PageIsNew(vmpage))
9028  PageInit(vmpage, BLCKSZ, 0);
9029 
9030  /* remove VISIBILITYMAP_XLOG_* */
9031  vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS;
9032 
9033  /*
9034  * XLogReadBufferForRedoExtended locked the buffer. But
9035  * visibilitymap_set will handle locking itself.
9036  */
9037  LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
9038 
9039  reln = CreateFakeRelcacheEntry(rlocator);
9040  visibilitymap_pin(reln, blkno, &vmbuffer);
9041 
9042  visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
9043  xlrec->snapshotConflictHorizon, vmbits);
9044 
9045  ReleaseBuffer(vmbuffer);
9046  FreeFakeRelcacheEntry(reln);
9047  }
9048  else if (BufferIsValid(vmbuffer))
9049  UnlockReleaseBuffer(vmbuffer);
9050 }
9051 
9052 /*
9053  * Replay XLOG_HEAP2_FREEZE_PAGE records
9054  */
9055 static void
9057 {
9058  XLogRecPtr lsn = record->EndRecPtr;
9060  Buffer buffer;
9061 
9062  /*
9063  * In Hot Standby mode, ensure that there's no queries running which still
9064  * consider the frozen xids as running.
9065  */
9066  if (InHotStandby)
9067  {
9068  RelFileLocator rlocator;
9069 
9070  XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);
9072  xlrec->isCatalogRel,
9073  rlocator);
9074  }
9075 
9076  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9077  {
9078  Page page = BufferGetPage(buffer);
9079  xl_heap_freeze_plan *plans;
9080  OffsetNumber *offsets;
9081  int curoff = 0;
9082 
9083  plans = (xl_heap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL);
9084  offsets = (OffsetNumber *) ((char *) plans +
9085  (xlrec->nplans *
9086  sizeof(xl_heap_freeze_plan)));
9087  for (int p = 0; p < xlrec->nplans; p++)
9088  {
9089  HeapTupleFreeze frz;
9090 
9091  /*
9092  * Convert freeze plan representation from WAL record into
9093  * per-tuple format used by heap_execute_freeze_tuple
9094  */
9095  frz.xmax = plans[p].xmax;
9096  frz.t_infomask2 = plans[p].t_infomask2;
9097  frz.t_infomask = plans[p].t_infomask;
9098  frz.frzflags = plans[p].frzflags;
9099  frz.offset = InvalidOffsetNumber; /* unused, but be tidy */
9100 
9101  for (int i = 0; i < plans[p].ntuples; i++)
9102  {
9103  OffsetNumber offset = offsets[curoff++];
9104  ItemId lp;
9105  HeapTupleHeader tuple;
9106 
9107  lp = PageGetItemId(page, offset);
9108  tuple = (HeapTupleHeader) PageGetItem(page, lp);
9109  heap_execute_freeze_tuple(tuple, &frz);
9110  }
9111  }
9112 
9113  PageSetLSN(page, lsn);
9114  MarkBufferDirty(buffer);
9115  }
9116  if (BufferIsValid(buffer))
9117  UnlockReleaseBuffer(buffer);
9118 }
9119 
9120 /*
9121  * Given an "infobits" field from an XLog record, set the correct bits in the
9122  * given infomask and infomask2 for the tuple touched by the record.
9123  *
9124  * (This is the reverse of compute_infobits).
9125  */
9126 static void
9127 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
9128 {
9129  *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
9131  *infomask2 &= ~HEAP_KEYS_UPDATED;
9132 
9133  if (infobits & XLHL_XMAX_IS_MULTI)
9134  *infomask |= HEAP_XMAX_IS_MULTI;
9135  if (infobits & XLHL_XMAX_LOCK_ONLY)
9136  *infomask |= HEAP_XMAX_LOCK_ONLY;
9137  if (infobits & XLHL_XMAX_EXCL_LOCK)
9138  *infomask |= HEAP_XMAX_EXCL_LOCK;
9139  /* note HEAP_XMAX_SHR_LOCK isn't considered here */
9140  if (infobits & XLHL_XMAX_KEYSHR_LOCK)
9141  *infomask |= HEAP_XMAX_KEYSHR_LOCK;
9142 
9143  if (infobits & XLHL_KEYS_UPDATED)
9144  *infomask2 |= HEAP_KEYS_UPDATED;
9145 }
9146 
9147 static void
9149 {
9150  XLogRecPtr lsn = record->EndRecPtr;
9151  xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
9152  Buffer buffer;
9153  Page page;
9154  ItemId lp = NULL;
9155  HeapTupleHeader htup;
9156  BlockNumber blkno;
9157  RelFileLocator target_locator;
9158  ItemPointerData target_tid;
9159 
9160  XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
9161  ItemPointerSetBlockNumber(&target_tid, blkno);
9162  ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
9163 
9164  /*
9165  * The visibility map may need to be fixed even if the heap page is
9166  * already up-to-date.
9167  */
9169  {
9170  Relation reln = CreateFakeRelcacheEntry(target_locator);
9171  Buffer vmbuffer = InvalidBuffer;
9172 
9173  visibilitymap_pin(reln, blkno, &vmbuffer);
9174  visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
9175  ReleaseBuffer(vmbuffer);
9176  FreeFakeRelcacheEntry(reln);
9177  }
9178 
9179  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9180  {
9181  page = BufferGetPage(buffer);
9182 
9183  if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
9184  lp = PageGetItemId(page, xlrec->offnum);
9185 
9186  if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
9187  elog(PANIC, "invalid lp");
9188 
9189  htup = (HeapTupleHeader) PageGetItem(page, lp);
9190 
9191  htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9192  htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9195  &htup->t_infomask, &htup->t_infomask2);
9196  if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
9197  HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9198  else
9200  HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9201 
9202  /* Mark the page as a candidate for pruning */
9203  PageSetPrunable(page, XLogRecGetXid(record));
9204 
9206  PageClearAllVisible(page);
9207 
9208  /* Make sure t_ctid is set correctly */
9209  if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
9211  else
9212  htup->t_ctid = target_tid;
9213  PageSetLSN(page, lsn);
9214  MarkBufferDirty(buffer);
9215  }
9216  if (BufferIsValid(buffer))
9217  UnlockReleaseBuffer(buffer);
9218 }
9219 
9220 static void
9222 {
9223  XLogRecPtr lsn = record->EndRecPtr;
9224  xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
9225  Buffer buffer;
9226  Page page;
9227  union
9228  {
9229  HeapTupleHeaderData hdr;
9230  char data[MaxHeapTupleSize];
9231  } tbuf;
9232  HeapTupleHeader htup;
9233  xl_heap_header xlhdr;
9234  uint32 newlen;
9235  Size freespace = 0;
9236  RelFileLocator target_locator;
9237  BlockNumber blkno;
9238  ItemPointerData target_tid;
9240 
9241  XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
9242  ItemPointerSetBlockNumber(&target_tid, blkno);
9243  ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
9244 
9245  /*
9246  * The visibility map may need to be fixed even if the heap page is
9247  * already up-to-date.
9248  */
9250  {
9251  Relation reln = CreateFakeRelcacheEntry(target_locator);
9252  Buffer vmbuffer = InvalidBuffer;
9253 
9254  visibilitymap_pin(reln, blkno, &vmbuffer);
9255  visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
9256  ReleaseBuffer(vmbuffer);
9257  FreeFakeRelcacheEntry(reln);
9258  }
9259 
9260  /*
9261  * If we inserted the first and only tuple on the page, re-initialize the
9262  * page from scratch.
9263  */
9264  if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
9265  {
9266  buffer = XLogInitBufferForRedo(record, 0);
9267  page = BufferGetPage(buffer);
9268  PageInit(page, BufferGetPageSize(buffer), 0);
9270  }
9271  else
9272  action = XLogReadBufferForRedo(record, 0, &buffer);
9273  if (action == BLK_NEEDS_REDO)
9274  {
9275  Size datalen;
9276  char *data;
9277 
9278  page = BufferGetPage(buffer);
9279 
9280  if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
9281  elog(PANIC, "invalid max offset number");
9282 
9283  data = XLogRecGetBlockData(record, 0, &datalen);
9284 
9285  newlen = datalen - SizeOfHeapHeader;
9286  Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
9287  memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
9289 
9290  htup = &tbuf.hdr;
9291  MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9292  /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
9293  memcpy((char *) htup + SizeofHeapTupleHeader,
9294  data,
9295  newlen);
9296  newlen += SizeofHeapTupleHeader;
9297  htup->t_infomask2 = xlhdr.t_infomask2;
9298  htup->t_infomask = xlhdr.t_infomask;
9299  htup->t_hoff = xlhdr.t_hoff;
9300  HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9302  htup->t_ctid = target_tid;
9303 
9304  if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
9305  true, true) == InvalidOffsetNumber)
9306  elog(PANIC, "failed to add tuple");
9307 
9308  freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9309 
9310  PageSetLSN(page, lsn);
9311 
9313  PageClearAllVisible(page);
9314 
9315  /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
9316  if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
9317  PageSetAllVisible(page);
9318 
9319  MarkBufferDirty(buffer);
9320  }
9321  if (BufferIsValid(buffer))
9322  UnlockReleaseBuffer(buffer);
9323 
9324  /*
9325  * If the page is running low on free space, update the FSM as well.
9326  * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9327  * better than that without knowing the fill-factor for the table.
9328  *
9329  * XXX: Don't do this if the page was restored from full page image. We
9330  * don't bother to update the FSM in that case, it doesn't need to be
9331  * totally accurate anyway.
9332  */
9333  if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
9334  XLogRecordPageWithFreeSpace(target_locator, blkno, freespace);
9335 }
9336 
9337 /*
9338  * Handles MULTI_INSERT record type.
9339  */
9340 static void
9342 {
9343  XLogRecPtr lsn = record->EndRecPtr;
9344  xl_heap_multi_insert *xlrec;
9345  RelFileLocator rlocator;
9346  BlockNumber blkno;
9347  Buffer buffer;
9348  Page page;
9349  union
9350  {
9351  HeapTupleHeaderData hdr;
9352  char data[MaxHeapTupleSize];
9353  } tbuf;
9354  HeapTupleHeader htup;
9355  uint32 newlen;
9356  Size freespace = 0;
9357  int i;
9358  bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
9360 
9361  /*
9362  * Insertion doesn't overwrite MVCC data, so no conflict processing is
9363  * required.
9364  */
9365  xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
9366 
9367  XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
9368 
9369  /* check that the mutually exclusive flags are not both set */
9371  (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)));
9372 
9373  /*
9374  * The visibility map may need to be fixed even if the heap page is
9375  * already up-to-date.
9376  */
9378  {
9379  Relation reln = CreateFakeRelcacheEntry(rlocator);
9380  Buffer vmbuffer = InvalidBuffer;
9381 
9382  visibilitymap_pin(reln, blkno, &vmbuffer);
9383  visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
9384  ReleaseBuffer(vmbuffer);
9385  FreeFakeRelcacheEntry(reln);
9386  }
9387 
9388  if (isinit)
9389  {
9390  buffer = XLogInitBufferForRedo(record, 0);
9391  page = BufferGetPage(buffer);
9392  PageInit(page, BufferGetPageSize(buffer), 0);
9394  }
9395  else
9396  action = XLogReadBufferForRedo(record, 0, &buffer);
9397  if (action == BLK_NEEDS_REDO)
9398  {
9399  char *tupdata;
9400  char *endptr;
9401  Size len;
9402 
9403  /* Tuples are stored as block data */
9404  tupdata = XLogRecGetBlockData(record, 0, &len);
9405  endptr = tupdata + len;
9406 
9407  page = (Page) BufferGetPage(buffer);
9408 
9409  for (i = 0; i < xlrec->ntuples; i++)
9410  {
9411  OffsetNumber offnum;
9412  xl_multi_insert_tuple *xlhdr;
9413 
9414  /*
9415  * If we're reinitializing the page, the tuples are stored in
9416  * order from FirstOffsetNumber. Otherwise there's an array of
9417  * offsets in the WAL record, and the tuples come after that.
9418  */
9419  if (isinit)
9420  offnum = FirstOffsetNumber + i;
9421  else
9422  offnum = xlrec->offsets[i];
9423  if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9424  elog(PANIC, "invalid max offset number");
9425 
9426  xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
9427  tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
9428 
9429  newlen = xlhdr->datalen;
9430  Assert(newlen <= MaxHeapTupleSize);
9431  htup = &tbuf.hdr;
9432  MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9433  /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
9434  memcpy((char *) htup + SizeofHeapTupleHeader,
9435  (char *) tupdata,
9436  newlen);
9437  tupdata += newlen;
9438 
9439  newlen += SizeofHeapTupleHeader;
9440  htup->t_infomask2 = xlhdr->t_infomask2;
9441  htup->t_infomask = xlhdr->t_infomask;
9442  htup->t_hoff = xlhdr->t_hoff;
9443  HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9445  ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
9446  ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
9447 
9448  offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9449  if (offnum == InvalidOffsetNumber)
9450  elog(PANIC, "failed to add tuple");
9451  }
9452  if (tupdata != endptr)
9453  elog(PANIC, "total tuple length mismatch");
9454 
9455  freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9456 
9457  PageSetLSN(page, lsn);
9458 
9460  PageClearAllVisible(page);
9461 
9462  /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
9463  if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
9464  PageSetAllVisible(page);
9465 
9466  MarkBufferDirty(buffer);
9467  }
9468  if (BufferIsValid(buffer))
9469  UnlockReleaseBuffer(buffer);
9470 
9471  /*
9472  * If the page is running low on free space, update the FSM as well.
9473  * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9474  * better than that without knowing the fill-factor for the table.
9475  *
9476  * XXX: Don't do this if the page was restored from full page image. We
9477  * don't bother to update the FSM in that case, it doesn't need to be
9478  * totally accurate anyway.
9479  */
9480  if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
9481  XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
9482 }
9483 
9484 /*
9485  * Handles UPDATE and HOT_UPDATE
9486  */
9487 static void
9488 heap_xlog_update(XLogReaderState *record, bool hot_update)
9489 {
9490  XLogRecPtr lsn = record->EndRecPtr;
9491  xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
9492  RelFileLocator rlocator;
9493  BlockNumber oldblk;
9494  BlockNumber newblk;
9495  ItemPointerData newtid;
9496  Buffer obuffer,
9497  nbuffer;
9498  Page page;
9499  OffsetNumber offnum;
9500  ItemId lp = NULL;
9501  HeapTupleData oldtup;
9502  HeapTupleHeader htup;
9503  uint16 prefixlen = 0,
9504  suffixlen = 0;
9505  char *newp;
9506  union
9507  {
9508  HeapTupleHeaderData hdr;
9509  char data[MaxHeapTupleSize];
9510  } tbuf;
9511  xl_heap_header xlhdr;
9512  uint32 newlen;
9513  Size freespace = 0;
9514  XLogRedoAction oldaction;
9515  XLogRedoAction newaction;
9516 
9517  /* initialize to keep the compiler quiet */
9518  oldtup.t_data = NULL;
9519  oldtup.t_len = 0;
9520 
9521  XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk);
9522  if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL))
9523  {
9524  /* HOT updates are never done across pages */
9525  Assert(!hot_update);
9526  }
9527  else
9528  oldblk = newblk;
9529 
9530  ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
9531 
9532  /*
9533  * The visibility map may need to be fixed even if the heap page is
9534  * already up-to-date.
9535  */
9537  {
9538  Relation reln = CreateFakeRelcacheEntry(rlocator);
9539  Buffer vmbuffer = InvalidBuffer;
9540 
9541  visibilitymap_pin(reln, oldblk, &vmbuffer);
9542  visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9543  ReleaseBuffer(vmbuffer);
9544  FreeFakeRelcacheEntry(reln);
9545  }
9546 
9547  /*
9548  * In normal operation, it is important to lock the two pages in
9549  * page-number order, to avoid possible deadlocks against other update
9550  * operations going the other way. However, during WAL replay there can
9551  * be no other update happening, so we don't need to worry about that. But
9552  * we *do* need to worry that we don't expose an inconsistent state to Hot
9553  * Standby queries --- so the original page can't be unlocked before we've
9554  * added the new tuple to the new page.
9555  */
9556 
9557  /* Deal with old tuple version */
9558  oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
9559  &obuffer);
9560  if (oldaction == BLK_NEEDS_REDO)
9561  {
9562  page = BufferGetPage(obuffer);
9563  offnum = xlrec->old_offnum;
9564  if (PageGetMaxOffsetNumber(page) >= offnum)
9565  lp = PageGetItemId(page, offnum);
9566 
9567  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9568  elog(PANIC, "invalid lp");
9569 
9570  htup = (HeapTupleHeader) PageGetItem(page, lp);
9571 
9572  oldtup.t_data = htup;
9573  oldtup.t_len = ItemIdGetLength(lp);
9574 
9575  htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9576  htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9577  if (hot_update)
9579  else
9582  &htup->t_infomask2);
9583  HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
9584  HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9585  /* Set forward chain link in t_ctid */
9586  htup->t_ctid = newtid;
9587 
9588  /* Mark the page as a candidate for pruning */
9589  PageSetPrunable(page, XLogRecGetXid(record));
9590 
9592  PageClearAllVisible(page);
9593 
9594  PageSetLSN(page, lsn);
9595  MarkBufferDirty(obuffer);
9596  }
9597 
9598  /*
9599  * Read the page the new tuple goes into, if different from old.
9600  */
9601  if (oldblk == newblk)
9602  {
9603  nbuffer = obuffer;
9604  newaction = oldaction;
9605  }
9606  else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
9607  {
9608  nbuffer = XLogInitBufferForRedo(record, 0);
9609  page = (Page) BufferGetPage(nbuffer);
9610  PageInit(page, BufferGetPageSize(nbuffer), 0);
9611  newaction = BLK_NEEDS_REDO;
9612  }
9613  else
9614  newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
9615 
9616  /*
9617  * The visibility map may need to be fixed even if the heap page is
9618  * already up-to-date.
9619  */
9621  {
9622  Relation reln = CreateFakeRelcacheEntry(rlocator);
9623  Buffer vmbuffer = InvalidBuffer;
9624 
9625  visibilitymap_pin(reln, newblk, &vmbuffer);
9626  visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9627  ReleaseBuffer(vmbuffer);
9628  FreeFakeRelcacheEntry(reln);
9629  }
9630 
9631  /* Deal with new tuple */
9632  if (newaction == BLK_NEEDS_REDO)
9633  {
9634  char *recdata;
9635  char *recdata_end;
9636  Size datalen;
9637  Size tuplen;
9638 
9639  recdata = XLogRecGetBlockData(record, 0, &datalen);
9640  recdata_end = recdata + datalen;
9641 
9642  page = BufferGetPage(nbuffer);
9643 
9644  offnum = xlrec->new_offnum;
9645  if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9646  elog(PANIC, "invalid max offset number");
9647 
9648  if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
9649  {
9650  Assert(newblk == oldblk);
9651  memcpy(&prefixlen, recdata, sizeof(uint16));
9652  recdata += sizeof(uint16);
9653  }
9654  if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
9655  {
9656  Assert(newblk == oldblk);
9657  memcpy(&suffixlen, recdata, sizeof(uint16));
9658  recdata += sizeof(uint16);
9659  }
9660 
9661  memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
9662  recdata += SizeOfHeapHeader;
9663 
9664  tuplen = recdata_end - recdata;
9665  Assert(tuplen <= MaxHeapTupleSize);
9666 
9667  htup = &tbuf.hdr;
9668  MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9669 
9670  /*
9671  * Reconstruct the new tuple using the prefix and/or suffix from the
9672  * old tuple, and the data stored in the WAL record.
9673  */
9674  newp = (char *) htup + SizeofHeapTupleHeader;
9675  if (prefixlen > 0)
9676  {
9677  int len;
9678 
9679  /* copy bitmap [+ padding] [+ oid] from WAL record */
9680  len = xlhdr.t_hoff - SizeofHeapTupleHeader;
9681  memcpy(newp, recdata, len);
9682  recdata += len;
9683  newp += len;
9684 
9685  /* copy prefix from old tuple */
9686  memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
9687  newp += prefixlen;
9688 
9689  /* copy new tuple data from WAL record */
9690  len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
9691  memcpy(newp, recdata, len);
9692  recdata += len;
9693  newp += len;
9694  }
9695  else
9696  {
9697  /*
9698  * copy bitmap [+ padding] [+ oid] + data from record, all in one
9699  * go
9700  */
9701  memcpy(newp, recdata, tuplen);
9702  recdata += tuplen;
9703  newp += tuplen;
9704  }
9705  Assert(recdata == recdata_end);
9706 
9707  /* copy suffix from old tuple */
9708  if (suffixlen > 0)
9709  memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
9710 
9711  newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
9712  htup->t_infomask2 = xlhdr.t_infomask2;
9713  htup->t_infomask = xlhdr.t_infomask;
9714  htup->t_hoff = xlhdr.t_hoff;
9715 
9716  HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9718  HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
9719  /* Make sure there is no forward chain link in t_ctid */
9720  htup->t_ctid = newtid;
9721 
9722  offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9723  if (offnum == InvalidOffsetNumber)
9724  elog(PANIC, "failed to add tuple");
9725 
9727  PageClearAllVisible(page);
9728 
9729  freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9730 
9731  PageSetLSN(page, lsn);
9732  MarkBufferDirty(nbuffer);
9733  }
9734 
9735  if (BufferIsValid(nbuffer) && nbuffer != obuffer)
9736  UnlockReleaseBuffer(nbuffer);
9737  if (BufferIsValid(obuffer))
9738  UnlockReleaseBuffer(obuffer);
9739 
9740  /*
9741  * If the new page is running low on free space, update the FSM as well.
9742  * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9743  * better than that without knowing the fill-factor for the table.
9744  *
9745  * However, don't update the FSM on HOT updates, because after crash
9746  * recovery, either the old or the new tuple will certainly be dead and
9747  * prunable. After pruning, the page will have roughly as much free space
9748  * as it did before the update, assuming the new tuple is about the same
9749  * size as the old one.
9750  *
9751  * XXX: Don't do this if the page was restored from full page image. We
9752  * don't bother to update the FSM in that case, it doesn't need to be
9753  * totally accurate anyway.
9754  */
9755  if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
9756  XLogRecordPageWithFreeSpace(rlocator, newblk, freespace);
9757 }
9758 
9759 static void
9761 {
9762  XLogRecPtr lsn = record->EndRecPtr;
9763  xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
9764  Buffer buffer;
9765  Page page;
9766  OffsetNumber offnum;
9767  ItemId lp = NULL;
9768  HeapTupleHeader htup;
9769 
9770  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9771  {
9772  page = BufferGetPage(buffer);
9773 
9774  offnum = xlrec->offnum;
9775  if (PageGetMaxOffsetNumber(page) >= offnum)
9776  lp = PageGetItemId(page, offnum);
9777 
9778  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9779  elog(PANIC, "invalid lp");
9780 
9781  htup = (HeapTupleHeader) PageGetItem(page, lp);
9782 
9783  /*
9784  * Confirm tuple as actually inserted
9785  */
9786  ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
9787 
9788  PageSetLSN(page, lsn);
9789  MarkBufferDirty(buffer);
9790  }
9791  if (BufferIsValid(buffer))
9792  UnlockReleaseBuffer(buffer);
9793 }
9794 
9795 static void
9797 {
9798  XLogRecPtr lsn = record->EndRecPtr;
9799  xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
9800  Buffer buffer;
9801  Page page;
9802  OffsetNumber offnum;
9803  ItemId lp = NULL;
9804  HeapTupleHeader htup;
9805 
9806  /*
9807  * The visibility map may need to be fixed even if the heap page is
9808  * already up-to-date.
9809  */
9810  if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9811  {
9812  RelFileLocator rlocator;
9813  Buffer vmbuffer = InvalidBuffer;
9814  BlockNumber block;
9815  Relation reln;
9816 
9817  XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
9818  reln = CreateFakeRelcacheEntry(rlocator);
9819 
9820  visibilitymap_pin(reln, block, &vmbuffer);
9821  visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9822 
9823  ReleaseBuffer(vmbuffer);
9824  FreeFakeRelcacheEntry(reln);
9825  }
9826 
9827  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9828  {
9829  page = (Page) BufferGetPage(buffer);
9830 
9831  offnum = xlrec->offnum;
9832  if (PageGetMaxOffsetNumber(page) >= offnum)
9833  lp = PageGetItemId(page, offnum);
9834 
9835  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9836  elog(PANIC, "invalid lp");
9837 
9838  htup = (HeapTupleHeader) PageGetItem(page, lp);
9839 
9840  htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9841  htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9843  &htup->t_infomask2);
9844 
9845  /*
9846  * Clear relevant update flags, but only if the modified infomask says
9847  * there's no update.
9848  */
9850  {
9852  /* Make sure there is no forward chain link in t_ctid */
9853  ItemPointerSet(&htup->t_ctid,
9854  BufferGetBlockNumber(buffer),
9855  offnum);
9856  }
9857  HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9858  HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9859  PageSetLSN(page, lsn);
9860  MarkBufferDirty(buffer);
9861  }
9862  if (BufferIsValid(buffer))
9863  UnlockReleaseBuffer(buffer);
9864 }
9865 
9866 static void
9868 {
9869  XLogRecPtr lsn = record->EndRecPtr;
9870  xl_heap_lock_updated *xlrec;
9871  Buffer buffer;
9872  Page page;
9873  OffsetNumber offnum;
9874  ItemId lp = NULL;
9875  HeapTupleHeader htup;
9876 
9877  xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
9878 
9879  /*
9880  * The visibility map may need to be fixed even if the heap page is
9881  * already up-to-date.
9882  */
9883  if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9884  {
9885  RelFileLocator rlocator;
9886  Buffer vmbuffer = InvalidBuffer;
9887  BlockNumber block;
9888  Relation reln;
9889 
9890  XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
9891  reln = CreateFakeRelcacheEntry(rlocator);
9892 
9893  visibilitymap_pin(reln, block, &vmbuffer);
9894  visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9895 
9896  ReleaseBuffer(vmbuffer);
9897  FreeFakeRelcacheEntry(reln);
9898  }
9899 
9900  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9901  {
9902  page = BufferGetPage(buffer);
9903 
9904  offnum = xlrec->offnum;
9905  if (PageGetMaxOffsetNumber(page) >= offnum)
9906  lp = PageGetItemId(page, offnum);
9907 
9908  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9909  elog(PANIC, "invalid lp");
9910 
9911  htup = (HeapTupleHeader) PageGetItem(page, lp);
9912 
9913  htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9914  htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9916  &htup->t_infomask2);
9917  HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9918 
9919  PageSetLSN(page, lsn);
9920  MarkBufferDirty(buffer);
9921  }
9922  if (BufferIsValid(buffer))
9923  UnlockReleaseBuffer(buffer);
9924 }
9925 
9926 static void
9928 {
9929  XLogRecPtr lsn = record->EndRecPtr;
9930  xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
9931  Buffer buffer;
9932  Page page;
9933  OffsetNumber offnum;
9934  ItemId lp = NULL;
9935  HeapTupleHeader htup;
9936  uint32 oldlen;
9937  Size newlen;
9938 
9939  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9940  {
9941  char *newtup = XLogRecGetBlockData(record, 0, &newlen);
9942 
9943  page = BufferGetPage(buffer);
9944 
9945  offnum = xlrec->offnum;
9946  if (PageGetMaxOffsetNumber(page) >= offnum)
9947  lp = PageGetItemId(page, offnum);
9948 
9949  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9950  elog(PANIC, "invalid lp");
9951 
9952  htup = (HeapTupleHeader) PageGetItem(page, lp);
9953 
9954  oldlen = ItemIdGetLength(lp) - htup->t_hoff;
9955  if (oldlen != newlen)
9956  elog(PANIC, "wrong tuple length");
9957 
9958  memcpy((char *) htup + htup->t_hoff, newtup, newlen);
9959 
9960  PageSetLSN(page, lsn);
9961  MarkBufferDirty(buffer);
9962  }
9963  if (BufferIsValid(buffer))
9964  UnlockReleaseBuffer(buffer);
9965 }
9966 
9967 void
9969 {
9970  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9971 
9972  /*
9973  * These operations don't overwrite MVCC data so no conflict processing is
9974  * required. The ones in heap2 rmgr do.
9975  */
9976 
9977  switch (info & XLOG_HEAP_OPMASK)
9978  {
9979  case XLOG_HEAP_INSERT:
9980  heap_xlog_insert(record);
9981  break;
9982  case XLOG_HEAP_DELETE:
9983  heap_xlog_delete(record);
9984  break;
9985  case XLOG_HEAP_UPDATE:
9986  heap_xlog_update(record, false);
9987  break;
9988  case XLOG_HEAP_TRUNCATE:
9989 
9990  /*
9991  * TRUNCATE is a no-op because the actions are already logged as
9992  * SMGR WAL records. TRUNCATE WAL record only exists for logical
9993  * decoding.
9994  */
9995  break;
9996  case XLOG_HEAP_HOT_UPDATE:
9997  heap_xlog_update(record, true);
9998  break;
9999  case XLOG_HEAP_CONFIRM:
10000  heap_xlog_confirm(record);
10001  break;
10002  case XLOG_HEAP_LOCK:
10003  heap_xlog_lock(record);
10004  break;
10005  case XLOG_HEAP_INPLACE:
10006  heap_xlog_inplace(record);
10007  break;
10008  default:
10009  elog(PANIC, "heap_redo: unknown op code %u", info);
10010  }
10011 }
10012 
10013 void
10015 {
10016  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
10017 
10018  switch (info & XLOG_HEAP_OPMASK)
10019  {
10020  case XLOG_HEAP2_PRUNE:
10021  heap_xlog_prune(record);
10022  break;
10023  case XLOG_HEAP2_VACUUM:
10024  heap_xlog_vacuum(record);
10025  break;
10027  heap_xlog_freeze_page(record);
10028  break;
10029  case XLOG_HEAP2_VISIBLE:
10030  heap_xlog_visible(record);
10031  break;
10033  heap_xlog_multi_insert(record);
10034  break;
10036  heap_xlog_lock_updated(record);
10037  break;
10038  case XLOG_HEAP2_NEW_CID:
10039 
10040  /*
10041  * Nothing to do on a real replay, only used during logical
10042  * decoding.
10043  */
10044  break;
10045  case XLOG_HEAP2_REWRITE:
10046  heap_xlog_logical_rewrite(record);
10047  break;
10048  default:
10049  elog(PANIC, "heap2_redo: unknown op code %u", info);
10050  }
10051 }
10052 
10053 /*
10054  * Mask a heap page before performing consistency checks on it.
10055  */
10056 void
10057 heap_mask(char *pagedata, BlockNumber blkno)
10058 {
10059  Page page = (Page) pagedata;
10060  OffsetNumber off;
10061 
10063 
10064  mask_page_hint_bits(page);
10065  mask_unused_space(page);
10066 
10067  for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
10068  {
10069  ItemId iid = PageGetItemId(page, off);
10070  char *page_item;
10071 
10072  page_item = (char *) (page + ItemIdGetOffset(iid));
10073 
10074  if (ItemIdIsNormal(iid))
10075  {
10076  HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
10077 
10078  /*
10079  * If xmin of a tuple is not yet frozen, we should ignore
10080  * differences in hint bits, since they can be set without
10081  * emitting WAL.
10082  */
10083  if (!HeapTupleHeaderXminFrozen(page_htup))
10084  page_htup->t_infomask &= ~HEAP_XACT_MASK;
10085  else
10086  {
10087  /* Still we need to mask xmax hint bits. */
10088  page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
10089  page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
10090  }
10091 
10092  /*
10093  * During replay, we set Command Id to FirstCommandId. Hence, mask
10094  * it. See heap_xlog_insert() for details.
10095  */
10096  page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
10097 
10098  /*
10099  * For a speculative tuple, heap_insert() does not set ctid in the
10100  * caller-passed heap tuple itself, leaving the ctid field to
10101  * contain a speculative token value - a per-backend monotonically
10102  * increasing identifier. Besides, it does not WAL-log ctid under
10103  * any circumstances.
10104  *
10105  * During redo, heap_xlog_insert() sets t_ctid to current block
10106  * number and self offset number. It doesn't care about any
10107  * speculative insertions on the primary. Hence, we set t_ctid to
10108  * current block number and self offset number to ignore any
10109  * inconsistency.
10110  */
10111  if (HeapTupleHeaderIsSpeculative(page_htup))
10112  ItemPointerSet(&page_htup->t_ctid, blkno, off);
10113 
10114  /*
10115  * NB: Not ignoring ctid changes due to the tuple having moved
10116  * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
10117  * important information that needs to be in-sync between primary
10118  * and standby, and thus is WAL logged.
10119  */
10120  }
10121 
10122  /*
10123  * Ignore any padding bytes after the tuple, when the length of the
10124  * item is not MAXALIGNed.
10125  */
10126  if (ItemIdHasStorage(iid))
10127  {
10128  int len = ItemIdGetLength(iid);
10129  int padlen = MAXALIGN(len) - len;
10130 
10131  if (padlen > 0)
10132  memset(page_item + len, MASK_MARKER, padlen);
10133  }
10134  }
10135 }
10136 
10137 /*
10138  * HeapCheckForSerializableConflictOut
10139  * We are reading a tuple. If it's not visible, there may be a
10140  * rw-conflict out with the inserter. Otherwise, if it is visible to us
10141  * but has been deleted, there may be a rw-conflict out with the deleter.
10142  *
10143  * We will determine the top level xid of the writing transaction with which
10144  * we may be in conflict, and ask CheckForSerializableConflictOut() to check
10145  * for overlap with our own transaction.
10146  *
10147  * This function should be called just about anywhere in heapam.c where a
10148  * tuple has been read. The caller must hold at least a shared lock on the
10149  * buffer, because this function might set hint bits on the tuple. There is
10150  * currently no known reason to call this function from an index AM.
10151  */
10152 void
10154  HeapTuple tuple, Buffer buffer,
10155  Snapshot snapshot)
10156 {
10157  TransactionId xid;
10158  HTSV_Result htsvResult;
10159 
10160  if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
10161  return;
10162 
10163  /*
10164  * Check to see whether the tuple has been written to by a concurrent
10165  * transaction, either to create it not visible to us, or to delete it
10166  * while it is visible to us. The "visible" bool indicates whether the
10167  * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
10168  * is going on with it.
10169  *
10170  * In the event of a concurrently inserted tuple that also happens to have
10171  * been concurrently updated (by a separate transaction), the xmin of the
10172  * tuple will be used -- not the updater's xid.
10173  */
10174  htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
10175  switch (htsvResult)
10176  {
10177  case HEAPTUPLE_LIVE:
10178  if (visible)
10179  return;
10180  xid = HeapTupleHeaderGetXmin(tuple->t_data);
10181  break;
10184  if (visible)
10185  xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
10186  else
10187  xid = HeapTupleHeaderGetXmin(tuple->t_data);
10188 
10190  {
10191  /* This is like the HEAPTUPLE_DEAD case */
10192  Assert(!visible);
10193  return;
10194  }
10195  break;
10197  xid = HeapTupleHeaderGetXmin(tuple->t_data);
10198  break;
10199  case HEAPTUPLE_DEAD:
10200  Assert(!visible);
10201  return;
10202  default:
10203 
10204  /*
10205  * The only way to get to this default clause is if a new value is
10206  * added to the enum type without adding it to this switch
10207  * statement. That's a bug, so elog.
10208  */
10209  elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
10210 
10211  /*
10212  * In spite of having all enum values covered and calling elog on
10213  * this default, some compilers think this is a code path which
10214  * allows xid to be used below without initialization. Silence
10215  * that warning.
10216  */
10217  xid = InvalidTransactionId;
10218  }
10219 
10222 
10223  /*
10224  * Find top level xid. Bail out if xid is too early to be a conflict, or
10225  * if it's our own xid.
10226  */
10228  return;
10229  xid = SubTransGetTopmostTransaction(xid);
10231  return;
10232 
10233  CheckForSerializableConflictOut(relation, xid, snapshot);
10234 }
int16 AttrNumber
Definition: attnum.h:21
int bms_next_member(const Bitmapset *a, int prevbit)
Definition: bitmapset.c:1306
void bms_free(Bitmapset *a)
Definition: bitmapset.c:239
bool bms_is_member(int x, const Bitmapset *a)
Definition: bitmapset.c:510
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition: bitmapset.c:815
Bitmapset * bms_add_members(Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:917
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition: bitmapset.c:582
#define bms_is_empty(a)
Definition: bitmapset.h:118
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static int32 next
Definition: blutils.c:221
static Datum values[MAXATTR]
Definition: bootstrap.c:152
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void mask_page_lsn_and_checksum(Page page)
Definition: bufmask.c:31
void mask_unused_space(Page page)
Definition: bufmask.c:71
void mask_page_hint_bits(Page page)
Definition: bufmask.c:46
#define MASK_MARKER
Definition: bufmask.h:24
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3377
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:627
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4560
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4577
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2189
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4795
int maintenance_io_concurrency
Definition: bufmgr.c:153
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:781
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:734
@ BAS_BULKREAD
Definition: bufmgr.h:35
@ BAS_BULKWRITE
Definition: bufmgr.h:37
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:157
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:158
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:229
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:339
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:159
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:49
@ RBM_NORMAL
Definition: bufmgr.h:44
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
Size PageGetHeapFreeSpace(Page page)
Definition: bufpage.c:991
void PageTruncateLinePointerArray(Page page)
Definition: bufpage.c:835
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
Pointer Page
Definition: bufpage.h:78
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static void PageClearAllVisible(Page page)
Definition: bufpage.h:436
#define SizeOfPageHeaderData
Definition: bufpage.h:213
static void PageSetAllVisible(Page page)
Definition: bufpage.h:431
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static bool PageIsNew(Page page)
Definition: bufpage.h:230
static bool PageIsAllVisible(Page page)
Definition: bufpage.h:426
static void PageSetFull(Page page)
Definition: bufpage.h:415
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
#define PageSetPrunable(page, xid)
Definition: bufpage.h:444
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:468
#define InvalidCommandId
Definition: c.h:656
unsigned short uint16
Definition: c.h:492
unsigned int uint32
Definition: c.h:493
#define Min(x, y)
Definition: c.h:991
signed short int16
Definition: c.h:480
#define MAXALIGN(LEN)
Definition: c.h:798
TransactionId MultiXactId
Definition: c.h:649
#define FirstCommandId
Definition: c.h:655
#define SHORTALIGN(LEN)
Definition: c.h:794
#define pg_unreachable()
Definition: c.h:283
#define unlikely(x)
Definition: c.h:298
#define lengthof(array)
Definition: c.h:775
unsigned char uint8
Definition: c.h:491
#define MemSet(start, val, len)
Definition: c.h:1007
#define StaticAssertDecl(condition, errmessage)
Definition: c.h:923
uint32 CommandId
Definition: c.h:653
uint32 TransactionId
Definition: c.h:639
size_t Size
Definition: c.h:592
bool IsToastRelation(Relation relation)
Definition: catalog.c:145
bool IsCatalogRelation(Relation relation)
Definition: catalog.c:103
void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, CommandId *cmax, bool *iscombo)
Definition: combocid.c:153
CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup)
Definition: combocid.c:104
CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup)
Definition: combocid.c:118
bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen)
Definition: datum.c:223
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1159
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1232
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define WARNING
Definition: elog.h:36
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
TupleTableSlot * ExecStoreBufferHeapTuple(HeapTuple tuple, TupleTableSlot *slot, Buffer buffer)
Definition: execTuples.c:1391
HeapTuple ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree)
Definition: execTuples.c:1643
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:639
void XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:199
int NBuffers
Definition: globals.c:139
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
Definition: heapam.c:7122
void heap_finish_speculative(Relation relation, ItemPointer tid)
Definition: heapam.c:5649
void heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
Definition: heapam.c:1824
static void heap_xlog_prune(XLogReaderState *record)
Definition: heapam.c:8761
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup)
Definition: heapam.c:8586
XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
Definition: heapam.c:8330
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
Definition: heapam.c:4887
void heap_redo(XLogReaderState *record)
Definition: heapam.c:9968
struct IndexDeleteCounts IndexDeleteCounts
static int heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, xl_heap_freeze_plan *plans_out, OffsetNumber *offsets_out)
Definition: heapam.c:6872
bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
Definition: heapam.c:1341
static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, ItemPointer ctid, XLTW_Oper oper, int *remaining)
Definition: heapam.c:7221
#define BOTTOMUP_TOLERANCE_NBLOCKS
Definition: heapam.c:188
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
Definition: heapam.c:2013
int updstatus
Definition: heapam.c:129
static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
Definition: heapam.c:8201
void heap_mask(char *pagedata, BlockNumber blkno)
Definition: heapam.c:10057
static int heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
Definition: heapam.c:2061
void simple_heap_delete(Relation relation, ItemPointer tid)
Definition: heapam.c:2934
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
Definition: heapam.c:6973
TM_Result heap_delete(Relation relation, ItemPointer tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
Definition: heapam.c:2513
void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
Definition: heapam.c:7498
bool heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition: heapam.c:1131
#define LOCKMODE_from_mxstatus(status)
Definition: heapam.c:157
void heap_endscan(TableScanDesc sscan)
Definition: heapam.c:1049
static void heap_xlog_insert(XLogReaderState *record)
Definition: heapam.c:9221
TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple)
Definition: heapam.c:7106
static void index_delete_check_htid(TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, ItemPointer htid, TM_IndexStatus *istatus)
Definition: heapam.c:7583
#define FRM_RETURN_IS_XID
Definition: heapam.c:5974
#define TUPLOCK_from_mxstatus(status)
Definition: heapam.c:216
void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
Definition: heapam.c:1012
static int index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
Definition: heapam.c:7952
TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
Definition: heapam.c:2980
#define ConditionalLockTupleTuplock(rel, tup, mode)
Definition: heapam.c:169
static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
Definition: heapam.c:9127
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
Definition: heapam.c:7335
static TransactionId FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
Definition: heapam.c:6025
static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy)
Definition: heapam.c:8667
void heap_inplace_update(Relation relation, HeapTuple tuple)
Definition: heapam.c:5889
#define LockTupleTuplock(rel, tup, mode)
Definition: heapam.c:165
bool heap_tuple_should_freeze(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
Definition: heapam.c:7390
static void heap_xlog_update(XLogReaderState *record, bool hot_update)
Definition: heapam.c:9488
static void heap_xlog_vacuum(XLogReaderState *record)
Definition: heapam.c:8849
bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
Definition: heapam.c:6929
static BlockNumber heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
Definition: heapam.c:629
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
Definition: heapam.c:7054
static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
Definition: heapam.c:4838
static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, TransactionId xid, LockTupleMode mode)
Definition: heapam.c:5604
#define BOTTOMUP_MAX_NBLOCKS
Definition: heapam.c:187
static int heap_log_freeze_cmp(const void *arg1, const void *arg2)
Definition: heapam.c:6788
void ReleaseBulkInsertStatePin(BulkInsertState bistate)
Definition: heapam.c:1786
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, ItemPointer ctid, XLTW_Oper oper, int *remaining)
Definition: heapam.c:7299
#define FRM_MARK_COMMITTED
Definition: heapam.c:5976
#define FRM_NOOP
Definition: heapam.c:5972
HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction)
Definition: heapam.c:1082
bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
Definition: heapam.c:1461
static void heap_log_freeze_new_plan(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz)
Definition: heapam.c:6852
void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
Definition: heapam.c:4051
int lockstatus
Definition: heapam.c:128
static void heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
Definition: heapam.c:6649
bool heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition: heapam.c:1234
static void heap_xlog_delete(XLogReaderState *record)
Definition: heapam.c:9148
void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
Definition: heapam.c:1161
static void heap_xlog_lock_updated(XLogReaderState *record)
Definition: heapam.c:9867
static void heap_xlog_lock(XLogReaderState *record)
Definition: heapam.c:9796
static void heap_xlog_multi_insert(XLogReaderState *record)
Definition: heapam.c:9341
void heap_abort_speculative(Relation relation, ItemPointer tid)
Definition: heapam.c:5736
static void heap_xlog_visible(XLogReaderState *record)
Definition: heapam.c:8924
TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
Definition: heapam.c:927
static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition: heapam.c:720
static Page heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition: heapam.c:583
static uint8 compute_infobits(uint16 infomask, uint16 infomask2)
Definition: heapam.c:2468
void heap_freeze_execute_prepared(Relation rel, Buffer buffer, TransactionId snapshotConflictHorizon, HeapTupleFreeze *tuples, int ntuples)
Definition: heapam.c:6678
#define FRM_RETURN_IS_MULTI
Definition: heapam.c:5975
static void heap_xlog_inplace(XLogReaderState *record)
Definition: heapam.c:9927
LOCKMODE hwlock
Definition: heapam.c:127
#define FRM_INVALIDATE_XMAX
Definition: heapam.c:5973
static const struct @12 tupleLockExtraInfo[MaxLockTupleMode+1]
static void heap_xlog_confirm(XLogReaderState *record)
Definition: heapam.c:9760
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining)
Definition: heapam.c:7321
static bool heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
Definition: heapam.c:3910
static void index_delete_sort(TM_IndexDeleteOp *delstate)
Definition: heapam.c:7988
static Bitmapset * HeapDetermineColumnsInfo(Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
Definition: heapam.c:3961
static const int MultiXactStatusLock[MaxMultiXactStatus+1]
Definition: heapam.c:205
void simple_heap_insert(Relation relation, HeapTuple tup)
Definition: heapam.c:2455
static bool xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
Definition: heapam.c:2490
void heap2_redo(XLogReaderState *record)
Definition: heapam.c:10014
#define UnlockTupleTuplock(rel, tup, mode)
Definition: heapam.c:167
static bool heap_log_freeze_eq(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz)
Definition: heapam.c:6835
static TM_Result test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
Definition: heapam.c:5168
bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
Definition: heapam.c:6375
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
Definition: heapam.c:8364
static BlockNumber heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
Definition: heapam.c:489
TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: heapam.c:7643
void heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
Definition: heapam.c:2093
static void initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
Definition: heapam.c:229
static int bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
Definition: heapam.c:8085
static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition: heapam.c:835
TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
Definition: heapam.c:4140
static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
Definition: heapam.c:1735
static int bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
Definition: heapam.c:8128
void heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid)
Definition: heapam.c:1613
void heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
Definition: heapam.c:350
void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
Definition: heapam.c:10153
void heapgetpage(TableScanDesc sscan, BlockNumber block)
Definition: heapam.c:373
static Page heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition: heapam.c:552
static MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
Definition: heapam.c:4092
BulkInsertState GetBulkInsertState(void)
Definition: heapam.c:1757
void FreeBulkInsertState(BulkInsertState bistate)
Definition: heapam.c:1774
static TM_Result heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, LockTupleMode mode)
Definition: heapam.c:5259
static void heap_xlog_freeze_page(XLogReaderState *record)
Definition: heapam.c:9056
#define HEAP_INSERT_SPECULATIVE
Definition: heapam.h:37
#define HEAP_FREEZE_CHECK_XMAX_ABORTED
Definition: heapam.h:108
struct HeapScanDescData * HeapScanDesc
Definition: heapam.h:80
HTSV_Result
Definition: heapam.h:95
@ HEAPTUPLE_RECENTLY_DEAD
Definition: heapam.h:98
@ HEAPTUPLE_INSERT_IN_PROGRESS
Definition: heapam.h:99
@ HEAPTUPLE_LIVE
Definition: heapam.h:97
@ HEAPTUPLE_DELETE_IN_PROGRESS
Definition: heapam.h:100
@ HEAPTUPLE_DEAD
Definition: heapam.h:96
#define HEAP_INSERT_FROZEN
Definition: heapam.h:35
#define HEAP_FREEZE_CHECK_XMIN_COMMITTED
Definition: heapam.h:107
#define HEAP_INSERT_NO_LOGICAL
Definition: heapam.h:36
#define MaxLockTupleMode
Definition: heapam.h:43
struct BulkInsertStateData * BulkInsertState
Definition: heapam.h:39
const TableAmRoutine * GetHeapamTableAmRoutine(void)
void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid)
bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
bool HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest)
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer)
#define XLH_INSERT_ON_TOAST_RELATION
Definition: heapam_xlog.h:70
#define XLOG_HEAP2_PRUNE
Definition: heapam_xlog.h:54
#define SizeOfHeapMultiInsert
Definition: heapam_xlog.h:182
#define XLOG_HEAP2_MULTI_INSERT
Definition: heapam_xlog.h:58
#define SizeOfHeapUpdate
Definition: heapam_xlog.h:227
#define XLH_INVALID_XVAC
Definition: heapam_xlog.h:324
#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED
Definition: heapam_xlog.h:81
#define SizeOfHeapVisible
Definition: heapam_xlog.h:371
#define XLOG_HEAP_HOT_UPDATE
Definition: heapam_xlog.h:36
#define XLOG_HEAP_DELETE
Definition: heapam_xlog.h:33
#define XLOG_HEAP2_VACUUM
Definition: heapam_xlog.h:55
#define XLH_INSERT_IS_SPECULATIVE
Definition: heapam_xlog.h:68
#define XLOG_HEAP2_REWRITE
Definition: heapam_xlog.h:53
#define XLH_LOCK_ALL_FROZEN_CLEARED
Definition: heapam_xlog.h:277
#define XLH_DELETE_CONTAINS_OLD_KEY
Definition: heapam_xlog.h:98
#define SizeOfHeapInplace
Definition: heapam_xlog.h:316
#define XLOG_HEAP_TRUNCATE
Definition: heapam_xlog.h:35
#define XLH_UPDATE_CONTAINS_NEW_TUPLE
Definition: heapam_xlog.h:84
#define XLH_INSERT_LAST_IN_MULTI
Definition: heapam_xlog.h:67
#define SizeOfHeapFreezePage
Definition: heapam_xlog.h:357
#define XLH_INSERT_ALL_FROZEN_SET
Definition: heapam_xlog.h:73
#define XLOG_HEAP_OPMASK
Definition: heapam_xlog.h:41
#define XLH_FREEZE_XVAC
Definition: heapam_xlog.h:323
#define XLOG_HEAP_UPDATE
Definition: heapam_xlog.h:34
#define XLHL_XMAX_KEYSHR_LOCK
Definition: heapam_xlog.h:273
#define XLH_DELETE_ALL_VISIBLE_CLEARED
Definition: heapam_xlog.h:96
#define XLH_UPDATE_CONTAINS_OLD_TUPLE
Definition: heapam_xlog.h:82
#define SizeOfHeapNewCid
Definition: heapam_xlog.h:391
#define SizeOfHeapLockUpdated
Definition: heapam_xlog.h:299
#define XLHL_XMAX_IS_MULTI
Definition: heapam_xlog.h:270
struct xl_heap_freeze_plan xl_heap_freeze_plan
#define XLH_INSERT_ALL_VISIBLE_CLEARED
Definition: heapam_xlog.h:66
#define SizeOfHeapHeader
Definition: heapam_xlog.h:151
#define XLH_DELETE_IS_PARTITION_MOVE
Definition: heapam_xlog.h:100
#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED
Definition: heapam_xlog.h:79
#define XLHL_XMAX_LOCK_ONLY
Definition: heapam_xlog.h:271
#define XLOG_HEAP_INPLACE
Definition: heapam_xlog.h:39
#define XLOG_HEAP2_LOCK_UPDATED
Definition: heapam_xlog.h:59
#define XLH_UPDATE_SUFFIX_FROM_OLD
Definition: heapam_xlog.h:86
#define XLOG_HEAP2_FREEZE_PAGE
Definition: heapam_xlog.h:56
#define XLH_UPDATE_PREFIX_FROM_OLD
Definition: heapam_xlog.h:85
#define SizeOfMultiInsertTuple
Definition: heapam_xlog.h:193
#define XLHL_XMAX_EXCL_LOCK
Definition: heapam_xlog.h:272
#define XLOG_HEAP2_NEW_CID
Definition: heapam_xlog.h:60
#define XLH_DELETE_CONTAINS_OLD_TUPLE
Definition: heapam_xlog.h:97
#define XLOG_HEAP_LOCK
Definition: heapam_xlog.h:38
#define XLOG_HEAP_INSERT
Definition: heapam_xlog.h:32
#define SizeOfHeapInsert
Definition: heapam_xlog.h:162
#define SizeOfHeapDelete
Definition: heapam_xlog.h:115
#define XLH_DELETE_IS_SUPER
Definition: heapam_xlog.h:99
#define XLH_UPDATE_CONTAINS_OLD_KEY
Definition: heapam_xlog.h:83
#define XLHL_KEYS_UPDATED
Definition: heapam_xlog.h:274
#define XLOG_HEAP2_VISIBLE
Definition: heapam_xlog.h:57
#define XLH_INSERT_CONTAINS_NEW_TUPLE
Definition: heapam_xlog.h:69
#define XLOG_HEAP_INIT_PAGE
Definition: heapam_xlog.h:46
#define SizeOfHeapConfirm
Definition: heapam_xlog.h:307
#define SizeOfHeapLock
Definition: heapam_xlog.h:288
#define XLOG_HEAP_CONFIRM
Definition: heapam_xlog.h:37
void heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
Definition: heaptoast.c:43
HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options)
Definition: heaptoast.c:96
HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
Definition: heaptoast.c:350
#define TOAST_TUPLE_THRESHOLD
Definition: heaptoast.h:48
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: heaptuple.c:1116
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition: heaptuple.c:1345
void heap_freetuple(HeapTuple htup)
Definition: heaptuple.c:1434
void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token)
Definition: hio.c:35
Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertState bistate, Buffer *vmbuffer, Buffer *vmbuffer_other, int num_pages)
Definition: hio.c:502
HeapTupleHeaderData * HeapTupleHeader
Definition: htup.h:23
#define HEAP_XMAX_IS_EXCL_LOCKED(infomask)
Definition: htup_details.h:261
#define HeapTupleHeaderSetXminFrozen(tup)
Definition: htup_details.h:348
#define HEAP_MOVED_OFF
Definition: htup_details.h:211
#define HEAP_XMAX_SHR_LOCK
Definition: htup_details.h:200
#define HEAP_XMIN_FROZEN
Definition: htup_details.h:206
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:792
#define HEAP_XMAX_IS_LOCKED_ONLY(infomask)
Definition: htup_details.h:227
#define HeapTupleHeaderGetNatts(tup)
Definition: htup_details.h:529
#define SizeofHeapTupleHeader
Definition: htup_details.h:185
#define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)
Definition: htup_details.h:263
#define HEAP_KEYS_UPDATED
Definition: htup_details.h:275
#define HeapTupleHeaderIsHeapOnly(tup)
Definition: htup_details.h:499
#define HEAP_HOT_UPDATED
Definition: htup_details.h:276
#define HeapTupleHeaderIndicatesMovedPartitions(tup)
Definition: htup_details.h:444
#define HeapTupleSetHotUpdated(tuple)
Definition: htup_details.h:677
#define HeapTupleHeaderGetXvac(tup)
Definition: htup_details.h:411
#define HEAP2_XACT_MASK
Definition: htup_details.h:279
#define HeapTupleHeaderSetXmin(tup, xid)
Definition: htup_details.h:315
#define HeapTupleHeaderSetXmax(tup, xid)
Definition: htup_details.h:376
#define HEAP_XMAX_LOCK_ONLY
Definition: htup_details.h:197
#define HeapTupleHeaderGetXmin(tup)
Definition: htup_details.h:309
#define HEAP_XMAX_BITS
Definition: htup_details.h:267
#define HEAP_LOCK_MASK
Definition: htup_details.h:202
#define HeapTupleHasExternal(tuple)
Definition: htup_details.h:671
#define HeapTupleHeaderClearHotUpdated(tup)
Definition: htup_details.h:494
#define HeapTupleHeaderSetCmin(tup, cid)
Definition: htup_details.h:393
#define HeapTupleHeaderSetHotUpdated(tup)
Definition: htup_details.h:489
#define HeapTupleHeaderSetXvac(tup, xid)
Definition: htup_details.h:419
#define HEAP_MOVED
Definition: htup_details.h:213
#define HEAP_XMAX_IS_MULTI
Definition: htup_details.h:209
#define HEAP_XMAX_COMMITTED
Definition: htup_details.h:207
#define HEAP_COMBOCID
Definition: htup_details.h:195
#define HeapTupleIsHeapOnly(tuple)
Definition: htup_details.h:683
#define HEAP_XACT_MASK
Definition: htup_details.h:215
#define HeapTupleHeaderGetRawXmin(tup)
Definition: htup_details.h:304
#define HeapTupleHeaderXminFrozen(tup)
Definition: htup_details.h:331
#define HEAP_XMAX_EXCL_LOCK
Definition: htup_details.h:196
#define HEAP_XMAX_INVALID
Definition: htup_details.h:208
#define HeapTupleHeaderXminCommitted(tup)
Definition: htup_details.h:320
#define MaxHeapAttributeNumber
Definition: htup_details.h:48
#define HeapTupleHeaderSetMovedPartitions(tup)
Definition: htup_details.h:447
#define HeapTupleIsHotUpdated(tuple)
Definition: htup_details.h:674
#define HeapTupleHeaderGetRawXmax(tup)
Definition: htup_details.h:371
#define MaxHeapTuplesPerPage
Definition: htup_details.h:572
#define HeapTupleSetHeapOnly(tuple)
Definition: htup_details.h:686
#define HEAP_XMAX_IS_SHR_LOCKED(infomask)
Definition: htup_details.h:259
#define HeapTupleClearHeapOnly(tuple)
Definition: htup_details.h:689
#define HEAP_UPDATED
Definition: htup_details.h:210
#define HEAP_XMAX_KEYSHR_LOCK
Definition: htup_details.h:194
#define HeapTupleHeaderGetUpdateXid(tup)
Definition: htup_details.h:361
#define HeapTupleClearHotUpdated(tuple)
Definition: htup_details.h:680
#define HeapTupleHeaderGetRawCommandId(tup)
Definition: htup_details.h:387
#define MaxHeapTupleSize
Definition: htup_details.h:558
#define HEAP_LOCKED_UPGRADED(infomask)
Definition: htup_details.h:249
#define HeapTupleHeaderIsSpeculative(tup)
Definition: htup_details.h:428
#define HeapTupleHeaderIsHotUpdated(tup)
Definition: htup_details.h:482
#define HeapTupleHeaderXminInvalid(tup)
Definition: htup_details.h:325
#define HeapTupleHeaderSetCmax(tup, cid, iscombo)
Definition: htup_details.h:401
#define IsParallelWorker()
Definition: parallel.h:60
int remaining
Definition: informix.c:667
void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple, HeapTuple newtuple)
Definition: inval.c:1204
int b
Definition: isn.c:70
int init
Definition: isn.c:75
int j
Definition: isn.c:74
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
Pointer Item
Definition: item.h:17
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdIsNormal(itemId)
Definition: itemid.h:99
#define ItemIdGetOffset(itemId)
Definition: itemid.h:65
struct ItemIdData ItemIdData
#define ItemIdGetRedirect(itemId)
Definition: itemid.h:78
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define ItemIdIsUsed(itemId)
Definition: itemid.h:92
#define ItemIdSetUnused(itemId)
Definition: itemid.h:128
#define ItemIdIsRedirected(itemId)
Definition: itemid.h:106
#define ItemIdHasStorage(itemId)
Definition: itemid.h:120
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
static void ItemPointerSet(ItemPointerData *pointer, BlockNumber blockNumber, OffsetNumber offNum)
Definition: itemptr.h:135
static void ItemPointerSetInvalid(ItemPointerData *pointer)
Definition: itemptr.h:184
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition: itemptr.h:158
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition: itemptr.h:147
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:93
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
Assert(fmt[strlen(fmt) - 1] !='\n')
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:667
bool ConditionalXactLockTableWait(TransactionId xid)
Definition: lmgr.c:740
XLTW_Oper
Definition: lmgr.h:25
@ XLTW_None
Definition: lmgr.h:26
@ XLTW_Lock
Definition: lmgr.h:29
@ XLTW_Delete
Definition: lmgr.h:28
@ XLTW_LockUpdated
Definition: lmgr.h:30
@ XLTW_Update
Definition: lmgr.h:27
bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
Definition: lock.c:570
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define AccessShareLock
Definition: lockdefs.h:36
#define ExclusiveLock
Definition: lockdefs.h:42
#define RowShareLock
Definition: lockdefs.h:37
LockWaitPolicy
Definition: lockoptions.h:37
@ LockWaitSkip
Definition: lockoptions.h:41
@ LockWaitBlock
Definition: lockoptions.h:39
@ LockWaitError
Definition: lockoptions.h:43
LockTupleMode
Definition: lockoptions.h:50
@ LockTupleExclusive
Definition: lockoptions.h:58
@ LockTupleNoKeyExclusive
Definition: lockoptions.h:56
@ LockTupleShare
Definition: lockoptions.h:54
@ LockTupleKeyShare
Definition: lockoptions.h:52
void pfree(void *pointer)
Definition: mcxt.c:1508
void * palloc(Size size)
Definition: mcxt.c:1304
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:451
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition: multixact.c:438
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3234
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3248
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition: multixact.c:550
void MultiXactIdSetOldestMember(void)
Definition: multixact.c:624
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition: multixact.c:766
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition: multixact.c:385
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition: multixact.c:1239
#define MultiXactIdIsValid(multi)
Definition: multixact.h:28
MultiXactStatus
Definition: multixact.h:38
@ MultiXactStatusForShare
Definition: multixact.h:40
@ MultiXactStatusForNoKeyUpdate
Definition: multixact.h:41
@ MultiXactStatusNoKeyUpdate
Definition: multixact.h:44
@ MultiXactStatusUpdate
Definition: multixact.h:46
@ MultiXactStatusForUpdate
Definition: multixact.h:42
@ MultiXactStatusForKeyShare
Definition: multixact.h:39
#define ISUPDATE_from_mxstatus(status)
Definition: multixact.h:52
#define InvalidMultiXactId
Definition: multixact.h:24
#define MaxMultiXactStatus
Definition: multixact.h:49
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define MaxOffsetNumber
Definition: off.h:28
Operator oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, bool noError, int location)
Definition: parse_oper.c:370
int16 attlen
Definition: pg_attribute.h:59
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:209
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
static uint32 pg_nextpower2_32(uint32 num)
Definition: pg_bitutils.h:189
static PgChecksumMode mode
Definition: pg_checksums.c:56
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:108
const void size_t len
const void * data
#define plan(x)
Definition: pg_regress.c:162
static char * buf
Definition: pg_test_fsync.c:73
#define pgstat_count_heap_getnext(rel)
Definition: pgstat.h:615
#define pgstat_count_heap_scan(rel)
Definition: pgstat.h:610
void pgstat_count_heap_update(Relation rel, bool hot, bool newpage)
void pgstat_count_heap_delete(Relation rel)
void pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
#define qsort(a, b, c, d)
Definition: port.h:449
uintptr_t Datum
Definition: postgres.h:64
static Oid DatumGetObjectId(Datum X)
Definition: postgres.h:242
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
Definition: predicate.c:4003
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4316
void PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, TransactionId tuple_xid)
Definition: predicate.c:2601
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2556
bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
Definition: predicate.c:3971
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition: procarray.c:4091
bool TransactionIdIsInProgress(TransactionId xid)
Definition: procarray.c:1390
void heap_page_prune_opt(Relation relation, Buffer buffer)
Definition: pruneheap.c:88
void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused)
Definition: pruneheap.c:838
#define RelationGetRelid(relation)
Definition: rel.h:505
#define RelationIsLogicallyLogged(relation)
Definition: rel.h:701
#define RelationGetTargetPageFreeSpace(relation, defaultff)
Definition: rel.h:378
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetNumberOfAttributes(relation)
Definition: rel.h:511
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:684
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
#define HEAP_DEFAULT_FILLFACTOR
Definition: rel.h:349
void RelationDecrementReferenceCount(Relation rel)
Definition: relcache.c:2166
void RelationIncrementReferenceCount(Relation rel)
Definition: relcache.c:2153
Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
Definition: relcache.c:5220
@ INDEX_ATTR_BITMAP_KEY
Definition: relcache.h:61
@ INDEX_ATTR_BITMAP_HOT_BLOCKING
Definition: relcache.h:64
@ INDEX_ATTR_BITMAP_SUMMARIZED
Definition: relcache.h:65
@ INDEX_ATTR_BITMAP_IDENTITY_KEY
Definition: relcache.h:63
@ MAIN_FORKNUM
Definition: relpath.h:50
struct ParallelBlockTableScanDescData * ParallelBlockTableScanDesc
Definition: relscan.h:85
void heap_xlog_logical_rewrite(XLogReaderState *r)
Definition: rewriteheap.c:1073
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
ScanDirection
Definition: sdir.h:25
ScanKeyData * ScanKey
Definition: skey.h:75
TransactionId RecentXmin
Definition: snapmgr.c:99
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:836
TransactionId TransactionXmin
Definition: snapmgr.c:98
#define SnapshotAny
Definition: snapmgr.h:33
#define InitNonVacuumableSnapshot(snapshotdata, vistestp)
Definition: snapmgr.h:48
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
#define InvalidSnapshot
Definition: snapshot.h:123
int get_tablespace_maintenance_io_concurrency(Oid spcid)
Definition: spccache.c:229
void ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, bool isCatalogRel, RelFileLocator locator)
Definition: standby.c:467
BlockNumber last_free
Definition: hio.h:49
BufferAccessStrategy strategy
Definition: hio.h:31
uint32 already_extended_by
Definition: hio.h:50
BlockNumber next_free
Definition: hio.h:48
Buffer current_buf
Definition: hio.h:32
MultiXactId NoFreezePageRelminMxid
Definition: heapam.h:190
TransactionId FreezePageRelfrozenXid
Definition: heapam.h:178
bool freeze_required
Definition: heapam.h:152
MultiXactId FreezePageRelminMxid
Definition: heapam.h:179
TransactionId NoFreezePageRelfrozenXid
Definition: heapam.h:189
int rs_ntuples
Definition: heapam.h:77
BufferAccessStrategy rs_strategy
Definition: heapam.h:65
OffsetNumber rs_coffset
Definition: heapam.h:60
bool rs_inited
Definition: heapam.h:59
Buffer rs_cbuf
Definition: heapam.h:62
ParallelBlockTableScanWorkerData * rs_parallelworkerdata
Definition: heapam.h:73
BlockNumber rs_startblock
Definition: heapam.h:54
HeapTupleData rs_ctup
Definition: heapam.h:67
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]
Definition: heapam.h:78
BlockNumber rs_numblocks
Definition: heapam.h:55
BlockNumber rs_nblocks
Definition: heapam.h:53
BlockNumber rs_cblock
Definition: heapam.h:61
TableScanDescData rs_base
Definition: heapam.h:50
ItemPointerData t_self
Definition: htup.h:65
uint32 t_len
Definition: htup.h:64
HeapTupleHeader t_data
Definition: htup.h:68
Oid t_tableOid
Definition: htup.h:66
TransactionId t_xmin
Definition: htup_details.h:124
uint8 frzflags
Definition: heapam.h:117
uint16 t_infomask2
Definition: heapam.h:115
TransactionId xmax
Definition: heapam.h:114
OffsetNumber offset
Definition: heapam.h:122
uint8 checkflags
Definition: heapam.h:120
uint16 t_infomask
Definition: heapam.h:116
union HeapTupleHeaderData::@45 t_choice
ItemPointerData t_ctid
Definition: htup_details.h:161
HeapTupleFields t_heap
Definition: htup_details.h:157
int16 ifirsttid
Definition: heapam.c:198
int16 npromisingtids
Definition: heapam.c:196
TransactionId xid
Definition: multixact.h:58
MultiXactStatus status
Definition: multixact.h:59
const struct TableAmRoutine * rd_tableam
Definition: rel.h:189
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
bool takenDuringRecovery
Definition: snapshot.h:184
TransactionId xmax
Definition: tableam.h:143
CommandId cmax
Definition: tableam.h:144
ItemPointerData ctid
Definition: tableam.h:142
TM_IndexStatus * status
Definition: tableam.h:247
int bottomupfreespace
Definition: tableam.h:242
Relation irel
Definition: tableam.h:239
TM_IndexDelete * deltids
Definition: tableam.h:246
BlockNumber iblknum
Definition: tableam.h:240
ItemPointerData tid
Definition: tableam.h:205
bool knowndeletable
Definition: tableam.h:212
bool promising
Definition: tableam.h:215
int16 freespace
Definition: tableam.h:216
OffsetNumber idxoffnum
Definition: tableam.h:211
Relation rs_rd
Definition: relscan.h:34
ItemPointerData rs_mintid
Definition: relscan.h:40
ItemPointerData rs_maxtid
Definition: relscan.h:41
uint32 rs_flags
Definition: relscan.h:47
struct ScanKeyData * rs_key
Definition: relscan.h:37
struct SnapshotData * rs_snapshot
Definition: relscan.h:35
struct ParallelTableScanDescData * rs_parallel
Definition: relscan.h:49
Oid tts_tableOid
Definition: tuptable.h:130
TransactionId FreezeLimit
Definition: vacuum.h:276
TransactionId OldestXmin
Definition: vacuum.h:266
TransactionId relfrozenxid
Definition: vacuum.h:250
MultiXactId relminmxid
Definition: vacuum.h:251
MultiXactId MultiXactCutoff
Definition: vacuum.h:277
MultiXactId OldestMxact
Definition: vacuum.h:267
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
Definition: c.h:674
OffsetNumber offnum
Definition: heapam_xlog.h:304
TransactionId xmax
Definition: heapam_xlog.h:109
OffsetNumber offnum
Definition: heapam_xlog.h:110
uint8 infobits_set
Definition: heapam_xlog.h:111
TransactionId snapshotConflictHorizon
Definition: heapam_xlog.h:347
TransactionId xmax
Definition: heapam_xlog.h:328
uint16 t_infomask
Definition: heapam_xlog.h:147
uint16 t_infomask2
Definition: heapam_xlog.h:146
OffsetNumber offnum
Definition: heapam_xlog.h:312
OffsetNumber offnum
Definition: heapam_xlog.h:156
TransactionId xmax
Definition: heapam_xlog.h:293
OffsetNumber offnum
Definition: heapam_xlog.h:294
uint8 infobits_set
Definition: heapam_xlog.h:284
OffsetNumber offnum
Definition: heapam_xlog.h:283
TransactionId xmax
Definition: heapam_xlog.h:282
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]
Definition: heapam_xlog.h:179
CommandId cmin
Definition: heapam_xlog.h:380
CommandId combocid
Definition: heapam_xlog.h:382
ItemPointerData target_tid
Definition: heapam_xlog.h:388
TransactionId top_xid
Definition: heapam_xlog.h:379
CommandId cmax
Definition: heapam_xlog.h:381
RelFileLocator target_locator
Definition: heapam_xlog.h:387
TransactionId snapshotConflictHorizon
Definition: heapam_xlog.h:245
uint16 nredirected
Definition: heapam_xlog.h:246
TransactionId new_xmax
Definition: heapam_xlog.h:218
uint8 old_infobits_set
Definition: heapam_xlog.h:216
TransactionId old_xmax
Definition: heapam_xlog.h:214
OffsetNumber old_offnum
Definition: heapam_xlog.h:215
OffsetNumber new_offnum
Definition: heapam_xlog.h:219
TransactionId snapshotConflictHorizon
Definition: heapam_xlog.h:367
TransactionId SubTransGetTopmostTransaction(TransactionId xid)
Definition: subtrans.c:163
void ss_report_location(Relation rel, BlockNumber location)
Definition: syncscan.c:289
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)
Definition: syncscan.c:254
#define FirstLowInvalidHeapAttributeNumber
Definition: sysattr.h:27
#define TableOidAttributeNumber
Definition: sysattr.h:26
void table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan)
Definition: tableam.c:421
BlockNumber table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan)
Definition: tableam.c:491
bool synchronize_seqscans
Definition: tableam.c:49
@ SO_ALLOW_STRAT
Definition: tableam.h:57
@ SO_TEMP_SNAPSHOT
Definition: tableam.h:64
@ SO_ALLOW_PAGEMODE
Definition: tableam.h:61
@ SO_TYPE_SAMPLESCAN
Definition: tableam.h:50
@ SO_ALLOW_SYNC
Definition: tableam.h:59
@ SO_TYPE_SEQSCAN
Definition: tableam.h:48
TU_UpdateIndexes
Definition: tableam.h:110
@ TU_Summarizing
Definition: tableam.h:118
@ TU_All
Definition: tableam.h:115
@ TU_None
Definition: tableam.h:112
TM_Result
Definition: tableam.h:72
@ TM_Ok
Definition: tableam.h:77
@ TM_BeingModified
Definition: tableam.h:99
@ TM_Deleted
Definition: tableam.h:92
@ TM_WouldBlock
Definition: tableam.h:102
@ TM_Updated
Definition: tableam.h:89
@ TM_SelfModified
Definition: tableam.h:83
@ TM_Invisible
Definition: tableam.h:80
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:299
bool TransactionIdDidAbort(TransactionId transactionId)
Definition: transam.c:188
bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition: transam.c:314
bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:329
#define FrozenTransactionId
Definition: transam.h:33
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:433
char data[BLCKSZ]
Definition: c.h:1106
static bool HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys)
Definition: valid.h:28
#define VARATT_IS_EXTERNAL(PTR)
Definition: varatt.h:289
void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, uint8 flags)
bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf)
bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
#define VISIBILITYMAP_VALID_BITS
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_XLOG_VALID_BITS
#define VISIBILITYMAP_XLOG_CATALOG_REL
#define VISIBILITYMAP_ALL_VISIBLE
TransactionId GetTopTransactionId(void)
Definition: xact.c:418
bool bsysscan
Definition: xact.c:98
TransactionId CheckXidAlive
Definition: xact.c:97
TransactionId GetTopTransactionIdIfAny(void)
Definition: xact.c:433
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:927
bool IsInParallelMode(void)
Definition: xact.c:1070
TransactionId GetCurrentTransactionId(void)
Definition: xact.c:446
CommandId GetCurrentCommandId(bool used)
Definition: xact.c:819
#define XLOG_INCLUDE_ORIGIN
Definition: xlog.h:152
#define XLogHintBitIsNeeded()
Definition: xlog.h:118
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:364
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
bool XLogCheckBufferNeedsBackup(Buffer buffer)
Definition: xloginsert.c:1027
void XLogSetRecordFlags(uint8 flags)
Definition: xloginsert.c:456
void XLogRegisterBufData(uint8 block_id, char *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_NO_IMAGE
Definition: xloginsert.h:32
#define REGBUF_KEEP_DATA
Definition: xloginsert.h:35
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:1997
void XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: xlogreader.c:1971
char * XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len)
Definition: xlogreader.c:2025
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
void FreeFakeRelcacheEntry(Relation fakerel)
Definition: xlogutils.c:629
XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf)
Definition: xlogutils.c:314
Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
Definition: xlogutils.c:326
Relation CreateFakeRelcacheEntry(RelFileLocator rlocator)
Definition: xlogutils.c:582
XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf)
Definition: xlogutils.c:351
#define InHotStandby
Definition: xlogutils.h:57
XLogRedoAction
Definition: xlogutils.h:70
@ BLK_RESTORED
Definition: xlogutils.h:73
@ BLK_NEEDS_REDO
Definition: xlogutils.h:71