heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * heap_beginscan - begin relation scan
16  * heap_rescan - restart a relation scan
17  * heap_endscan - end relation scan
18  * heap_getnext - retrieve next tuple in scan
19  * heap_fetch - retrieve tuple with given tid
20  * heap_insert - insert tuple into a relation
21  * heap_multi_insert - insert multiple tuples into a relation
22  * heap_delete - delete a tuple from a relation
23  * heap_update - replace a tuple in a relation with another tuple
24  * heap_sync - sync heap, for when no WAL has been written
25  *
26  * NOTES
27  * This file contains the heap_ routines which implement
28  * the POSTGRES heap access method used for all POSTGRES
29  * relations.
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include "access/bufmask.h"
36 #include "access/genam.h"
37 #include "access/heapam.h"
38 #include "access/heapam_xlog.h"
39 #include "access/hio.h"
40 #include "access/multixact.h"
41 #include "access/parallel.h"
42 #include "access/relscan.h"
43 #include "access/sysattr.h"
44 #include "access/tableam.h"
45 #include "access/transam.h"
46 #include "access/tuptoaster.h"
47 #include "access/valid.h"
48 #include "access/visibilitymap.h"
49 #include "access/xact.h"
50 #include "access/xlog.h"
51 #include "access/xloginsert.h"
52 #include "access/xlogutils.h"
53 #include "catalog/catalog.h"
54 #include "miscadmin.h"
55 #include "pgstat.h"
56 #include "port/atomics.h"
57 #include "storage/bufmgr.h"
58 #include "storage/freespace.h"
59 #include "storage/lmgr.h"
60 #include "storage/predicate.h"
61 #include "storage/procarray.h"
62 #include "storage/smgr.h"
63 #include "storage/spin.h"
64 #include "storage/standby.h"
65 #include "utils/datum.h"
66 #include "utils/inval.h"
67 #include "utils/lsyscache.h"
68 #include "utils/relcache.h"
69 #include "utils/snapmgr.h"
70 #include "utils/spccache.h"
71 
72 
73 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
74  TransactionId xid, CommandId cid, int options);
75 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
76  Buffer newbuf, HeapTuple oldtup,
77  HeapTuple newtup, HeapTuple old_key_tuple,
78  bool all_visible_cleared, bool new_all_visible_cleared);
 79 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
 80  Bitmapset *interesting_cols,
81  HeapTuple oldtup, HeapTuple newtup);
82 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
83  LockTupleMode mode, LockWaitPolicy wait_policy,
84  bool *have_tuple_lock);
85 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
86  uint16 old_infomask2, TransactionId add_to_xmax,
87  LockTupleMode mode, bool is_update,
88  TransactionId *result_xmax, uint16 *result_infomask,
89  uint16 *result_infomask2);
 90 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
 91  ItemPointer ctid, TransactionId xid,
 92  LockTupleMode mode);
93 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
94  uint16 *new_infomask2);
 95 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 96  uint16 t_infomask);
97 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
98  LockTupleMode lockmode, bool *current_is_member);
99 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
100  Relation rel, ItemPointer ctid, XLTW_Oper oper,
101  int *remaining);
 102 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 103  uint16 infomask, Relation rel, int *remaining);
104 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
105 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
106  bool *copy);
107 
108 
109 /*
110  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
111  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
112  * update them). This table (and the macros below) helps us determine the
113  * heavyweight lock mode and MultiXactStatus values to use for any particular
114  * tuple lock strength.
115  *
116  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117  * instead.
118  */
 119 static const struct
 120 {
 121  LOCKMODE hwlock;
 122  int lockstatus;
 123  int updstatus;
 124 }
 125 
 126  tupleLockExtraInfo[MaxLockTupleMode + 1] =
 127 {
 128  { /* LockTupleKeyShare */
 129  AccessShareLock,
 130  MultiXactStatusForKeyShare,
 131  -1 /* KeyShare does not allow updating tuples */
 132  },
 133  { /* LockTupleShare */
 134  RowShareLock,
 135  MultiXactStatusForShare,
 136  -1 /* Share does not allow updating tuples */
 137  },
 138  { /* LockTupleNoKeyExclusive */
 139  ExclusiveLock,
 140  MultiXactStatusForNoKeyUpdate,
 141  MultiXactStatusNoKeyUpdate
 142  },
 143  { /* LockTupleExclusive */
 144  AccessExclusiveLock,
 145  MultiXactStatusForUpdate,
 146  MultiXactStatusUpdate
 147  }
 148 };
149 
150 /* Get the LOCKMODE for a given MultiXactStatus */
151 #define LOCKMODE_from_mxstatus(status) \
152  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153 
154 /*
155  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156  * This is more readable than having every caller translate it to lock.h's
157  * LOCKMODE.
158  */
159 #define LockTupleTuplock(rel, tup, mode) \
160  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161 #define UnlockTupleTuplock(rel, tup, mode) \
162  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163 #define ConditionalLockTupleTuplock(rel, tup, mode) \
164  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165 
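/*
 * Editor's illustration (not part of heapam.c): a minimal sketch of how the
 * macros above are meant to be used, assuming caller-supplied "rel" and "tid".
 * LockTupleTuplock() blocks until the per-tuple heavyweight lock is granted;
 * ConditionalLockTupleTuplock() returns false instead of waiting.
 */
static void
example_tuplock_usage(Relation rel, ItemPointer tid)
{
	LockTupleTuplock(rel, tid, LockTupleExclusive);
	/* ... examine or modify the tuple while holding the tuple lock ... */
	UnlockTupleTuplock(rel, tid, LockTupleExclusive);
}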
166 #ifdef USE_PREFETCH
167 /*
168  * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
169  * this structure to coordinate prefetching activity.
170  */
171 typedef struct
172 {
173  BlockNumber cur_hblkno;
174  int next_item;
175  int nitems;
176  ItemPointerData *tids;
177 } XidHorizonPrefetchState;
178 #endif
179 
180 /*
181  * This table maps tuple lock strength values for each particular
182  * MultiXactStatus value.
183  */
 184 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
 185 {
186  LockTupleKeyShare, /* ForKeyShare */
187  LockTupleShare, /* ForShare */
188  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
189  LockTupleExclusive, /* ForUpdate */
190  LockTupleNoKeyExclusive, /* NoKeyUpdate */
191  LockTupleExclusive /* Update */
192 };
193 
194 /* Get the LockTupleMode for a given MultiXactStatus */
195 #define TUPLOCK_from_mxstatus(status) \
196  (MultiXactStatusLock[(status)])
197 
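/*
 * Editor's illustration (not part of heapam.c): composing the two lookup
 * tables above.  MultiXactStatusLock[] maps a MultiXactStatus to the tuple
 * lock strength that requested it, and tupleLockExtraInfo[] then yields the
 * heavyweight lock mode, which is exactly what LOCKMODE_from_mxstatus() does.
 */
static LOCKMODE
example_hwlock_for_mxstatus(MultiXactStatus status)
{
	return LOCKMODE_from_mxstatus(status);
}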
198 /* ----------------------------------------------------------------
199  * heap support routines
200  * ----------------------------------------------------------------
201  */
202 
203 /* ----------------
204  * initscan - scan code common to heap_beginscan and heap_rescan
205  * ----------------
206  */
207 static void
208 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
209 {
210  ParallelBlockTableScanDesc bpscan = NULL;
211  bool allow_strat;
212  bool allow_sync;
213 
214  /*
215  * Determine the number of blocks we have to scan.
216  *
217  * It is sufficient to do this once at scan start, since any tuples added
218  * while the scan is in progress will be invisible to my snapshot anyway.
219  * (That is not true when using a non-MVCC snapshot. However, we couldn't
220  * guarantee to return tuples added after scan start anyway, since they
221  * might go into pages we already scanned. To guarantee consistent
222  * results for a non-MVCC snapshot, the caller must hold some higher-level
223  * lock that ensures the interesting tuple(s) won't change.)
224  */
225  if (scan->rs_base.rs_parallel != NULL)
226  {
 227  bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
 228  scan->rs_nblocks = bpscan->phs_nblocks;
229  }
230  else
 231  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
 232 
233  /*
234  * If the table is large relative to NBuffers, use a bulk-read access
235  * strategy and enable synchronized scanning (see syncscan.c). Although
236  * the thresholds for these features could be different, we make them the
237  * same so that there are only two behaviors to tune rather than four.
238  * (However, some callers need to be able to disable one or both of these
239  * behaviors, independently of the size of the table; also there is a GUC
240  * variable that can disable synchronized scanning.)
241  *
242  * Note that table_block_parallelscan_initialize has a very similar test;
243  * if you change this, consider changing that one, too.
244  */
245  if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
246  scan->rs_nblocks > NBuffers / 4)
247  {
248  allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
249  allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
250  }
251  else
252  allow_strat = allow_sync = false;
253 
254  if (allow_strat)
255  {
256  /* During a rescan, keep the previous strategy object. */
257  if (scan->rs_strategy == NULL)
 258  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
 259  }
260  else
261  {
262  if (scan->rs_strategy != NULL)
 263  FreeAccessStrategy(scan->rs_strategy);
 264  scan->rs_strategy = NULL;
265  }
266 
267  if (scan->rs_base.rs_parallel != NULL)
268  {
269  /* For parallel scan, believe whatever ParallelTableScanDesc says. */
270  if (scan->rs_base.rs_parallel->phs_syncscan)
271  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
272  else
273  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
274  }
275  else if (keep_startblock)
276  {
277  /*
278  * When rescanning, we want to keep the previous startblock setting,
279  * so that rewinding a cursor doesn't generate surprising results.
280  * Reset the active syncscan setting, though.
281  */
282  if (allow_sync && synchronize_seqscans)
283  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
284  else
285  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
286  }
287  else if (allow_sync && synchronize_seqscans)
288  {
289  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
290  scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
291  }
292  else
293  {
294  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295  scan->rs_startblock = 0;
296  }
297 
 298  scan->rs_numblocks = InvalidBlockNumber;
 299  scan->rs_inited = false;
 300  scan->rs_ctup.t_data = NULL;
 301  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
 302  scan->rs_cbuf = InvalidBuffer;
 303  scan->rs_cblock = InvalidBlockNumber;
 304 
305  /* page-at-a-time fields are always invalid when not rs_inited */
306 
307  /*
308  * copy the scan key, if appropriate
309  */
310  if (key != NULL)
311  memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
312 
313  /*
314  * Currently, we only have a stats counter for sequential heap scans (but
315  * e.g for bitmap scans the underlying bitmap index scans will be counted,
316  * and for sample scans we update stats for tuple fetches).
317  */
318  if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
 319  pgstat_count_heap_scan(scan->rs_base.rs_rd);
 320 }
321 
322 /*
323  * heap_setscanlimits - restrict range of a heapscan
324  *
325  * startBlk is the page to start at
326  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
327  */
328 void
 329 heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
 330 {
331  HeapScanDesc scan = (HeapScanDesc) sscan;
332 
333  Assert(!scan->rs_inited); /* else too late to change */
334  /* else rs_startblock is significant */
335  Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
336 
337  /* Check startBlk is valid (but allow case of zero blocks...) */
338  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
339 
340  scan->rs_startblock = startBlk;
341  scan->rs_numblocks = numBlks;
342 }
343 
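/*
 * Editor's illustration (not part of heapam.c): restricting a scan to a block
 * subrange with heap_setscanlimits().  The limits must be set before the first
 * heap_getnext() call, and the scan must not use synchronized scanning, so
 * SO_ALLOW_SYNC is deliberately omitted.  "rel" and "snapshot" are assumed to
 * be supplied by the caller.
 */
static void
example_scan_block_range(Relation rel, Snapshot snapshot,
						 BlockNumber startBlk, BlockNumber numBlks)
{
	TableScanDesc sscan;
	HeapTuple	tuple;

	sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
						   SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_PAGEMODE);
	heap_setscanlimits(sscan, startBlk, numBlks);

	while ((tuple = heap_getnext(sscan, ForwardScanDirection)) != NULL)
	{
		/* process tuple; it stays valid until the next heap_getnext() call */
	}

	heap_endscan(sscan);
}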
344 /*
345  * heapgetpage - subroutine for heapgettup()
346  *
347  * This routine reads and pins the specified page of the relation.
348  * In page-at-a-time mode it performs additional work, namely determining
349  * which tuples on the page are visible.
350  */
351 void
 352 heapgetpage(TableScanDesc sscan, BlockNumber page)
 353 {
354  HeapScanDesc scan = (HeapScanDesc) sscan;
355  Buffer buffer;
356  Snapshot snapshot;
357  Page dp;
358  int lines;
359  int ntup;
360  OffsetNumber lineoff;
361  ItemId lpp;
362  bool all_visible;
363 
364  Assert(page < scan->rs_nblocks);
365 
366  /* release previous scan buffer, if any */
367  if (BufferIsValid(scan->rs_cbuf))
368  {
369  ReleaseBuffer(scan->rs_cbuf);
370  scan->rs_cbuf = InvalidBuffer;
371  }
372 
373  /*
374  * Be sure to check for interrupts at least once per page. Checks at
375  * higher code levels won't be able to stop a seqscan that encounters many
376  * pages' worth of consecutive dead tuples.
377  */
 378  CHECK_FOR_INTERRUPTS();
 379 
380  /* read page using selected strategy */
381  scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
382  RBM_NORMAL, scan->rs_strategy);
383  scan->rs_cblock = page;
384 
385  if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
386  return;
387 
388  buffer = scan->rs_cbuf;
389  snapshot = scan->rs_base.rs_snapshot;
390 
391  /*
392  * Prune and repair fragmentation for the whole page, if possible.
393  */
394  heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
395 
396  /*
397  * We must hold share lock on the buffer content while examining tuple
398  * visibility. Afterwards, however, the tuples we have found to be
399  * visible are guaranteed good as long as we hold the buffer pin.
400  */
401  LockBuffer(buffer, BUFFER_LOCK_SHARE);
402 
403  dp = BufferGetPage(buffer);
404  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
405  lines = PageGetMaxOffsetNumber(dp);
406  ntup = 0;
407 
408  /*
409  * If the all-visible flag indicates that all tuples on the page are
410  * visible to everyone, we can skip the per-tuple visibility tests.
411  *
412  * Note: In hot standby, a tuple that's already visible to all
413  * transactions in the master might still be invisible to a read-only
414  * transaction in the standby. We partly handle this problem by tracking
415  * the minimum xmin of visible tuples as the cut-off XID while marking a
416  * page all-visible on master and WAL log that along with the visibility
417  * map SET operation. In hot standby, we wait for (or abort) all
 418  * transactions that potentially may not see one or more tuples on the
419  * page. That's how index-only scans work fine in hot standby. A crucial
420  * difference between index-only scans and heap scans is that the
 421  * index-only scan completely relies on the visibility map, whereas a heap
422  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
423  * the page-level flag can be trusted in the same way, because it might
424  * get propagated somehow without being explicitly WAL-logged, e.g. via a
425  * full page write. Until we can prove that beyond doubt, let's check each
426  * tuple for visibility the hard way.
427  */
428  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
429 
430  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
431  lineoff <= lines;
432  lineoff++, lpp++)
433  {
434  if (ItemIdIsNormal(lpp))
435  {
436  HeapTupleData loctup;
437  bool valid;
438 
439  loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
440  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
441  loctup.t_len = ItemIdGetLength(lpp);
442  ItemPointerSet(&(loctup.t_self), page, lineoff);
443 
444  if (all_visible)
445  valid = true;
446  else
447  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
448 
 449  CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
 450  &loctup, buffer, snapshot);
451 
452  if (valid)
453  scan->rs_vistuples[ntup++] = lineoff;
454  }
455  }
456 
 457  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 458 
459  Assert(ntup <= MaxHeapTuplesPerPage);
460  scan->rs_ntuples = ntup;
461 }
462 
463 /* ----------------
464  * heapgettup - fetch next heap tuple
465  *
466  * Initialize the scan if not already done; then advance to the next
467  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
468  * or set scan->rs_ctup.t_data = NULL if no more tuples.
469  *
470  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
471  * by scan->rs_ctup".
472  *
473  * Note: the reason nkeys/key are passed separately, even though they are
474  * kept in the scan descriptor, is that the caller may not want us to check
475  * the scankeys.
476  *
477  * Note: when we fall off the end of the scan in either direction, we
478  * reset rs_inited. This means that a further request with the same
479  * scan direction will restart the scan, which is a bit odd, but a
480  * request with the opposite scan direction will start a fresh scan
481  * in the proper direction. The latter is required behavior for cursors,
482  * while the former case is generally undefined behavior in Postgres
483  * so we don't care too much.
484  * ----------------
485  */
486 static void
 487 heapgettup(HeapScanDesc scan,
 488  ScanDirection dir,
489  int nkeys,
490  ScanKey key)
491 {
492  HeapTuple tuple = &(scan->rs_ctup);
493  Snapshot snapshot = scan->rs_base.rs_snapshot;
494  bool backward = ScanDirectionIsBackward(dir);
495  BlockNumber page;
496  bool finished;
497  Page dp;
498  int lines;
499  OffsetNumber lineoff;
500  int linesleft;
501  ItemId lpp;
502 
503  /*
504  * calculate next starting lineoff, given scan direction
505  */
506  if (ScanDirectionIsForward(dir))
507  {
508  if (!scan->rs_inited)
509  {
510  /*
511  * return null immediately if relation is empty
512  */
513  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
514  {
515  Assert(!BufferIsValid(scan->rs_cbuf));
516  tuple->t_data = NULL;
517  return;
518  }
519  if (scan->rs_base.rs_parallel != NULL)
520  {
 521  ParallelBlockTableScanDesc pbscan =
 522  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
 523 
 524  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
 525  pbscan);
 526 
 527  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
 528  pbscan);
529 
530  /* Other processes might have already finished the scan. */
531  if (page == InvalidBlockNumber)
532  {
533  Assert(!BufferIsValid(scan->rs_cbuf));
534  tuple->t_data = NULL;
535  return;
536  }
537  }
538  else
539  page = scan->rs_startblock; /* first page */
540  heapgetpage((TableScanDesc) scan, page);
541  lineoff = FirstOffsetNumber; /* first offnum */
542  scan->rs_inited = true;
543  }
544  else
545  {
546  /* continue from previously returned page/tuple */
547  page = scan->rs_cblock; /* current page */
548  lineoff = /* next offnum */
 549  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
 550  }
551 
 552  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 553 
554  dp = BufferGetPage(scan->rs_cbuf);
555  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
556  lines = PageGetMaxOffsetNumber(dp);
557  /* page and lineoff now reference the physically next tid */
558 
559  linesleft = lines - lineoff + 1;
560  }
561  else if (backward)
562  {
563  /* backward parallel scan not supported */
564  Assert(scan->rs_base.rs_parallel == NULL);
565 
566  if (!scan->rs_inited)
567  {
568  /*
569  * return null immediately if relation is empty
570  */
571  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
572  {
573  Assert(!BufferIsValid(scan->rs_cbuf));
574  tuple->t_data = NULL;
575  return;
576  }
577 
578  /*
579  * Disable reporting to syncscan logic in a backwards scan; it's
580  * not very likely anyone else is doing the same thing at the same
581  * time, and much more likely that we'll just bollix things for
582  * forward scanners.
583  */
584  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
585  /* start from last page of the scan */
586  if (scan->rs_startblock > 0)
587  page = scan->rs_startblock - 1;
588  else
589  page = scan->rs_nblocks - 1;
590  heapgetpage((TableScanDesc) scan, page);
591  }
592  else
593  {
594  /* continue from previously returned page/tuple */
595  page = scan->rs_cblock; /* current page */
596  }
597 
 598  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 599 
600  dp = BufferGetPage(scan->rs_cbuf);
601  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
602  lines = PageGetMaxOffsetNumber(dp);
603 
604  if (!scan->rs_inited)
605  {
606  lineoff = lines; /* final offnum */
607  scan->rs_inited = true;
608  }
609  else
610  {
611  lineoff = /* previous offnum */
 612  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
 613  }
614  /* page and lineoff now reference the physically previous tid */
615 
616  linesleft = lineoff;
617  }
618  else
619  {
620  /*
621  * ``no movement'' scan direction: refetch prior tuple
622  */
623  if (!scan->rs_inited)
624  {
625  Assert(!BufferIsValid(scan->rs_cbuf));
626  tuple->t_data = NULL;
627  return;
628  }
629 
630  page = ItemPointerGetBlockNumber(&(tuple->t_self));
631  if (page != scan->rs_cblock)
632  heapgetpage((TableScanDesc) scan, page);
633 
634  /* Since the tuple was previously fetched, needn't lock page here */
635  dp = BufferGetPage(scan->rs_cbuf);
636  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
637  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
638  lpp = PageGetItemId(dp, lineoff);
639  Assert(ItemIdIsNormal(lpp));
640 
641  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
642  tuple->t_len = ItemIdGetLength(lpp);
643 
644  return;
645  }
646 
647  /*
648  * advance the scan until we find a qualifying tuple or run out of stuff
649  * to scan
650  */
651  lpp = PageGetItemId(dp, lineoff);
652  for (;;)
653  {
654  while (linesleft > 0)
655  {
656  if (ItemIdIsNormal(lpp))
657  {
658  bool valid;
659 
660  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
661  tuple->t_len = ItemIdGetLength(lpp);
662  ItemPointerSet(&(tuple->t_self), page, lineoff);
663 
664  /*
665  * if current tuple qualifies, return it.
666  */
667  valid = HeapTupleSatisfiesVisibility(tuple,
668  snapshot,
669  scan->rs_cbuf);
670 
 671  CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
 672  tuple, scan->rs_cbuf,
673  snapshot);
674 
675  if (valid && key != NULL)
 676  HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
 677  nkeys, key, valid);
678 
679  if (valid)
680  {
 681  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 682  return;
683  }
684  }
685 
686  /*
687  * otherwise move to the next item on the page
688  */
689  --linesleft;
690  if (backward)
691  {
692  --lpp; /* move back in this page's ItemId array */
693  --lineoff;
694  }
695  else
696  {
697  ++lpp; /* move forward in this page's ItemId array */
698  ++lineoff;
699  }
700  }
701 
702  /*
703  * if we get here, it means we've exhausted the items on this page and
704  * it's time to move to the next.
705  */
 706  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 707 
708  /*
709  * advance to next/prior page and detect end of scan
710  */
711  if (backward)
712  {
713  finished = (page == scan->rs_startblock) ||
714  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
715  if (page == 0)
716  page = scan->rs_nblocks;
717  page--;
718  }
719  else if (scan->rs_base.rs_parallel != NULL)
720  {
 721  ParallelBlockTableScanDesc pbscan =
 722  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
 723 
 724  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
 725  pbscan);
726  finished = (page == InvalidBlockNumber);
727  }
728  else
729  {
730  page++;
731  if (page >= scan->rs_nblocks)
732  page = 0;
733  finished = (page == scan->rs_startblock) ||
734  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
735 
736  /*
737  * Report our new scan position for synchronization purposes. We
738  * don't do that when moving backwards, however. That would just
739  * mess up any other forward-moving scanners.
740  *
741  * Note: we do this before checking for end of scan so that the
742  * final state of the position hint is back at the start of the
743  * rel. That's not strictly necessary, but otherwise when you run
744  * the same query multiple times the starting position would shift
745  * a little bit backwards on every invocation, which is confusing.
746  * We don't guarantee any specific ordering in general, though.
747  */
748  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
749  ss_report_location(scan->rs_base.rs_rd, page);
750  }
751 
752  /*
753  * return NULL if we've exhausted all the pages
754  */
755  if (finished)
756  {
757  if (BufferIsValid(scan->rs_cbuf))
758  ReleaseBuffer(scan->rs_cbuf);
759  scan->rs_cbuf = InvalidBuffer;
 760  scan->rs_cblock = InvalidBlockNumber;
 761  tuple->t_data = NULL;
762  scan->rs_inited = false;
763  return;
764  }
765 
766  heapgetpage((TableScanDesc) scan, page);
767 
768  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
769 
770  dp = BufferGetPage(scan->rs_cbuf);
771  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
772  lines = PageGetMaxOffsetNumber((Page) dp);
773  linesleft = lines;
774  if (backward)
775  {
776  lineoff = lines;
777  lpp = PageGetItemId(dp, lines);
778  }
779  else
780  {
781  lineoff = FirstOffsetNumber;
782  lpp = PageGetItemId(dp, FirstOffsetNumber);
783  }
784  }
785 }
786 
787 /* ----------------
788  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
789  *
790  * Same API as heapgettup, but used in page-at-a-time mode
791  *
792  * The internal logic is much the same as heapgettup's too, but there are some
793  * differences: we do not take the buffer content lock (that only needs to
794  * happen inside heapgetpage), and we iterate through just the tuples listed
795  * in rs_vistuples[] rather than all tuples on the page. Notice that
796  * lineindex is 0-based, where the corresponding loop variable lineoff in
797  * heapgettup is 1-based.
798  * ----------------
799  */
800 static void
 801 heapgettup_pagemode(HeapScanDesc scan,
 802  ScanDirection dir,
803  int nkeys,
804  ScanKey key)
805 {
806  HeapTuple tuple = &(scan->rs_ctup);
807  bool backward = ScanDirectionIsBackward(dir);
808  BlockNumber page;
809  bool finished;
810  Page dp;
811  int lines;
812  int lineindex;
813  OffsetNumber lineoff;
814  int linesleft;
815  ItemId lpp;
816 
817  /*
818  * calculate next starting lineindex, given scan direction
819  */
820  if (ScanDirectionIsForward(dir))
821  {
822  if (!scan->rs_inited)
823  {
824  /*
825  * return null immediately if relation is empty
826  */
827  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
828  {
829  Assert(!BufferIsValid(scan->rs_cbuf));
830  tuple->t_data = NULL;
831  return;
832  }
833  if (scan->rs_base.rs_parallel != NULL)
834  {
 835  ParallelBlockTableScanDesc pbscan =
 836  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
 837 
 838  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
 839  pbscan);
 840 
 841  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
 842  pbscan);
843 
844  /* Other processes might have already finished the scan. */
845  if (page == InvalidBlockNumber)
846  {
847  Assert(!BufferIsValid(scan->rs_cbuf));
848  tuple->t_data = NULL;
849  return;
850  }
851  }
852  else
853  page = scan->rs_startblock; /* first page */
854  heapgetpage((TableScanDesc) scan, page);
855  lineindex = 0;
856  scan->rs_inited = true;
857  }
858  else
859  {
860  /* continue from previously returned page/tuple */
861  page = scan->rs_cblock; /* current page */
862  lineindex = scan->rs_cindex + 1;
863  }
864 
865  dp = BufferGetPage(scan->rs_cbuf);
 866  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
 867  lines = scan->rs_ntuples;
868  /* page and lineindex now reference the next visible tid */
869 
870  linesleft = lines - lineindex;
871  }
872  else if (backward)
873  {
874  /* backward parallel scan not supported */
875  Assert(scan->rs_base.rs_parallel == NULL);
876 
877  if (!scan->rs_inited)
878  {
879  /*
880  * return null immediately if relation is empty
881  */
882  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
883  {
884  Assert(!BufferIsValid(scan->rs_cbuf));
885  tuple->t_data = NULL;
886  return;
887  }
888 
889  /*
890  * Disable reporting to syncscan logic in a backwards scan; it's
891  * not very likely anyone else is doing the same thing at the same
892  * time, and much more likely that we'll just bollix things for
893  * forward scanners.
894  */
895  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
896  /* start from last page of the scan */
897  if (scan->rs_startblock > 0)
898  page = scan->rs_startblock - 1;
899  else
900  page = scan->rs_nblocks - 1;
901  heapgetpage((TableScanDesc) scan, page);
902  }
903  else
904  {
905  /* continue from previously returned page/tuple */
906  page = scan->rs_cblock; /* current page */
907  }
908 
909  dp = BufferGetPage(scan->rs_cbuf);
 910  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
 911  lines = scan->rs_ntuples;
912 
913  if (!scan->rs_inited)
914  {
915  lineindex = lines - 1;
916  scan->rs_inited = true;
917  }
918  else
919  {
920  lineindex = scan->rs_cindex - 1;
921  }
922  /* page and lineindex now reference the previous visible tid */
923 
924  linesleft = lineindex + 1;
925  }
926  else
927  {
928  /*
929  * ``no movement'' scan direction: refetch prior tuple
930  */
931  if (!scan->rs_inited)
932  {
933  Assert(!BufferIsValid(scan->rs_cbuf));
934  tuple->t_data = NULL;
935  return;
936  }
937 
938  page = ItemPointerGetBlockNumber(&(tuple->t_self));
939  if (page != scan->rs_cblock)
940  heapgetpage((TableScanDesc) scan, page);
941 
942  /* Since the tuple was previously fetched, needn't lock page here */
943  dp = BufferGetPage(scan->rs_cbuf);
944  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
945  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
946  lpp = PageGetItemId(dp, lineoff);
947  Assert(ItemIdIsNormal(lpp));
948 
949  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
950  tuple->t_len = ItemIdGetLength(lpp);
951 
952  /* check that rs_cindex is in sync */
953  Assert(scan->rs_cindex < scan->rs_ntuples);
954  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
955 
956  return;
957  }
958 
959  /*
960  * advance the scan until we find a qualifying tuple or run out of stuff
961  * to scan
962  */
963  for (;;)
964  {
965  while (linesleft > 0)
966  {
967  lineoff = scan->rs_vistuples[lineindex];
968  lpp = PageGetItemId(dp, lineoff);
969  Assert(ItemIdIsNormal(lpp));
970 
971  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
972  tuple->t_len = ItemIdGetLength(lpp);
973  ItemPointerSet(&(tuple->t_self), page, lineoff);
974 
975  /*
976  * if current tuple qualifies, return it.
977  */
978  if (key != NULL)
979  {
980  bool valid;
981 
 982  HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
 983  nkeys, key, valid);
984  if (valid)
985  {
986  scan->rs_cindex = lineindex;
987  return;
988  }
989  }
990  else
991  {
992  scan->rs_cindex = lineindex;
993  return;
994  }
995 
996  /*
997  * otherwise move to the next item on the page
998  */
999  --linesleft;
1000  if (backward)
1001  --lineindex;
1002  else
1003  ++lineindex;
1004  }
1005 
1006  /*
1007  * if we get here, it means we've exhausted the items on this page and
1008  * it's time to move to the next.
1009  */
1010  if (backward)
1011  {
1012  finished = (page == scan->rs_startblock) ||
1013  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1014  if (page == 0)
1015  page = scan->rs_nblocks;
1016  page--;
1017  }
1018  else if (scan->rs_base.rs_parallel != NULL)
1019  {
 1020  ParallelBlockTableScanDesc pbscan =
 1021  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
 1022 
 1023  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
 1024  pbscan);
1025  finished = (page == InvalidBlockNumber);
1026  }
1027  else
1028  {
1029  page++;
1030  if (page >= scan->rs_nblocks)
1031  page = 0;
1032  finished = (page == scan->rs_startblock) ||
1033  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1034 
1035  /*
1036  * Report our new scan position for synchronization purposes. We
1037  * don't do that when moving backwards, however. That would just
1038  * mess up any other forward-moving scanners.
1039  *
1040  * Note: we do this before checking for end of scan so that the
1041  * final state of the position hint is back at the start of the
1042  * rel. That's not strictly necessary, but otherwise when you run
1043  * the same query multiple times the starting position would shift
1044  * a little bit backwards on every invocation, which is confusing.
1045  * We don't guarantee any specific ordering in general, though.
1046  */
1047  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1048  ss_report_location(scan->rs_base.rs_rd, page);
1049  }
1050 
1051  /*
1052  * return NULL if we've exhausted all the pages
1053  */
1054  if (finished)
1055  {
1056  if (BufferIsValid(scan->rs_cbuf))
1057  ReleaseBuffer(scan->rs_cbuf);
1058  scan->rs_cbuf = InvalidBuffer;
1059  scan->rs_cblock = InvalidBlockNumber;
1060  tuple->t_data = NULL;
1061  scan->rs_inited = false;
1062  return;
1063  }
1064 
1065  heapgetpage((TableScanDesc) scan, page);
1066 
1067  dp = BufferGetPage(scan->rs_cbuf);
1068  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1069  lines = scan->rs_ntuples;
1070  linesleft = lines;
1071  if (backward)
1072  lineindex = lines - 1;
1073  else
1074  lineindex = 0;
1075  }
1076 }
1077 
1078 
1079 #if defined(DISABLE_COMPLEX_MACRO)
1080 /*
1081  * This is formatted so oddly so that the correspondence to the macro
1082  * definition in access/htup_details.h is maintained.
1083  */
1084 Datum
1085 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1086  bool *isnull)
1087 {
1088  return (
1089  (attnum) > 0 ?
1090  (
1091  (*(isnull) = false),
1092  HeapTupleNoNulls(tup) ?
1093  (
1094  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1095  (
1096  fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1097  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1098  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1099  )
1100  :
1101  nocachegetattr((tup), (attnum), (tupleDesc))
1102  )
1103  :
1104  (
1105  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1106  (
1107  (*(isnull) = true),
1108  (Datum) NULL
1109  )
1110  :
1111  (
1112  nocachegetattr((tup), (attnum), (tupleDesc))
1113  )
1114  )
1115  )
1116  :
1117  (
1118  (Datum) NULL
1119  )
1120  );
1121 }
1122 #endif /* defined(DISABLE_COMPLEX_MACRO) */
1123 
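/*
 * Editor's illustration (not part of heapam.c): fastgetattr() (whether the
 * macro or the out-of-line function above) extracts one attribute by 1-based
 * attribute number, reporting NULLs through *isnull.
 */
static Datum
example_first_attribute(HeapTuple tup, TupleDesc tupdesc, bool *isnull)
{
	return fastgetattr(tup, 1, tupdesc, isnull);
}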
1124 
1125 /* ----------------------------------------------------------------
1126  * heap access method interface
1127  * ----------------------------------------------------------------
1128  */
1129 
1130 
 1131 TableScanDesc
 1132 heap_beginscan(Relation relation, Snapshot snapshot,
1133  int nkeys, ScanKey key,
1134  ParallelTableScanDesc parallel_scan,
1135  uint32 flags)
1136 {
1137  HeapScanDesc scan;
1138 
1139  /*
1140  * increment relation ref count while scanning relation
1141  *
1142  * This is just to make really sure the relcache entry won't go away while
1143  * the scan has a pointer to it. Caller should be holding the rel open
1144  * anyway, so this is redundant in all normal scenarios...
1145  */
 1146  RelationIncrementReferenceCount(relation);
 1147 
1148  /*
1149  * allocate and initialize scan descriptor
1150  */
1151  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1152 
1153  scan->rs_base.rs_rd = relation;
1154  scan->rs_base.rs_snapshot = snapshot;
1155  scan->rs_base.rs_nkeys = nkeys;
1156  scan->rs_base.rs_flags = flags;
1157  scan->rs_base.rs_parallel = parallel_scan;
1158  scan->rs_strategy = NULL; /* set in initscan */
1159 
1160  /*
1161  * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1162  */
1163  if (!(snapshot && IsMVCCSnapshot(snapshot)))
 1164  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
 1165 
1166  /*
1167  * For seqscan and sample scans in a serializable transaction, acquire a
1168  * predicate lock on the entire relation. This is required not only to
1169  * lock all the matching tuples, but also to conflict with new insertions
1170  * into the table. In an indexscan, we take page locks on the index pages
1171  * covering the range specified in the scan qual, but in a heap scan there
1172  * is nothing more fine-grained to lock. A bitmap scan is a different
1173  * story, there we have already scanned the index and locked the index
1174  * pages covering the predicate. But in that case we still have to lock
1175  * any matching heap tuples. For sample scan we could optimize the locking
1176  * to be at least page-level granularity, but we'd need to add per-tuple
1177  * locking for that.
1178  */
 1179  if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
 1180  {
1181  /*
1182  * Ensure a missing snapshot is noticed reliably, even if the
1183  * isolation mode means predicate locking isn't performed (and
1184  * therefore the snapshot isn't used here).
1185  */
1186  Assert(snapshot);
1187  PredicateLockRelation(relation, snapshot);
1188  }
1189 
1190  /* we only need to set this up once */
1191  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1192 
1193  /*
1194  * we do this here instead of in initscan() because heap_rescan also calls
1195  * initscan() and we don't want to allocate memory again
1196  */
1197  if (nkeys > 0)
1198  scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1199  else
1200  scan->rs_base.rs_key = NULL;
1201 
1202  initscan(scan, key, false);
1203 
1204  return (TableScanDesc) scan;
1205 }
1206 
1207 void
1208 heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1209  bool allow_strat, bool allow_sync, bool allow_pagemode)
1210 {
1211  HeapScanDesc scan = (HeapScanDesc) sscan;
1212 
1213  if (set_params)
1214  {
1215  if (allow_strat)
1216  scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1217  else
1218  scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1219 
1220  if (allow_sync)
1221  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1222  else
1223  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1224 
1225  if (allow_pagemode && scan->rs_base.rs_snapshot &&
 1226  IsMVCCSnapshot(scan->rs_base.rs_snapshot))
 1227  scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
 1228  else
 1229  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
 1230  }
1231 
1232  /*
1233  * unpin scan buffers
1234  */
1235  if (BufferIsValid(scan->rs_cbuf))
1236  ReleaseBuffer(scan->rs_cbuf);
1237 
1238  /*
1239  * reinitialize scan descriptor
1240  */
1241  initscan(scan, key, true);
1242 }
1243 
1244 void
 1245 heap_endscan(TableScanDesc sscan)
 1246 {
1247  HeapScanDesc scan = (HeapScanDesc) sscan;
1248 
1249  /* Note: no locking manipulations needed */
1250 
1251  /*
1252  * unpin scan buffers
1253  */
1254  if (BufferIsValid(scan->rs_cbuf))
1255  ReleaseBuffer(scan->rs_cbuf);
1256 
1257  /*
1258  * decrement relation reference count and free scan descriptor storage
1259  */
 1260  RelationDecrementReferenceCount(scan->rs_base.rs_rd);
 1261 
1262  if (scan->rs_base.rs_key)
1263  pfree(scan->rs_base.rs_key);
1264 
1265  if (scan->rs_strategy != NULL)
 1266  FreeAccessStrategy(scan->rs_strategy);
 1267 
1268  if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
 1269  UnregisterSnapshot(scan->rs_base.rs_snapshot);
 1270 
1271  pfree(scan);
1272 }
1273 
1274 #ifdef HEAPDEBUGALL
1275 #define HEAPDEBUG_1 \
1276  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1277  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1278 #define HEAPDEBUG_2 \
1279  elog(DEBUG2, "heap_getnext returning EOS")
1280 #define HEAPDEBUG_3 \
1281  elog(DEBUG2, "heap_getnext returning tuple")
1282 #else
1283 #define HEAPDEBUG_1
1284 #define HEAPDEBUG_2
1285 #define HEAPDEBUG_3
1286 #endif /* !defined(HEAPDEBUGALL) */
1287 
1288 
1289 HeapTuple
 1290 heap_getnext(TableScanDesc sscan, ScanDirection direction)
 1291 {
1292  HeapScanDesc scan = (HeapScanDesc) sscan;
1293 
1294  /*
1295  * This is still widely used directly, without going through table AM, so
1296  * add a safety check. It's possible we should, at a later point,
1297  * downgrade this to an assert. The reason for checking the AM routine,
1298  * rather than the AM oid, is that this allows to write regression tests
1299  * that create another AM reusing the heap handler.
1300  */
 1301  if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
 1302  ereport(ERROR,
1303  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1304  errmsg_internal("only heap AM is supported")));
1305 
1306  /* Note: no locking manipulations needed */
1307 
1308  HEAPDEBUG_1; /* heap_getnext( info ) */
1309 
1310  if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1311  heapgettup_pagemode(scan, direction,
1312  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1313  else
1314  heapgettup(scan, direction,
1315  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1316 
1317  if (scan->rs_ctup.t_data == NULL)
1318  {
1319  HEAPDEBUG_2; /* heap_getnext returning EOS */
1320  return NULL;
1321  }
1322 
1323  /*
1324  * if we get here it means we have a new current scan tuple, so point to
1325  * the proper return buffer and return the tuple.
1326  */
1327  HEAPDEBUG_3; /* heap_getnext returning tuple */
1328 
 1329  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
 1330 
1331  return &scan->rs_ctup;
1332 }
1333 
1334 #ifdef HEAPAMSLOTDEBUGALL
1335 #define HEAPAMSLOTDEBUG_1 \
1336  elog(DEBUG2, "heapam_getnextslot([%s,nkeys=%d],dir=%d) called", \
1337  RelationGetRelationName(scan->rs_base.rs_rd), scan->rs_base.rs_nkeys, (int) direction)
1338 #define HEAPAMSLOTDEBUG_2 \
1339  elog(DEBUG2, "heapam_getnextslot returning EOS")
1340 #define HEAPAMSLOTDEBUG_3 \
1341  elog(DEBUG2, "heapam_getnextslot returning tuple")
1342 #else
1343 #define HEAPAMSLOTDEBUG_1
1344 #define HEAPAMSLOTDEBUG_2
1345 #define HEAPAMSLOTDEBUG_3
1346 #endif
1347 
1348 bool
 1349 heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 1350 {
1351  HeapScanDesc scan = (HeapScanDesc) sscan;
1352 
1353  /* Note: no locking manipulations needed */
1354 
1355  HEAPAMSLOTDEBUG_1; /* heap_getnextslot( info ) */
1356 
1357  if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1358  heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1359  else
1360  heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1361 
1362  if (scan->rs_ctup.t_data == NULL)
1363  {
1364  HEAPAMSLOTDEBUG_2; /* heap_getnextslot returning EOS */
1365  ExecClearTuple(slot);
1366  return false;
1367  }
1368 
1369  /*
1370  * if we get here it means we have a new current scan tuple, so point to
1371  * the proper return buffer and return the tuple.
1372  */
1373  HEAPAMSLOTDEBUG_3; /* heap_getnextslot returning tuple */
1374 
 1375  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
 1376 
1377  ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1378  scan->rs_cbuf);
1379  return true;
1380 }
1381 
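/*
 * Editor's illustration (not part of heapam.c): slot-based iteration with
 * heap_getnextslot(), the form used by the table AM layer.  A buffer-heap-
 * tuple slot keeps a pin on the scan's current buffer until the next call.
 */
static void
example_scan_with_slot(Relation rel, Snapshot snapshot)
{
	TableScanDesc sscan;
	TupleTableSlot *slot;

	slot = MakeSingleTupleTableSlot(RelationGetDescr(rel),
									&TTSOpsBufferHeapTuple);
	sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
						   SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE);

	while (heap_getnextslot(sscan, ForwardScanDirection, slot))
	{
		/* the slot now holds the current tuple */
	}

	heap_endscan(sscan);
	ExecDropSingleTupleTableSlot(slot);
}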
1382 /*
1383  * heap_fetch - retrieve tuple with given tid
1384  *
1385  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1386  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1387  * against the specified snapshot.
1388  *
1389  * If successful (tuple found and passes snapshot time qual), then *userbuf
1390  * is set to the buffer holding the tuple and true is returned. The caller
1391  * must unpin the buffer when done with the tuple.
1392  *
1393  * If the tuple is not found (ie, item number references a deleted slot),
1394  * then tuple->t_data is set to NULL and false is returned.
1395  *
1396  * If the tuple is found but fails the time qual check, then false is returned
1397  * but tuple->t_data is left pointing to the tuple.
1398  *
1399  * heap_fetch does not follow HOT chains: only the exact TID requested will
1400  * be fetched.
1401  *
1402  * It is somewhat inconsistent that we ereport() on invalid block number but
1403  * return false on invalid item number. There are a couple of reasons though.
1404  * One is that the caller can relatively easily check the block number for
1405  * validity, but cannot check the item number without reading the page
 1406  * itself. Another is that when we are following a t_ctid link, we can be
1407  * reasonably confident that the page number is valid (since VACUUM shouldn't
1408  * truncate off the destination page without having killed the referencing
1409  * tuple first), but the item number might well not be good.
1410  */
1411 bool
 1412 heap_fetch(Relation relation,
 1413  Snapshot snapshot,
1414  HeapTuple tuple,
1415  Buffer *userbuf)
1416 {
1417  ItemPointer tid = &(tuple->t_self);
1418  ItemId lp;
1419  Buffer buffer;
1420  Page page;
1421  OffsetNumber offnum;
1422  bool valid;
1423 
1424  /*
1425  * Fetch and pin the appropriate page of the relation.
1426  */
1427  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1428 
1429  /*
1430  * Need share lock on buffer to examine tuple commit status.
1431  */
1432  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1433  page = BufferGetPage(buffer);
1434  TestForOldSnapshot(snapshot, relation, page);
1435 
1436  /*
1437  * We'd better check for out-of-range offnum in case of VACUUM since the
1438  * TID was obtained.
1439  */
1440  offnum = ItemPointerGetOffsetNumber(tid);
1441  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1442  {
1443  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1444  ReleaseBuffer(buffer);
1445  *userbuf = InvalidBuffer;
1446  tuple->t_data = NULL;
1447  return false;
1448  }
1449 
1450  /*
1451  * get the item line pointer corresponding to the requested tid
1452  */
1453  lp = PageGetItemId(page, offnum);
1454 
1455  /*
1456  * Must check for deleted tuple.
1457  */
1458  if (!ItemIdIsNormal(lp))
1459  {
1460  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1461  ReleaseBuffer(buffer);
1462  *userbuf = InvalidBuffer;
1463  tuple->t_data = NULL;
1464  return false;
1465  }
1466 
1467  /*
1468  * fill in *tuple fields
1469  */
1470  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1471  tuple->t_len = ItemIdGetLength(lp);
1472  tuple->t_tableOid = RelationGetRelid(relation);
1473 
1474  /*
1475  * check tuple visibility, then release lock
1476  */
1477  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1478 
1479  if (valid)
1480  PredicateLockTuple(relation, tuple, snapshot);
1481 
1482  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1483 
1484  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1485 
1486  if (valid)
1487  {
1488  /*
1489  * All checks passed, so return the tuple as valid. Caller is now
1490  * responsible for releasing the buffer.
1491  */
1492  *userbuf = buffer;
1493 
1494  return true;
1495  }
1496 
1497  /* Tuple failed time qual */
1498  ReleaseBuffer(buffer);
1499  *userbuf = InvalidBuffer;
1500 
1501  return false;
1502 }
1503 
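/*
 * Editor's illustration (not part of heapam.c): fetching one specific TID with
 * heap_fetch() and releasing the buffer pin that a successful fetch returns.
 * "rel", "snapshot" and "tid" are assumed to come from the caller.
 */
static bool
example_fetch_tid(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;

	tuple.t_self = *tid;
	if (heap_fetch(rel, snapshot, &tuple, &buffer))
	{
		/* tuple.t_data stays valid while we hold the buffer pin */
		ReleaseBuffer(buffer);
		return true;
	}
	return false;				/* deleted slot, or failed the snapshot test */
}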
1504 /*
1505  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1506  *
1507  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1508  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1509  * for the first chain member satisfying the given snapshot. If one is
1510  * found, we update *tid to reference that tuple's offset number, and
1511  * return true. If no match, return false without modifying *tid.
1512  *
1513  * heapTuple is a caller-supplied buffer. When a match is found, we return
1514  * the tuple here, in addition to updating *tid. If no match is found, the
1515  * contents of this buffer on return are undefined.
1516  *
1517  * If all_dead is not NULL, we check non-visible tuples to see if they are
1518  * globally dead; *all_dead is set true if all members of the HOT chain
1519  * are vacuumable, false if not.
1520  *
1521  * Unlike heap_fetch, the caller must already have pin and (at least) share
1522  * lock on the buffer; it is still pinned/locked at exit. Also unlike
1523  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1524  */
1525 bool
 1526 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 1527  Snapshot snapshot, HeapTuple heapTuple,
1528  bool *all_dead, bool first_call)
1529 {
1530  Page dp = (Page) BufferGetPage(buffer);
1531  TransactionId prev_xmax = InvalidTransactionId;
1532  BlockNumber blkno;
1533  OffsetNumber offnum;
1534  bool at_chain_start;
1535  bool valid;
1536  bool skip;
1537 
1538  /* If this is not the first call, previous call returned a (live!) tuple */
1539  if (all_dead)
1540  *all_dead = first_call;
1541 
1542  blkno = ItemPointerGetBlockNumber(tid);
1543  offnum = ItemPointerGetOffsetNumber(tid);
1544  at_chain_start = first_call;
1545  skip = !first_call;
1546 
 1547  Assert(TransactionIdIsValid(RecentGlobalXmin));
 1548  Assert(BufferGetBlockNumber(buffer) == blkno);
1549 
1550  /* Scan through possible multiple members of HOT-chain */
1551  for (;;)
1552  {
1553  ItemId lp;
1554 
1555  /* check for bogus TID */
1556  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1557  break;
1558 
1559  lp = PageGetItemId(dp, offnum);
1560 
1561  /* check for unused, dead, or redirected items */
1562  if (!ItemIdIsNormal(lp))
1563  {
1564  /* We should only see a redirect at start of chain */
1565  if (ItemIdIsRedirected(lp) && at_chain_start)
1566  {
1567  /* Follow the redirect */
1568  offnum = ItemIdGetRedirect(lp);
1569  at_chain_start = false;
1570  continue;
1571  }
1572  /* else must be end of chain */
1573  break;
1574  }
1575 
1576  /*
1577  * Update heapTuple to point to the element of the HOT chain we're
1578  * currently investigating. Having t_self set correctly is important
1579  * because the SSI checks and the *Satisfies routine for historical
1580  * MVCC snapshots need the correct tid to decide about the visibility.
1581  */
1582  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1583  heapTuple->t_len = ItemIdGetLength(lp);
1584  heapTuple->t_tableOid = RelationGetRelid(relation);
1585  ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1586 
1587  /*
1588  * Shouldn't see a HEAP_ONLY tuple at chain start.
1589  */
1590  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1591  break;
1592 
1593  /*
1594  * The xmin should match the previous xmax value, else chain is
1595  * broken.
1596  */
1597  if (TransactionIdIsValid(prev_xmax) &&
1598  !TransactionIdEquals(prev_xmax,
1599  HeapTupleHeaderGetXmin(heapTuple->t_data)))
1600  break;
1601 
1602  /*
1603  * When first_call is true (and thus, skip is initially false) we'll
1604  * return the first tuple we find. But on later passes, heapTuple
1605  * will initially be pointing to the tuple we returned last time.
1606  * Returning it again would be incorrect (and would loop forever), so
1607  * we skip it and return the next match we find.
1608  */
1609  if (!skip)
1610  {
1611  /* If it's visible per the snapshot, we must return it */
1612  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1613  CheckForSerializableConflictOut(valid, relation, heapTuple,
1614  buffer, snapshot);
1615 
1616  if (valid)
1617  {
1618  ItemPointerSetOffsetNumber(tid, offnum);
1619  PredicateLockTuple(relation, heapTuple, snapshot);
1620  if (all_dead)
1621  *all_dead = false;
1622  return true;
1623  }
1624  }
1625  skip = false;
1626 
1627  /*
1628  * If we can't see it, maybe no one else can either. At caller
1629  * request, check whether all chain members are dead to all
1630  * transactions.
1631  *
1632  * Note: if you change the criterion here for what is "dead", fix the
1633  * planner's get_actual_variable_range() function to match.
1634  */
1635  if (all_dead && *all_dead &&
 1636  !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
 1637  *all_dead = false;
1638 
1639  /*
1640  * Check to see if HOT chain continues past this tuple; if so fetch
1641  * the next offnum and loop around.
1642  */
1643  if (HeapTupleIsHotUpdated(heapTuple))
1644  {
 1645  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
 1646  blkno);
1647  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1648  at_chain_start = false;
1649  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1650  }
1651  else
1652  break; /* end of chain */
1653  }
1654 
1655  return false;
1656 }
1657 
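/*
 * Editor's illustration (not part of heapam.c): the caller-side protocol for
 * heap_hot_search_buffer().  Unlike heap_fetch(), the caller must already hold
 * a pin and at least a share lock on the buffer containing the root TID.
 */
static bool
example_hot_search(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	Buffer		buffer;
	HeapTupleData heapTuple;
	bool		found;

	buffer = ReadBuffer(rel, ItemPointerGetBlockNumber(tid));
	LockBuffer(buffer, BUFFER_LOCK_SHARE);
	found = heap_hot_search_buffer(tid, rel, buffer, snapshot,
								   &heapTuple, NULL, true);
	UnlockReleaseBuffer(buffer);
	return found;
}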
1658 /*
1659  * heap_get_latest_tid - get the latest tid of a specified tuple
1660  *
1661  * Actually, this gets the latest version that is visible according to the
1662  * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1663  * possibly uncommitted version.
1664  *
1665  * *tid is both an input and an output parameter: it is updated to
1666  * show the latest version of the row. Note that it will not be changed
1667  * if no version of the row passes the snapshot test.
1668  */
1669 void
 1670 heap_get_latest_tid(TableScanDesc sscan,
 1671  ItemPointer tid)
1672 {
1673  Relation relation = sscan->rs_rd;
1674  Snapshot snapshot = sscan->rs_snapshot;
1675  ItemPointerData ctid;
1676  TransactionId priorXmax;
1677 
1678  /*
1679  * table_get_latest_tid verified that the passed in tid is valid. Assume
1680  * that t_ctid links are valid however - there shouldn't be invalid ones
1681  * in the table.
1682  */
1683  Assert(ItemPointerIsValid(tid));
1684 
1685  /*
1686  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1687  * need to examine, and *tid is the TID we will return if ctid turns out
1688  * to be bogus.
1689  *
1690  * Note that we will loop until we reach the end of the t_ctid chain.
1691  * Depending on the snapshot passed, there might be at most one visible
1692  * version of the row, but we don't try to optimize for that.
1693  */
1694  ctid = *tid;
1695  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1696  for (;;)
1697  {
1698  Buffer buffer;
1699  Page page;
1700  OffsetNumber offnum;
1701  ItemId lp;
1702  HeapTupleData tp;
1703  bool valid;
1704 
1705  /*
1706  * Read, pin, and lock the page.
1707  */
1708  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1709  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1710  page = BufferGetPage(buffer);
1711  TestForOldSnapshot(snapshot, relation, page);
1712 
1713  /*
1714  * Check for bogus item number. This is not treated as an error
1715  * condition because it can happen while following a t_ctid link. We
1716  * just assume that the prior tid is OK and return it unchanged.
1717  */
1718  offnum = ItemPointerGetOffsetNumber(&ctid);
1719  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1720  {
1721  UnlockReleaseBuffer(buffer);
1722  break;
1723  }
1724  lp = PageGetItemId(page, offnum);
1725  if (!ItemIdIsNormal(lp))
1726  {
1727  UnlockReleaseBuffer(buffer);
1728  break;
1729  }
1730 
1731  /* OK to access the tuple */
1732  tp.t_self = ctid;
1733  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1734  tp.t_len = ItemIdGetLength(lp);
1735  tp.t_tableOid = RelationGetRelid(relation);
1736 
1737  /*
1738  * After following a t_ctid link, we might arrive at an unrelated
1739  * tuple. Check for XMIN match.
1740  */
1741  if (TransactionIdIsValid(priorXmax) &&
 1742  !TransactionIdEquals(HeapTupleHeaderGetXmin(tp.t_data), priorXmax))
 1743  {
1744  UnlockReleaseBuffer(buffer);
1745  break;
1746  }
1747 
1748  /*
1749  * Check tuple visibility; if visible, set it as the new result
1750  * candidate.
1751  */
1752  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1753  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1754  if (valid)
1755  *tid = ctid;
1756 
1757  /*
1758  * If there's a valid t_ctid link, follow it, else we're done.
1759  */
1760  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
 1761  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
 1762  HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
 1763  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
 1764  {
1765  UnlockReleaseBuffer(buffer);
1766  break;
1767  }
1768 
1769  ctid = tp.t_data->t_ctid;
1770  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1771  UnlockReleaseBuffer(buffer);
1772  } /* end of loop */
1773 }
1774 
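/*
 * Editor's illustration (not part of heapam.c): resolving a TID to its latest
 * version visible to a snapshot, using a throwaway scan descriptor only to
 * carry the relation and snapshot.  *tid is updated in place only if a newer
 * visible version exists.
 */
static void
example_resolve_latest_tid(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	TableScanDesc sscan;

	sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL, SO_TYPE_SEQSCAN);
	heap_get_latest_tid(sscan, tid);
	heap_endscan(sscan);
}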
1775 
1776 /*
1777  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1778  *
1779  * This is called after we have waited for the XMAX transaction to terminate.
1780  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1781  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1782  * hint bit if possible --- but beware that that may not yet be possible,
1783  * if the transaction committed asynchronously.
1784  *
1785  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1786  * even if it commits.
1787  *
1788  * Hence callers should look only at XMAX_INVALID.
1789  *
1790  * Note this is not allowed for tuples whose xmax is a multixact.
1791  */
1792 static void
 1793 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
 1794 {
 1795  Assert(TransactionIdIsValid(xid));
 1796  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1797 
1798  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1799  {
1800  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
 1801  TransactionIdDidCommit(xid))
 1802  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
 1803  xid);
1804  else
1805  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
 1806  InvalidTransactionId);
 1807  }
1808 }
1809 
1810 
1811 /*
1812  * GetBulkInsertState - prepare status object for a bulk insert
1813  */
 1814 BulkInsertState
 1815 GetBulkInsertState(void)
 1816 {
1817  BulkInsertState bistate;
1818 
1819  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
 1820  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
 1821  bistate->current_buf = InvalidBuffer;
1822  return bistate;
1823 }
1824 
1825 /*
1826  * FreeBulkInsertState - clean up after finishing a bulk insert
1827  */
1828 void
 1829 FreeBulkInsertState(BulkInsertState bistate)
 1830 {
1831  if (bistate->current_buf != InvalidBuffer)
1832  ReleaseBuffer(bistate->current_buf);
1833  FreeAccessStrategy(bistate->strategy);
1834  pfree(bistate);
1835 }
1836 
1837 /*
1838  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1839  */
1840 void
 1841 ReleaseBulkInsertStatePin(BulkInsertState bistate)
 1842 {
1843  if (bistate->current_buf != InvalidBuffer)
1844  ReleaseBuffer(bistate->current_buf);
1845  bistate->current_buf = InvalidBuffer;
1846 }
1847 
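/*
 * Editor's illustration (not part of heapam.c): the intended lifecycle of a
 * BulkInsertState around repeated heap_insert() calls, as in COPY-style bulk
 * loading.  "rel", "tuples", "ntuples" and "cid" are hypothetical values
 * supplied by the caller.
 */
static void
example_bulk_insert(Relation rel, HeapTuple *tuples, int ntuples, CommandId cid)
{
	BulkInsertState bistate = GetBulkInsertState();
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	FreeBulkInsertState(bistate);
}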
1848 
1849 /*
1850  * heap_insert - insert tuple into a heap
1851  *
1852  * The new tuple is stamped with current transaction ID and the specified
1853  * command ID.
1854  *
1855  * See table_tuple_insert for comments about most of the input flags, except
1856  * that this routine directly takes a tuple rather than a slot.
1857  *
1858  * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
1859  * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
1860  * implement table_tuple_insert_speculative().
1861  *
1862  * On return the header fields of *tup are updated to match the stored tuple;
1863  * in particular tup->t_self receives the actual TID where the tuple was
1864  * stored. But note that any toasting of fields within the tuple data is NOT
1865  * reflected into *tup.
1866  */
1867 void
 1868 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 1869  int options, BulkInsertState bistate)
1870 {
 1871  TransactionId xid = GetCurrentTransactionId();
 1872  HeapTuple heaptup;
1873  Buffer buffer;
1874  Buffer vmbuffer = InvalidBuffer;
1875  bool all_visible_cleared = false;
1876 
1877  /*
1878  * Fill in tuple header fields and toast the tuple if necessary.
1879  *
1880  * Note: below this point, heaptup is the data we actually intend to store
1881  * into the relation; tup is the caller's original untoasted data.
1882  */
1883  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1884 
1885  /*
1886  * Find buffer to insert this tuple into. If the page is all visible,
1887  * this will also pin the requisite visibility map page.
1888  */
1889  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1890  InvalidBuffer, options, bistate,
1891  &vmbuffer, NULL);
1892 
1893  /*
1894  * We're about to do the actual insert -- but check for conflict first, to
1895  * avoid possibly having to roll back work we've just done.
1896  *
1897  * This is safe without a recheck as long as there is no possibility of
1898  * another process scanning the page between this check and the insert
1899  * being visible to the scan (i.e., an exclusive buffer content lock is
1900  * continuously held from this point until the tuple insert is visible).
1901  *
1902  * For a heap insert, we only need to check for table-level SSI locks. Our
1903  * new tuple can't possibly conflict with existing tuple locks, and heap
1904  * page locks are only consolidated versions of tuple locks; they do not
1905  * lock "gaps" as index page locks do. So we don't need to specify a
1906  * buffer when making the call, which makes for a faster check.
1907  */
 1908  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
 1909 
1910  /* NO EREPORT(ERROR) from here till changes are logged */
 1911  START_CRIT_SECTION();
 1912 
1913  RelationPutHeapTuple(relation, buffer, heaptup,
1914  (options & HEAP_INSERT_SPECULATIVE) != 0);
1915 
1916  if (PageIsAllVisible(BufferGetPage(buffer)))
1917  {
1918  all_visible_cleared = true;
1919  PageClearAllVisible(BufferGetPage(buffer));
1920  visibilitymap_clear(relation,
1921  ItemPointerGetBlockNumber(&(heaptup->t_self)),
1922  vmbuffer, VISIBILITYMAP_VALID_BITS);
1923  }
1924 
1925  /*
1926  * XXX Should we set PageSetPrunable on this page ?
1927  *
1928  * The inserting transaction may eventually abort thus making this tuple
1929  * DEAD and hence available for pruning. Though we don't want to optimize
1930  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1931  * aborted tuple will never be pruned until next vacuum is triggered.
1932  *
1933  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1934  */
1935 
1936  MarkBufferDirty(buffer);
1937 
1938  /* XLOG stuff */
1939  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
1940  {
1941  xl_heap_insert xlrec;
1942  xl_heap_header xlhdr;
1943  XLogRecPtr recptr;
1944  Page page = BufferGetPage(buffer);
1945  uint8 info = XLOG_HEAP_INSERT;
1946  int bufflags = 0;
1947 
1948  /*
1949  * If this is a catalog, we need to transmit combocids to properly
1950  * decode, so log that as well.
1951  */
1952  if (RelationIsAccessibleInLogicalDecoding(relation))
1953  log_heap_new_cid(relation, heaptup);
1954 
1955  /*
1956  * If this is the first and only tuple on the page, we can reinit the
1957  * page instead of restoring the whole thing. Set flag, and hide
1958  * buffer references from XLogInsert.
1959  */
1960  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1961  PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1962  {
1963  info |= XLOG_HEAP_INIT_PAGE;
1964  bufflags |= REGBUF_WILL_INIT;
1965  }
1966 
1967  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1968  xlrec.flags = 0;
1969  if (all_visible_cleared)
1970  xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
1971  if (options & HEAP_INSERT_SPECULATIVE)
1972  xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
1973  Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
1974 
1975  /*
1976  * For logical decoding, we need the tuple even if we're doing a full
1977  * page write, so make sure it's included even if we take a full-page
1978  * image. (XXX We could alternatively store a pointer into the FPW).
1979  */
1980  if (RelationIsLogicallyLogged(relation) &&
1981  !(options & HEAP_INSERT_NO_LOGICAL))
1982  {
1983  xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
1984  bufflags |= REGBUF_KEEP_DATA;
1985  }
1986 
1987  XLogBeginInsert();
1988  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1989 
1990  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1991  xlhdr.t_infomask = heaptup->t_data->t_infomask;
1992  xlhdr.t_hoff = heaptup->t_data->t_hoff;
1993 
1994  /*
1995  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1996  * write the whole page to the xlog, we don't need to store
1997  * xl_heap_header in the xlog.
1998  */
1999  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2000  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2001  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2002  XLogRegisterBufData(0,
2003  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2004  heaptup->t_len - SizeofHeapTupleHeader);
2005 
2006  /* filtering by origin on a row level is much more efficient */
2007  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2008 
2009  recptr = XLogInsert(RM_HEAP_ID, info);
2010 
2011  PageSetLSN(page, recptr);
2012  }
2013 
2014  END_CRIT_SECTION();
2015 
2016  UnlockReleaseBuffer(buffer);
2017  if (vmbuffer != InvalidBuffer)
2018  ReleaseBuffer(vmbuffer);
2019 
2020  /*
2021  * If tuple is cachable, mark it for invalidation from the caches in case
2022  * we abort. Note it is OK to do this after releasing the buffer, because
2023  * the heaptup data structure is all in local memory, not in the shared
2024  * buffer.
2025  */
2026  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2027 
2028  /* Note: speculative insertions are counted too, even if aborted later */
2029  pgstat_count_heap_insert(relation, 1);
2030 
2031  /*
2032  * If heaptup is a private copy, release it. Don't forget to copy t_self
2033  * back to the caller's image, too.
2034  */
2035  if (heaptup != tup)
2036  {
2037  tup->t_self = heaptup->t_self;
2038  heap_freetuple(heaptup);
2039  }
2040 }
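As the header comment notes, heap_insert() stamps tup->t_self with the TID that was actually chosen. A hedged sketch of a caller relying on that contract (hypothetical helper name; index maintenance and error handling omitted):

 #include "postgres.h"

 #include "access/heapam.h"
 #include "access/xact.h"
 #include "storage/itemptr.h"

 /* Insert one tuple and return the TID heap_insert() assigned to it. */
 static ItemPointerData
 insert_and_get_tid_sketch(Relation rel, HeapTuple tup)
 {
     heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

     /* t_self now names the block/offset where the tuple was stored. */
     return tup->t_self;
 }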
2041 
2042 /*
2043  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2044  * tuple header fields and toasts the tuple if necessary. Returns a toasted
2045  * version of the tuple if it was toasted, or the original tuple if not. Note
2046  * that in any case, the header fields are also set in the original tuple.
2047  */
2048 static HeapTuple
2049 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2050  CommandId cid, int options)
2051 {
2052  /*
2053  * Parallel operations are required to be strictly read-only in a parallel
2054  * worker. Parallel inserts are not safe even in the leader in the
2055  * general case, because group locking means that heavyweight locks for
2056  * relation extension or GIN page locks will not conflict between members
2057  * of a lock group, but we don't prohibit that case here because there are
2058  * useful special cases that we can safely allow, such as CREATE TABLE AS.
2059  */
2060  if (IsParallelWorker())
2061  ereport(ERROR,
2062  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2063  errmsg("cannot insert tuples in a parallel worker")));
2064 
2065  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2066  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2067  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2068  HeapTupleHeaderSetXmin(tup->t_data, xid);
2069  if (options & HEAP_INSERT_FROZEN)
2070  HeapTupleHeaderSetXminFrozen(tup->t_data);
2071 
2072  HeapTupleHeaderSetCmin(tup->t_data, cid);
2073  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2074  tup->t_tableOid = RelationGetRelid(relation);
2075 
2076  /*
2077  * If the new tuple is too big for storage or contains already toasted
2078  * out-of-line attributes from some other relation, invoke the toaster.
2079  */
2080  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2081  relation->rd_rel->relkind != RELKIND_MATVIEW)
2082  {
2083  /* toast table entries should never be recursively toasted */
2084  Assert(!HeapTupleHasExternal(tup));
2085  return tup;
2086  }
2087  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2088  return toast_insert_or_update(relation, tup, NULL, options);
2089  else
2090  return tup;
2091 }
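heap_prepare_insert() only calls the toaster for plain tables and matviews whose tuples either carry external values or exceed TOAST_TUPLE_THRESHOLD. The same test, pulled out as a small illustrative predicate (hypothetical helper, assuming an ordinary heap relation):

 #include "postgres.h"

 #include "access/htup_details.h"
 #include "access/tuptoaster.h"

 /* Would heap_insert() route this tuple through the toaster?  Illustration only. */
 static bool
 tuple_needs_toast_sketch(HeapTuple tup)
 {
     return HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD;
 }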
2092 
2093 /*
2094  * heap_multi_insert - insert multiple tuples into a heap
2095  *
2096  * This is like heap_insert(), but inserts multiple tuples in one operation.
2097  * That's faster than calling heap_insert() in a loop, because when multiple
2098  * tuples can be inserted on a single page, we can write just a single WAL
2099  * record covering all of them, and only need to lock/unlock the page once.
2100  *
2101  * Note: this leaks memory into the current memory context. You can create a
2102  * temporary context before calling this, if that's a problem.
2103  */
2104 void
2105 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2106  CommandId cid, int options, BulkInsertState bistate)
2107 {
2108  TransactionId xid = GetCurrentTransactionId();
2109  HeapTuple *heaptuples;
2110  int i;
2111  int ndone;
2112  PGAlignedBlock scratch;
2113  Page page;
2114  bool needwal;
2115  Size saveFreeSpace;
2116  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2117  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2118 
2119  /* currently not needed (thus unsupported) for heap_multi_insert() */
2120  AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2121 
2122  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2123  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2124  HEAP_DEFAULT_FILLFACTOR);
2125 
2126  /* Toast and set header data in all the slots */
2127  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2128  for (i = 0; i < ntuples; i++)
2129  {
2130  HeapTuple tuple;
2131 
2132  tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2133  slots[i]->tts_tableOid = RelationGetRelid(relation);
2134  tuple->t_tableOid = slots[i]->tts_tableOid;
2135  heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2136  options);
2137  }
2138 
2139  /*
2140  * We're about to do the actual inserts -- but check for conflict first,
2141  * to minimize the possibility of having to roll back work we've just
2142  * done.
2143  *
2144  * A check here does not definitively prevent a serialization anomaly;
2145  * that check MUST be done at least past the point of acquiring an
2146  * exclusive buffer content lock on every buffer that will be affected,
2147  * and MAY be done after all inserts are reflected in the buffers and
2148  * those locks are released; otherwise there is a race condition. Since
2149  * multiple buffers can be locked and unlocked in the loop below, and it
2150  * would not be feasible to identify and lock all of those buffers before
2151  * the loop, we must do a final check at the end.
2152  *
2153  * The check here could be omitted with no loss of correctness; it is
2154  * present strictly as an optimization.
2155  *
2156  * For heap inserts, we only need to check for table-level SSI locks. Our
2157  * new tuples can't possibly conflict with existing tuple locks, and heap
2158  * page locks are only consolidated versions of tuple locks; they do not
2159  * lock "gaps" as index page locks do. So we don't need to specify a
2160  * buffer when making the call, which makes for a faster check.
2161  */
2162  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2163 
2164  ndone = 0;
2165  while (ndone < ntuples)
2166  {
2167  Buffer buffer;
2168  Buffer vmbuffer = InvalidBuffer;
2169  bool all_visible_cleared = false;
2170  int nthispage;
2171 
2172  CHECK_FOR_INTERRUPTS();
2173 
2174  /*
2175  * Find buffer where at least the next tuple will fit. If the page is
2176  * all-visible, this will also pin the requisite visibility map page.
2177  */
2178  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2179  InvalidBuffer, options, bistate,
2180  &vmbuffer, NULL);
2181  page = BufferGetPage(buffer);
2182 
2183  /* NO EREPORT(ERROR) from here till changes are logged */
2184  START_CRIT_SECTION();
2185 
2186  /*
2187  * RelationGetBufferForTuple has ensured that the first tuple fits.
2188  * Put that on the page, and then as many other tuples as fit.
2189  */
2190  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2191  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2192  {
2193  HeapTuple heaptup = heaptuples[ndone + nthispage];
2194 
2195  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2196  break;
2197 
2198  RelationPutHeapTuple(relation, buffer, heaptup, false);
2199 
2200  /*
2201  * We don't use heap_multi_insert for catalog tuples yet, but
2202  * better be prepared...
2203  */
2204  if (needwal && need_cids)
2205  log_heap_new_cid(relation, heaptup);
2206  }
2207 
2208  if (PageIsAllVisible(page))
2209  {
2210  all_visible_cleared = true;
2211  PageClearAllVisible(page);
2212  visibilitymap_clear(relation,
2213  BufferGetBlockNumber(buffer),
2214  vmbuffer, VISIBILITYMAP_VALID_BITS);
2215  }
2216 
2217  /*
2218  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2219  */
2220 
2221  MarkBufferDirty(buffer);
2222 
2223  /* XLOG stuff */
2224  if (needwal)
2225  {
2226  XLogRecPtr recptr;
2227  xl_heap_multi_insert *xlrec;
2228  uint8 info = XLOG_HEAP2_MULTI_INSERT;
2229  char *tupledata;
2230  int totaldatalen;
2231  char *scratchptr = scratch.data;
2232  bool init;
2233  int bufflags = 0;
2234 
2235  /*
2236  * If the page was previously empty, we can reinit the page
2237  * instead of restoring the whole thing.
2238  */
2239  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2240  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2241 
2242  /* allocate xl_heap_multi_insert struct from the scratch area */
2243  xlrec = (xl_heap_multi_insert *) scratchptr;
2244  scratchptr += SizeOfHeapMultiInsert;
2245 
2246  /*
2247  * Allocate the offsets array, unless we're reinitializing the page,
2248  * in which case the tuples are stored in order starting at
2249  * FirstOffsetNumber and we don't need to store the offsets
2250  * explicitly.
2251  */
2252  if (!init)
2253  scratchptr += nthispage * sizeof(OffsetNumber);
2254 
2255  /* the rest of the scratch space is used for tuple data */
2256  tupledata = scratchptr;
2257 
2258  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2259  xlrec->ntuples = nthispage;
2260 
2261  /*
2262  * Write out an xl_multi_insert_tuple and the tuple data itself
2263  * for each tuple.
2264  */
2265  for (i = 0; i < nthispage; i++)
2266  {
2267  HeapTuple heaptup = heaptuples[ndone + i];
2268  xl_multi_insert_tuple *tuphdr;
2269  int datalen;
2270 
2271  if (!init)
2272  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2273  /* xl_multi_insert_tuple needs two-byte alignment. */
2274  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2275  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2276 
2277  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2278  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2279  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2280 
2281  /* write bitmap [+ padding] [+ oid] + data */
2282  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2283  memcpy(scratchptr,
2284  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2285  datalen);
2286  tuphdr->datalen = datalen;
2287  scratchptr += datalen;
2288  }
2289  totaldatalen = scratchptr - tupledata;
2290  Assert((scratchptr - scratch.data) < BLCKSZ);
2291 
2292  if (need_tuple_data)
2293  xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2294 
2295  /*
2296  * Signal that this is the last xl_heap_multi_insert record
2297  * emitted by this call to heap_multi_insert(). Needed for logical
2298  * decoding so it knows when to cleanup temporary data.
2299  */
2300  if (ndone + nthispage == ntuples)
2301  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2302 
2303  if (init)
2304  {
2305  info |= XLOG_HEAP_INIT_PAGE;
2306  bufflags |= REGBUF_WILL_INIT;
2307  }
2308 
2309  /*
2310  * If we're doing logical decoding, include the new tuple data
2311  * even if we take a full-page image of the page.
2312  */
2313  if (need_tuple_data)
2314  bufflags |= REGBUF_KEEP_DATA;
2315 
2316  XLogBeginInsert();
2317  XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2318  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2319 
2320  XLogRegisterBufData(0, tupledata, totaldatalen);
2321 
2322  /* filtering by origin on a row level is much more efficient */
2323  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2324 
2325  recptr = XLogInsert(RM_HEAP2_ID, info);
2326 
2327  PageSetLSN(page, recptr);
2328  }
2329 
2330  END_CRIT_SECTION();
2331 
2332  UnlockReleaseBuffer(buffer);
2333  if (vmbuffer != InvalidBuffer)
2334  ReleaseBuffer(vmbuffer);
2335 
2336  ndone += nthispage;
2337  }
2338 
2339  /*
2340  * We're done with the actual inserts. Check for conflicts again, to
2341  * ensure that all rw-conflicts in to these inserts are detected. Without
2342  * this final check, a sequential scan of the heap may have locked the
2343  * table after the "before" check, missing one opportunity to detect the
2344  * conflict, and then scanned the table before the new tuples were there,
2345  * missing the other chance to detect the conflict.
2346  *
2347  * For heap inserts, we only need to check for table-level SSI locks. Our
2348  * new tuples can't possibly conflict with existing tuple locks, and heap
2349  * page locks are only consolidated versions of tuple locks; they do not
2350  * lock "gaps" as index page locks do. So we don't need to specify a
2351  * buffer when making the call.
2352  */
2353  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2354 
2355  /*
2356  * If tuples are cachable, mark them for invalidation from the caches in
2357  * case we abort. Note it is OK to do this after releasing the buffer,
2358  * because the heaptuples data structure is all in local memory, not in
2359  * the shared buffer.
2360  */
2361  if (IsCatalogRelation(relation))
2362  {
2363  for (i = 0; i < ntuples; i++)
2364  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2365  }
2366 
2367  /* copy t_self fields back to the caller's slots */
2368  for (i = 0; i < ntuples; i++)
2369  slots[i]->tts_tid = heaptuples[i]->t_self;
2370 
2371  pgstat_count_heap_insert(relation, ntuples);
2372 }
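Because heap_multi_insert() leaks work data (toasted copies, the heaptuples array) into CurrentMemoryContext, a batching caller typically wraps each call in a short-lived context, roughly as below (sketch only; the slots are assumed to be populated already, e.g. via ExecStoreHeapTuple, and the helper name is hypothetical):

 #include "postgres.h"

 #include "access/heapam.h"
 #include "access/xact.h"
 #include "executor/tuptable.h"
 #include "utils/memutils.h"

 /* Insert one batch of already-filled slots, then reclaim the leaked memory. */
 static void
 multi_insert_batch_sketch(Relation rel, TupleTableSlot **slots, int nslots,
                           BulkInsertState bistate)
 {
     MemoryContext batchcxt = AllocSetContextCreate(CurrentMemoryContext,
                                                    "multi-insert batch",
                                                    ALLOCSET_DEFAULT_SIZES);
     MemoryContext oldcxt = MemoryContextSwitchTo(batchcxt);

     heap_multi_insert(rel, slots, nslots, GetCurrentCommandId(true),
                       0, bistate);

     MemoryContextSwitchTo(oldcxt);
     MemoryContextDelete(batchcxt);
 }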
2373 
2374 /*
2375  * simple_heap_insert - insert a tuple
2376  *
2377  * Currently, this routine differs from heap_insert only in supplying
2378  * a default command ID and not allowing access to the speedup options.
2379  *
2380  * This should be used rather than using heap_insert directly in most places
2381  * where we are modifying system catalogs.
2382  */
2383 void
2384 simple_heap_insert(Relation relation, HeapTuple tup)
2385 {
2386  heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2387 }
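A hedged sketch of the catalog-style call pattern this routine is meant for (real catalog code normally goes through CatalogTupleInsert(), which also maintains the indexes; the helper name is hypothetical and values/nulls are assumed to match the relation's descriptor):

 #include "postgres.h"

 #include "access/heapam.h"
 #include "access/htup_details.h"
 #include "utils/rel.h"

 /* Form a tuple from datums and insert it under the current command ID. */
 static void
 insert_row_sketch(Relation rel, Datum *values, bool *nulls)
 {
     HeapTuple   tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);

     simple_heap_insert(rel, tup);
     heap_freetuple(tup);
 }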
2388 
2389 /*
2390  * Given infomask/infomask2, compute the bits that must be saved in the
2391  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2392  * xl_heap_lock_updated WAL records.
2393  *
2394  * See fix_infomask_from_infobits.
2395  */
2396 static uint8
2397 compute_infobits(uint16 infomask, uint16 infomask2)
2398 {
2399  return
2400  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2401  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2402  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2403  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2404  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2405  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2406  XLHL_KEYS_UPDATED : 0);
2407 }
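A worked illustration of the mapping (not in the source): an infomask with HEAP_XMAX_IS_MULTI, HEAP_XMAX_LOCK_ONLY and HEAP_XMAX_KEYSHR_LOCK set, and no HEAP_KEYS_UPDATED in infomask2, yields exactly the three matching XLHL_ bits. Written as a static check that would only compile inside heapam.c, where compute_infobits() and the XLHL_ flags are visible:

 /* Illustration only; not part of heapam.c proper. */
 static void
 compute_infobits_example(void)
 {
     uint16      infomask = HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
                            HEAP_XMAX_KEYSHR_LOCK;

     Assert(compute_infobits(infomask, 0) ==
            (XLHL_XMAX_IS_MULTI | XLHL_XMAX_LOCK_ONLY | XLHL_XMAX_KEYSHR_LOCK));
 }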
2408 
2409 /*
2410  * Given two versions of the same t_infomask for a tuple, compare them and
2411  * return whether the relevant status for a tuple Xmax has changed. This is
2412  * used after a buffer lock has been released and reacquired: we want to ensure
2413  * that the tuple state continues to be the same it was when we previously
2414  * examined it.
2415  *
2416  * Note the Xmax field itself must be compared separately.
2417  */
2418 static inline bool
2419 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2420 {
2421  const uint16 interesting =
2422  HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2423 
2424  if ((new_infomask & interesting) != (old_infomask & interesting))
2425  return true;
2426 
2427  return false;
2428 }
2429 
2430 /*
2431  * heap_delete - delete a tuple
2432  *
2433  * See table_tuple_delete() for an explanation of the parameters, except that
2434  * this routine directly takes a tuple rather than a slot.
2435  *
2436  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2437  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2438  * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2439  * generated by another transaction).
2440  */
2441 TM_Result
2442 heap_delete(Relation relation, ItemPointer tid,
2443  CommandId cid, Snapshot crosscheck, bool wait,
2444  TM_FailureData *tmfd, bool changingPart)
2445 {
2446  TM_Result result;
2447  TransactionId xid = GetCurrentTransactionId();
2448  ItemId lp;
2449  HeapTupleData tp;
2450  Page page;
2451  BlockNumber block;
2452  Buffer buffer;
2453  Buffer vmbuffer = InvalidBuffer;
2454  TransactionId new_xmax;
2455  uint16 new_infomask,
2456  new_infomask2;
2457  bool have_tuple_lock = false;
2458  bool iscombo;
2459  bool all_visible_cleared = false;
2460  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2461  bool old_key_copied = false;
2462 
2463  Assert(ItemPointerIsValid(tid));
2464 
2465  /*
2466  * Forbid this during a parallel operation, lest it allocate a combocid.
2467  * Other workers might need that combocid for visibility checks, and we
2468  * have no provision for broadcasting it to them.
2469  */
2470  if (IsInParallelMode())
2471  ereport(ERROR,
2472  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2473  errmsg("cannot delete tuples during a parallel operation")));
2474 
2475  block = ItemPointerGetBlockNumber(tid);
2476  buffer = ReadBuffer(relation, block);
2477  page = BufferGetPage(buffer);
2478 
2479  /*
2480  * Before locking the buffer, pin the visibility map page if it appears to
2481  * be necessary. Since we haven't got the lock yet, someone else might be
2482  * in the middle of changing this, so we'll need to recheck after we have
2483  * the lock.
2484  */
2485  if (PageIsAllVisible(page))
2486  visibilitymap_pin(relation, block, &vmbuffer);
2487 
2488  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2489 
2490  /*
2491  * If we didn't pin the visibility map page and the page has become all
2492  * visible while we were busy locking the buffer, we'll have to unlock and
2493  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2494  * unfortunate, but hopefully shouldn't happen often.
2495  */
2496  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2497  {
2498  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2499  visibilitymap_pin(relation, block, &vmbuffer);
2500  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2501  }
2502 
2503  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2504  Assert(ItemIdIsNormal(lp));
2505 
2506  tp.t_tableOid = RelationGetRelid(relation);
2507  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2508  tp.t_len = ItemIdGetLength(lp);
2509  tp.t_self = *tid;
2510 
2511 l1:
2512  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2513 
2514  if (result == TM_Invisible)
2515  {
2516  UnlockReleaseBuffer(buffer);
2517  ereport(ERROR,
2518  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2519  errmsg("attempted to delete invisible tuple")));
2520  }
2521  else if (result == TM_BeingModified && wait)
2522  {
2523  TransactionId xwait;
2524  uint16 infomask;
2525 
2526  /* must copy state data before unlocking buffer */
2527  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2528  infomask = tp.t_data->t_infomask;
2529 
2530  /*
2531  * Sleep until concurrent transaction ends -- except when there's a
2532  * single locker and it's our own transaction. Note we don't care
2533  * which lock mode the locker has, because we need the strongest one.
2534  *
2535  * Before sleeping, we need to acquire tuple lock to establish our
2536  * priority for the tuple (see heap_lock_tuple). LockTuple will
2537  * release us when we are next-in-line for the tuple.
2538  *
2539  * If we are forced to "start over" below, we keep the tuple lock;
2540  * this arranges that we stay at the head of the line while rechecking
2541  * tuple state.
2542  */
2543  if (infomask & HEAP_XMAX_IS_MULTI)
2544  {
2545  bool current_is_member = false;
2546 
2547  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2548  LockTupleExclusive, &current_is_member))
2549  {
2550  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2551 
2552  /*
2553  * Acquire the lock, if necessary (but skip it when we're
2554  * requesting a lock and already have one; avoids deadlock).
2555  */
2556  if (!current_is_member)
2557  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2558  LockWaitBlock, &have_tuple_lock);
2559 
2560  /* wait for multixact */
2561  MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2562  relation, &(tp.t_self), XLTW_Delete,
2563  NULL);
2564  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2565 
2566  /*
2567  * If xwait had just locked the tuple then some other xact
2568  * could update this tuple before we get to this point. Check
2569  * for xmax change, and start over if so.
2570  */
2571  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2572  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2573  xwait))
2574  goto l1;
2575  }
2576 
2577  /*
2578  * You might think the multixact is necessarily done here, but not
2579  * so: it could have surviving members, namely our own xact or
2580  * other subxacts of this backend. It is legal for us to delete
2581  * the tuple in either case, however (the latter case is
2582  * essentially a situation of upgrading our former shared lock to
2583  * exclusive). We don't bother changing the on-disk hint bits
2584  * since we are about to overwrite the xmax altogether.
2585  */
2586  }
2587  else if (!TransactionIdIsCurrentTransactionId(xwait))
2588  {
2589  /*
2590  * Wait for regular transaction to end; but first, acquire tuple
2591  * lock.
2592  */
2593  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2594  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2595  LockWaitBlock, &have_tuple_lock);
2596  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2597  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2598 
2599  /*
2600  * xwait is done, but if xwait had just locked the tuple then some
2601  * other xact could update this tuple before we get to this point.
2602  * Check for xmax change, and start over if so.
2603  */
2604  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2605  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2606  xwait))
2607  goto l1;
2608 
2609  /* Otherwise check if it committed or aborted */
2610  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2611  }
2612 
2613  /*
2614  * We may overwrite if previous xmax aborted, or if it committed but
2615  * only locked the tuple without updating it.
2616  */
2617  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2618  HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2619  HeapTupleHeaderIsOnlyLocked(tp.t_data))
2620  result = TM_Ok;
2621  else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid) ||
2622  HeapTupleHeaderIndicatesMovedPartitions(tp.t_data))
2623  result = TM_Updated;
2624  else
2625  result = TM_Deleted;
2626  }
2627 
2628  if (crosscheck != InvalidSnapshot && result == TM_Ok)
2629  {
2630  /* Perform additional check for transaction-snapshot mode RI updates */
2631  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2632  result = TM_Updated;
2633  }
2634 
2635  if (result != TM_Ok)
2636  {
2637  Assert(result == TM_SelfModified ||
2638  result == TM_Updated ||
2639  result == TM_Deleted ||
2640  result == TM_BeingModified);
2641  Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2642  Assert(result != TM_Updated ||
2643  !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2644  tmfd->ctid = tp.t_data->t_ctid;
2645  tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2646  if (result == TM_SelfModified)
2647  tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2648  else
2649  tmfd->cmax = InvalidCommandId;
2650  UnlockReleaseBuffer(buffer);
2651  if (have_tuple_lock)
2652  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2653  if (vmbuffer != InvalidBuffer)
2654  ReleaseBuffer(vmbuffer);
2655  return result;
2656  }
2657 
2658  /*
2659  * We're about to do the actual delete -- check for conflict first, to
2660  * avoid possibly having to roll back work we've just done.
2661  *
2662  * This is safe without a recheck as long as there is no possibility of
2663  * another process scanning the page between this check and the delete
2664  * being visible to the scan (i.e., an exclusive buffer content lock is
2665  * continuously held from this point until the tuple delete is visible).
2666  */
2667  CheckForSerializableConflictIn(relation, &tp, buffer);
2668 
2669  /* replace cid with a combo cid if necessary */
2670  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2671 
2672  /*
2673  * Compute replica identity tuple before entering the critical section so
2674  * we don't PANIC upon a memory allocation failure.
2675  */
2676  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2677 
2678  /*
2679  * If this is the first possibly-multixact-able operation in the current
2680  * transaction, set my per-backend OldestMemberMXactId setting. We can be
2681  * certain that the transaction will never become a member of any older
2682  * MultiXactIds than that. (We have to do this even if we end up just
2683  * using our own TransactionId below, since some other backend could
2684  * incorporate our XID into a MultiXact immediately afterwards.)
2685  */
2686  MultiXactIdSetOldestMember();
2687 
2688  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2689  tp.t_data->t_infomask, tp.t_data->t_infomask2,
2690  xid, LockTupleExclusive, true,
2691  &new_xmax, &new_infomask, &new_infomask2);
2692 
2693  START_CRIT_SECTION();
2694 
2695  /*
2696  * If this transaction commits, the tuple will become DEAD sooner or
2697  * later. Set flag that this page is a candidate for pruning once our xid
2698  * falls below the OldestXmin horizon. If the transaction finally aborts,
2699  * the subsequent page pruning will be a no-op and the hint will be
2700  * cleared.
2701  */
2702  PageSetPrunable(page, xid);
2703 
2704  if (PageIsAllVisible(page))
2705  {
2706  all_visible_cleared = true;
2707  PageClearAllVisible(page);
2708  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2709  vmbuffer, VISIBILITYMAP_VALID_BITS);
2710  }
2711 
2712  /* store transaction information of xact deleting the tuple */
2713  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2714  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2715  tp.t_data->t_infomask |= new_infomask;
2716  tp.t_data->t_infomask2 |= new_infomask2;
2717  HeapTupleHeaderClearHotUpdated(tp.t_data);
2718  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2719  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2720  /* Make sure there is no forward chain link in t_ctid */
2721  tp.t_data->t_ctid = tp.t_self;
2722 
2723  /* Signal that this is actually a move into another partition */
2724  if (changingPart)
2725  HeapTupleHeaderSetMovedPartitions(tp.t_data);
2726 
2727  MarkBufferDirty(buffer);
2728 
2729  /*
2730  * XLOG stuff
2731  *
2732  * NB: heap_abort_speculative() uses the same xlog record and replay
2733  * routines.
2734  */
2735  if (RelationNeedsWAL(relation))
2736  {
2737  xl_heap_delete xlrec;
2738  xl_heap_header xlhdr;
2739  XLogRecPtr recptr;
2740 
2741  /* For logical decode we need combocids to properly decode the catalog */
2742  if (RelationIsAccessibleInLogicalDecoding(relation))
2743  log_heap_new_cid(relation, &tp);
2744 
2745  xlrec.flags = 0;
2746  if (all_visible_cleared)
2747  xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2748  if (changingPart)
2749  xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2750  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2751  tp.t_data->t_infomask2);
2752  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
2753  xlrec.xmax = new_xmax;
2754 
2755  if (old_key_tuple != NULL)
2756  {
2757  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2758  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
2759  else
2760  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
2761  }
2762 
2763  XLogBeginInsert();
2764  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2765 
2766  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2767 
2768  /*
2769  * Log replica identity of the deleted tuple if there is one
2770  */
2771  if (old_key_tuple != NULL)
2772  {
2773  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2774  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2775  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2776 
2777  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2778  XLogRegisterData((char *) old_key_tuple->t_data
2779  + SizeofHeapTupleHeader,
2780  old_key_tuple->t_len
2781  - SizeofHeapTupleHeader);
2782  }
2783 
2784  /* filtering by origin on a row level is much more efficient */
2785  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2786 
2787  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2788 
2789  PageSetLSN(page, recptr);
2790  }
2791 
2792  END_CRIT_SECTION();
2793 
2794  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2795 
2796  if (vmbuffer != InvalidBuffer)
2797  ReleaseBuffer(vmbuffer);
2798 
2799  /*
2800  * If the tuple has toasted out-of-line attributes, we need to delete
2801  * those items too. We have to do this before releasing the buffer
2802  * because we need to look at the contents of the tuple, but it's OK to
2803  * release the content lock on the buffer first.
2804  */
2805  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2806  relation->rd_rel->relkind != RELKIND_MATVIEW)
2807  {
2808  /* toast table entries should never be recursively toasted */
2809  Assert(!HeapTupleHasExternal(&tp));
2810  }
2811  else if (HeapTupleHasExternal(&tp))
2812  toast_delete(relation, &tp, false);
2813 
2814  /*
2815  * Mark tuple for invalidation from system caches at next command
2816  * boundary. We have to do this before releasing the buffer because we
2817  * need to look at the contents of the tuple.
2818  */
2819  CacheInvalidateHeapTuple(relation, &tp, NULL);
2820 
2821  /* Now we can release the buffer */
2822  ReleaseBuffer(buffer);
2823 
2824  /*
2825  * Release the lmgr tuple lock, if we had it.
2826  */
2827  if (have_tuple_lock)
2828  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2829 
2830  pgstat_count_heap_delete(relation);
2831 
2832  if (old_key_tuple != NULL && old_key_copied)
2833  heap_freetuple(old_key_tuple);
2834 
2835  return TM_Ok;
2836 }
2837 
2838 /*
2839  * simple_heap_delete - delete a tuple
2840  *
2841  * This routine may be used to delete a tuple when concurrent updates of
2842  * the target tuple are not expected (for example, because we have a lock
2843  * on the relation associated with the tuple). Any failure is reported
2844  * via ereport().
2845  */
2846 void
2847 simple_heap_delete(Relation relation, ItemPointer tid)
2848 {
2849  TM_Result result;
2850  TM_FailureData tmfd;
2851 
2852  result = heap_delete(relation, tid,
2853  GetCurrentCommandId(true), InvalidSnapshot,
2854  true /* wait for commit */ ,
2855  &tmfd, false /* changingPart */ );
2856  switch (result)
2857  {
2858  case TM_SelfModified:
2859  /* Tuple was already updated in current command? */
2860  elog(ERROR, "tuple already updated by self");
2861  break;
2862 
2863  case TM_Ok:
2864  /* done successfully */
2865  break;
2866 
2867  case TM_Updated:
2868  elog(ERROR, "tuple concurrently updated");
2869  break;
2870 
2871  case TM_Deleted:
2872  elog(ERROR, "tuple concurrently deleted");
2873  break;
2874 
2875  default:
2876  elog(ERROR, "unrecognized heap_delete status: %u", result);
2877  break;
2878  }
2879 }
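simple_heap_delete() shows the strict call pattern; a caller that instead wants to tolerate concurrent updates or deletes can invoke heap_delete() directly and inspect the TM_Result, roughly as sketched here (hypothetical helper; no crosscheck snapshot, and tmfd is simply ignored on failure):

 #include "postgres.h"

 #include "access/heapam.h"
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "utils/snapshot.h"

 /* Delete the tuple at "tid" if it is still live; report whether we did. */
 static bool
 delete_if_present_sketch(Relation rel, ItemPointer tid)
 {
     TM_FailureData tmfd;
     TM_Result   result;

     result = heap_delete(rel, tid, GetCurrentCommandId(true),
                          InvalidSnapshot, true /* wait */ ,
                          &tmfd, false /* changingPart */ );

     /* TM_Updated/TM_Deleted/TM_SelfModified: someone else got there first. */
     return result == TM_Ok;
 }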
2880 
2881 /*
2882  * heap_update - replace a tuple
2883  *
2884  * See table_tuple_update() for an explanation of the parameters, except that
2885  * this routine directly takes a tuple rather than a slot.
2886  *
2887  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2888  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2889  * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2890  * generated by another transaction).
2891  */
2892 TM_Result
2893 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2894  CommandId cid, Snapshot crosscheck, bool wait,
2895  TM_FailureData *tmfd, LockTupleMode *lockmode)
2896 {
2897  TM_Result result;
2898  TransactionId xid = GetCurrentTransactionId();
2899  Bitmapset *hot_attrs;
2900  Bitmapset *key_attrs;
2901  Bitmapset *id_attrs;
2902  Bitmapset *interesting_attrs;
2903  Bitmapset *modified_attrs;
2904  ItemId lp;
2905  HeapTupleData oldtup;
2906  HeapTuple heaptup;
2907  HeapTuple old_key_tuple = NULL;
2908  bool old_key_copied = false;
2909  Page page;
2910  BlockNumber block;
2911  MultiXactStatus mxact_status;
2912  Buffer buffer,
2913  newbuf,
2914  vmbuffer = InvalidBuffer,
2915  vmbuffer_new = InvalidBuffer;
2916  bool need_toast;
2917  Size newtupsize,
2918  pagefree;
2919  bool have_tuple_lock = false;
2920  bool iscombo;
2921  bool use_hot_update = false;
2922  bool hot_attrs_checked = false;
2923  bool key_intact;
2924  bool all_visible_cleared = false;
2925  bool all_visible_cleared_new = false;
2926  bool checked_lockers;
2927  bool locker_remains;
2928  TransactionId xmax_new_tuple,
2929  xmax_old_tuple;
2930  uint16 infomask_old_tuple,
2931  infomask2_old_tuple,
2932  infomask_new_tuple,
2933  infomask2_new_tuple;
2934 
2935  Assert(ItemPointerIsValid(otid));
2936 
2937  /*
2938  * Forbid this during a parallel operation, lest it allocate a combocid.
2939  * Other workers might need that combocid for visibility checks, and we
2940  * have no provision for broadcasting it to them.
2941  */
2942  if (IsInParallelMode())
2943  ereport(ERROR,
2944  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2945  errmsg("cannot update tuples during a parallel operation")));
2946 
2947  /*
2948  * Fetch the list of attributes to be checked for various operations.
2949  *
2950  * For HOT considerations, this is wasted effort if we fail to update or
2951  * have to put the new tuple on a different page. But we must compute the
2952  * list before obtaining buffer lock --- in the worst case, if we are
2953  * doing an update on one of the relevant system catalogs, we could
2954  * deadlock if we try to fetch the list later. In any case, the relcache
2955  * caches the data so this is usually pretty cheap.
2956  *
2957  * We also need columns used by the replica identity and columns that are
2958  * considered the "key" of rows in the table.
2959  *
2960  * Note that we get copies of each bitmap, so we need not worry about
2961  * relcache flush happening midway through.
2962  */
2963  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
2964  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
2965  id_attrs = RelationGetIndexAttrBitmap(relation,
2966  INDEX_ATTR_BITMAP_IDENTITY_KEY);
2967 
2968 
2969  block = ItemPointerGetBlockNumber(otid);
2970  buffer = ReadBuffer(relation, block);
2971  page = BufferGetPage(buffer);
2972 
2973  interesting_attrs = NULL;
2974 
2975  /*
2976  * If the page is already full, there is hardly any chance of doing a HOT
2977  * update on this page. It might be wasteful effort to look for index
2978  * column updates only to later reject HOT updates for lack of space in
2979  * the same page. So we be conservative and only fetch hot_attrs if the
2980  * page is not already full. Since we are already holding a pin on the
2981  * buffer, there is no chance that the buffer can get cleaned up
2982  * concurrently and even if that was possible, in the worst case we lose a
2983  * chance to do a HOT update.
2984  */
2985  if (!PageIsFull(page))
2986  {
2987  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
2988  hot_attrs_checked = true;
2989  }
2990  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
2991  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
2992 
2993  /*
2994  * Before locking the buffer, pin the visibility map page if it appears to
2995  * be necessary. Since we haven't got the lock yet, someone else might be
2996  * in the middle of changing this, so we'll need to recheck after we have
2997  * the lock.
2998  */
2999  if (PageIsAllVisible(page))
3000  visibilitymap_pin(relation, block, &vmbuffer);
3001 
3002  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3003 
3004  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3005  Assert(ItemIdIsNormal(lp));
3006 
3007  /*
3008  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3009  * properly.
3010  */
3011  oldtup.t_tableOid = RelationGetRelid(relation);
3012  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3013  oldtup.t_len = ItemIdGetLength(lp);
3014  oldtup.t_self = *otid;
3015 
3016  /* the new tuple is ready, except for this: */
3017  newtup->t_tableOid = RelationGetRelid(relation);
3018 
3019  /* Determine columns modified by the update. */
3020  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3021  &oldtup, newtup);
3022 
3023  /*
3024  * If we're not updating any "key" column, we can grab a weaker lock type.
3025  * This allows for more concurrency when we are running simultaneously
3026  * with foreign key checks.
3027  *
3028  * Note that if a column gets detoasted while executing the update, but
3029  * the value ends up being the same, this test will fail and we will use
3030  * the stronger lock. This is acceptable; the important case to optimize
3031  * is updates that don't manipulate key columns, not those that
3032  * serendipitously arrive at the same key values.
3033  */
3034  if (!bms_overlap(modified_attrs, key_attrs))
3035  {
3036  *lockmode = LockTupleNoKeyExclusive;
3037  mxact_status = MultiXactStatusNoKeyUpdate;
3038  key_intact = true;
3039 
3040  /*
3041  * If this is the first possibly-multixact-able operation in the
3042  * current transaction, set my per-backend OldestMemberMXactId
3043  * setting. We can be certain that the transaction will never become a
3044  * member of any older MultiXactIds than that. (We have to do this
3045  * even if we end up just using our own TransactionId below, since
3046  * some other backend could incorporate our XID into a MultiXact
3047  * immediately afterwards.)
3048  */
3049  MultiXactIdSetOldestMember();
3050  }
3051  else
3052  {
3053  *lockmode = LockTupleExclusive;
3054  mxact_status = MultiXactStatusUpdate;
3055  key_intact = false;
3056  }
3057 
3058  /*
3059  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3060  * otid may very well point at newtup->t_self, which we will overwrite
3061  * with the new tuple's location, so there's great risk of confusion if we
3062  * use otid anymore.
3063  */
3064 
3065 l2:
3066  checked_lockers = false;
3067  locker_remains = false;
3068  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3069 
3070  /* see below about the "no wait" case */
3071  Assert(result != TM_BeingModified || wait);
3072 
3073  if (result == TM_Invisible)
3074  {
3075  UnlockReleaseBuffer(buffer);
3076  ereport(ERROR,
3077  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3078  errmsg("attempted to update invisible tuple")));
3079  }
3080  else if (result == TM_BeingModified && wait)
3081  {
3082  TransactionId xwait;
3083  uint16 infomask;
3084  bool can_continue = false;
3085 
3086  /*
3087  * XXX note that we don't consider the "no wait" case here. This
3088  * isn't a problem currently because no caller uses that case, but it
3089  * should be fixed if such a caller is introduced. It wasn't a
3090  * problem previously because this code would always wait, but now
3091  * that some tuple locks do not conflict with one of the lock modes we
3092  * use, it is possible that this case is interesting to handle
3093  * specially.
3094  *
3095  * This may cause failures with third-party code that calls
3096  * heap_update directly.
3097  */
3098 
3099  /* must copy state data before unlocking buffer */
3100  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3101  infomask = oldtup.t_data->t_infomask;
3102 
3103  /*
3104  * Now we have to do something about the existing locker. If it's a
3105  * multi, sleep on it; we might be awakened before it is completely
3106  * gone (or even not sleep at all in some cases); we need to preserve
3107  * it as locker, unless it is gone completely.
3108  *
3109  * If it's not a multi, we need to check for sleeping conditions
3110  * before actually going to sleep. If the update doesn't conflict
3111  * with the locks, we just continue without sleeping (but making sure
3112  * it is preserved).
3113  *
3114  * Before sleeping, we need to acquire tuple lock to establish our
3115  * priority for the tuple (see heap_lock_tuple). LockTuple will
3116  * release us when we are next-in-line for the tuple. Note we must
3117  * not acquire the tuple lock until we're sure we're going to sleep;
3118  * otherwise we're open for race conditions with other transactions
3119  * holding the tuple lock which sleep on us.
3120  *
3121  * If we are forced to "start over" below, we keep the tuple lock;
3122  * this arranges that we stay at the head of the line while rechecking
3123  * tuple state.
3124  */
3125  if (infomask & HEAP_XMAX_IS_MULTI)
3126  {
3127  TransactionId update_xact;
3128  int remain;
3129  bool current_is_member = false;
3130 
3131  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3132  *lockmode, &current_is_member))
3133  {
3134  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3135 
3136  /*
3137  * Acquire the lock, if necessary (but skip it when we're
3138  * requesting a lock and already have one; avoids deadlock).
3139  */
3140  if (!current_is_member)
3141  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3142  LockWaitBlock, &have_tuple_lock);
3143 
3144  /* wait for multixact */
3145  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3146  relation, &oldtup.t_self, XLTW_Update,
3147  &remain);
3148  checked_lockers = true;
3149  locker_remains = remain != 0;
3150  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3151 
3152  /*
3153  * If xwait had just locked the tuple then some other xact
3154  * could update this tuple before we get to this point. Check
3155  * for xmax change, and start over if so.
3156  */
3157  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3158  infomask) ||
3159  !TransactionIdEquals(HeapTupleGetUpdateXid(oldtup.t_data),
3160  xwait))
3161  goto l2;
3162  }
3163 
3164  /*
3165  * Note that the multixact may not be done by now. It could have
3166  * surviving members; our own xact or other subxacts of this
3167  * backend, and also any other concurrent transaction that locked
3168  * the tuple with KeyShare if we only got TupleLockUpdate. If
3169  * this is the case, we have to be careful to mark the updated
3170  * tuple with the surviving members in Xmax.
3171  *
3172  * Note that there could have been another update in the
3173  * MultiXact. In that case, we need to check whether it committed
3174  * or aborted. If it aborted we are safe to update it again;
3175  * otherwise there is an update conflict, and we have to return
3176  * TableTuple{Deleted, Updated} below.
3177  *
3178  * In the LockTupleExclusive case, we still need to preserve the
3179  * surviving members: those would include the tuple locks we had
3180  * before this one, which are important to keep in case this
3181  * subxact aborts.
3182  */
3183  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3184  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3185  else
3186  update_xact = InvalidTransactionId;
3187 
3188  /*
3189  * There was no UPDATE in the MultiXact; or it aborted. No
3190  * TransactionIdIsInProgress() call needed here, since we called
3191  * MultiXactIdWait() above.
3192  */
3193  if (!TransactionIdIsValid(update_xact) ||
3194  TransactionIdDidAbort(update_xact))
3195  can_continue = true;
3196  }
3197  else if (TransactionIdIsCurrentTransactionId(xwait))
3198  {
3199  /*
3200  * The only locker is ourselves; we can avoid grabbing the tuple
3201  * lock here, but must preserve our locking information.
3202  */
3203  checked_lockers = true;
3204  locker_remains = true;
3205  can_continue = true;
3206  }
3207  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3208  {
3209  /*
3210  * If it's just a key-share locker, and we're not changing the key
3211  * columns, we don't need to wait for it to end; but we need to
3212  * preserve it as locker.
3213  */
3214  checked_lockers = true;
3215  locker_remains = true;
3216  can_continue = true;
3217  }
3218  else
3219  {
3220  /*
3221  * Wait for regular transaction to end; but first, acquire tuple
3222  * lock.
3223  */
3224  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3225  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3226  LockWaitBlock, &have_tuple_lock);
3227  XactLockTableWait(xwait, relation, &oldtup.t_self,
3228  XLTW_Update);
3229  checked_lockers = true;
3230  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3231 
3232  /*
3233  * xwait is done, but if xwait had just locked the tuple then some
3234  * other xact could update this tuple before we get to this point.
3235  * Check for xmax change, and start over if so.
3236  */
3237  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3238  !TransactionIdEquals(xwait,
3239  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3240  goto l2;
3241 
3242  /* Otherwise check if it committed or aborted */
3243  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3244  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3245  can_continue = true;
3246  }
3247 
3248  if (can_continue)
3249  result = TM_Ok;
3250  else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid) ||
3251  HeapTupleHeaderIndicatesMovedPartitions(oldtup.t_data))
3252  result = TM_Updated;
3253  else
3254  result = TM_Deleted;
3255  }
3256 
3257  if (crosscheck != InvalidSnapshot && result == TM_Ok)
3258  {
3259  /* Perform additional check for transaction-snapshot mode RI updates */
3260  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3261  {
3262  result = TM_Updated;
3263  Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3264  }
3265  }
3266 
3267  if (result != TM_Ok)
3268  {
3269  Assert(result == TM_SelfModified ||
3270  result == TM_Updated ||
3271  result == TM_Deleted ||
3272  result == TM_BeingModified);
3273  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3274  Assert(result != TM_Updated ||
3275  !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3276  tmfd->ctid = oldtup.t_data->t_ctid;
3277  tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3278  if (result == TM_SelfModified)
3279  tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3280  else
3281  tmfd->cmax = InvalidCommandId;
3282  UnlockReleaseBuffer(buffer);
3283  if (have_tuple_lock)
3284  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3285  if (vmbuffer != InvalidBuffer)
3286  ReleaseBuffer(vmbuffer);
3287  bms_free(hot_attrs);
3288  bms_free(key_attrs);
3289  bms_free(id_attrs);
3290  bms_free(modified_attrs);
3291  bms_free(interesting_attrs);
3292  return result;
3293  }
3294 
3295  /*
3296  * If we didn't pin the visibility map page and the page has become all
3297  * visible while we were busy locking the buffer, or during some
3298  * subsequent window during which we had it unlocked, we'll have to unlock
3299  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3300  * bit unfortunate, especially since we'll now have to recheck whether the
3301  * tuple has been locked or updated under us, but hopefully it won't
3302  * happen very often.
3303  */
3304  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3305  {
3306  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3307  visibilitymap_pin(relation, block, &vmbuffer);
3308  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3309  goto l2;
3310  }
3311 
3312  /* Fill in transaction status data */
3313 
3314  /*
3315  * If the tuple we're updating is locked, we need to preserve the locking
3316  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3317  */
3318  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3319  oldtup.t_data->t_infomask,
3320  oldtup.t_data->t_infomask2,
3321  xid, *lockmode, true,
3322  &xmax_old_tuple, &infomask_old_tuple,
3323  &infomask2_old_tuple);
3324 
3325  /*
3326  * And also prepare an Xmax value for the new copy of the tuple. If there
3327  * was no xmax previously, or there was one but all lockers are now gone,
3328  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3329  * rare cases that might also be InvalidXid and yet not have the
3330  * HEAP_XMAX_INVALID bit set; that's fine.)
3331  */
3332  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3333  HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask) ||
3334  (checked_lockers && !locker_remains))
3335  xmax_new_tuple = InvalidTransactionId;
3336  else
3337  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3338 
3339  if (!TransactionIdIsValid(xmax_new_tuple))
3340  {
3341  infomask_new_tuple = HEAP_XMAX_INVALID;
3342  infomask2_new_tuple = 0;
3343  }
3344  else
3345  {
3346  /*
3347  * If we found a valid Xmax for the new tuple, then the infomask bits
3348  * to use on the new tuple depend on what was there on the old one.
3349  * Note that since we're doing an update, the only possibility is that
3350  * the lockers had FOR KEY SHARE lock.
3351  */
3352  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3353  {
3354  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3355  &infomask2_new_tuple);
3356  }
3357  else
3358  {
3359  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3360  infomask2_new_tuple = 0;
3361  }
3362  }
3363 
3364  /*
3365  * Prepare the new tuple with the appropriate initial values of Xmin and
3366  * Xmax, as well as initial infomask bits as computed above.
3367  */
3368  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3369  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3370  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3371  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3372  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3373  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3374  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3375 
3376  /*
3377  * Replace cid with a combo cid if necessary. Note that we already put
3378  * the plain cid into the new tuple.
3379  */
3380  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3381 
3382  /*
3383  * If the toaster needs to be activated, OR if the new tuple will not fit
3384  * on the same page as the old, then we need to release the content lock
3385  * (but not the pin!) on the old tuple's buffer while we are off doing
3386  * TOAST and/or table-file-extension work. We must mark the old tuple to
3387  * show that it's locked, else other processes may try to update it
3388  * themselves.
3389  *
3390  * We need to invoke the toaster if there are already any out-of-line
3391  * toasted values present, or if the new tuple is over-threshold.
3392  */
3393  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3394  relation->rd_rel->relkind != RELKIND_MATVIEW)
3395  {
3396  /* toast table entries should never be recursively toasted */
3397  Assert(!HeapTupleHasExternal(&oldtup));
3398  Assert(!HeapTupleHasExternal(newtup));
3399  need_toast = false;
3400  }
3401  else
3402  need_toast = (HeapTupleHasExternal(&oldtup) ||
3403  HeapTupleHasExternal(newtup) ||
3404  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3405 
3406  pagefree = PageGetHeapFreeSpace(page);
3407 
3408  newtupsize = MAXALIGN(newtup->t_len);
3409 
3410  if (need_toast || newtupsize > pagefree)
3411  {
3412  TransactionId xmax_lock_old_tuple;
3413  uint16 infomask_lock_old_tuple,
3414  infomask2_lock_old_tuple;
3415  bool cleared_all_frozen = false;
3416 
3417  /*
3418  * To prevent concurrent sessions from updating the tuple, we have to
3419  * temporarily mark it locked, while we release the page-level lock.
3420  *
3421  * To satisfy the rule that any xid potentially appearing in a buffer
3422  * written out to disk must first be covered by WAL, we unfortunately have to WAL log this
3423  * temporary modification. We can reuse xl_heap_lock for this
3424  * purpose. If we crash/error before following through with the
3425  * actual update, xmax will be of an aborted transaction, allowing
3426  * other sessions to proceed.
3427  */
3428 
3429  /*
3430  * Compute xmax / infomask appropriate for locking the tuple. This has
3431  * to be done separately from the combo that's going to be used for
3432  * updating, because the potentially created multixact would otherwise
3433  * be wrong.
3434  */
3435  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3436  oldtup.t_data->t_infomask,
3437  oldtup.t_data->t_infomask2,
3438  xid, *lockmode, false,
3439  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3440  &infomask2_lock_old_tuple);
3441 
3442  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3443 
3444  START_CRIT_SECTION();
3445 
3446  /* Clear obsolete visibility flags ... */
3447  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3448  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3449  HeapTupleClearHotUpdated(&oldtup);
3450  /* ... and store info about transaction updating this tuple */
3451  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3452  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3453  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3454  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3455  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3456 
3457  /* temporarily make it look not-updated, but locked */
3458  oldtup.t_data->t_ctid = oldtup.t_self;
3459 
3460  /*
3461  * Clear all-frozen bit on visibility map if needed. We could
3462  * immediately reset ALL_VISIBLE, but given that the WAL logging
3463  * overhead would be unchanged, that doesn't seem necessarily
3464  * worthwhile.
3465  */
3466  if (PageIsAllVisible(BufferGetPage(buffer)) &&
3467  visibilitymap_clear(relation, block, vmbuffer,
3468  VISIBILITYMAP_ALL_FROZEN))
3469  cleared_all_frozen = true;
3470 
3471  MarkBufferDirty(buffer);
3472 
3473  if (RelationNeedsWAL(relation))
3474  {
3475  xl_heap_lock xlrec;
3476  XLogRecPtr recptr;
3477 
3478  XLogBeginInsert();
3479  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3480 
3481  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3482  xlrec.locking_xid = xmax_lock_old_tuple;
3483  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3484  oldtup.t_data->t_infomask2);
3485  xlrec.flags =
3486  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3487  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3488  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3489  PageSetLSN(page, recptr);
3490  }
3491 
3492  END_CRIT_SECTION();
3493 
3494  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3495 
3496  /*
3497  * Let the toaster do its thing, if needed.
3498  *
3499  * Note: below this point, heaptup is the data we actually intend to
3500  * store into the relation; newtup is the caller's original untoasted
3501  * data.
3502  */
3503  if (need_toast)
3504  {
3505  /* Note we always use WAL and FSM during updates */
3506  heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
3507  newtupsize = MAXALIGN(heaptup->t_len);
3508  }
3509  else
3510  heaptup = newtup;
3511 
3512  /*
3513  * Now, do we need a new page for the tuple, or not? This is a bit
3514  * tricky since someone else could have added tuples to the page while
3515  * we weren't looking. We have to recheck the available space after
3516  * reacquiring the buffer lock. But don't bother to do that if the
3517  * former amount of free space is still not enough; it's unlikely
3518  * there's more free now than before.
3519  *
3520  * What's more, if we need to get a new page, we will need to acquire
3521  * buffer locks on both old and new pages. To avoid deadlock against
3522  * some other backend trying to get the same two locks in the other
3523  * order, we must be consistent about the order we get the locks in.
3524  * We use the rule "lock the lower-numbered page of the relation
3525  * first". To implement this, we must do RelationGetBufferForTuple
3526  * while not holding the lock on the old page, and we must rely on it
3527  * to get the locks on both pages in the correct order.
3528  */
3529  if (newtupsize > pagefree)
3530  {
3531  /* Assume there's no chance to put heaptup on same page. */
3532  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3533  buffer, 0, NULL,
3534  &vmbuffer_new, &vmbuffer);
3535  }
3536  else
3537  {
3538  /* Re-acquire the lock on the old tuple's page. */
3539  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3540  /* Re-check using the up-to-date free space */
3541  pagefree = PageGetHeapFreeSpace(page);
3542  if (newtupsize > pagefree)
3543  {
3544  /*
3545  * Rats, it doesn't fit anymore. We must now unlock and
3546  * relock to avoid deadlock. Fortunately, this path should
3547  * seldom be taken.
3548  */
3549  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3550  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3551  buffer, 0, NULL,
3552  &vmbuffer_new, &vmbuffer);
3553  }
3554  else
3555  {
3556  /* OK, it fits here, so we're done. */
3557  newbuf = buffer;
3558  }
3559  }
3560  }
3561  else
3562  {
3563  /* No TOAST work needed, and it'll fit on same page */
3564  newbuf = buffer;
3565  heaptup = newtup;
3566  }
3567 
3568  /*
3569  * We're about to do the actual update -- check for conflict first, to
3570  * avoid possibly having to roll back work we've just done.
3571  *
3572  * This is safe without a recheck as long as there is no possibility of
3573  * another process scanning the pages between this check and the update
3574  * being visible to the scan (i.e., exclusive buffer content lock(s) are
3575  * continuously held from this point until the tuple update is visible).
3576  *
3577  * For the new tuple the only check needed is at the relation level, but
3578  * since both tuples are in the same relation and the check for oldtup
3579  * will include checking the relation level, there is no benefit to a
3580  * separate check for the new tuple.
3581  */
3582  CheckForSerializableConflictIn(relation, &oldtup, buffer);
3583 
3584  /*
3585  * At this point newbuf and buffer are both pinned and locked, and newbuf
3586  * has enough space for the new tuple. If they are the same buffer, only
3587  * one pin is held.
3588  */
3589 
3590  if (newbuf == buffer)
3591  {
3592  /*
3593  * Since the new tuple is going into the same page, we might be able
3594  * to do a HOT update. Check if any of the index columns have been
3595  * changed. If the page was already full, we may have skipped checking
3596  * for index columns, and also can't do a HOT update.
3597  */
3598  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3599  use_hot_update = true;
3600  }
3601  else
3602  {
3603  /* Set a hint that the old page could use prune/defrag */
3604  PageSetFull(page);
3605  }
3606 
3607  /*
3608  * Compute replica identity tuple before entering the critical section so
3609  * we don't PANIC upon a memory allocation failure.
3610  * ExtractReplicaIdentity() will return NULL if nothing needs to be
3611  * logged.
3612  */
3613  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3614  bms_overlap(modified_attrs, id_attrs),
3615  &old_key_copied);
3616 
3617  /* NO EREPORT(ERROR) from here till changes are logged */
3618  START_CRIT_SECTION();
3619 
3620  /*
3621  * If this transaction commits, the old tuple will become DEAD sooner or
3622  * later. Set flag that this page is a candidate for pruning once our xid
3623  * falls below the OldestXmin horizon. If the transaction finally aborts,
3624  * the subsequent page pruning will be a no-op and the hint will be
3625  * cleared.
3626  *
3627  * XXX Should we set hint on newbuf as well? If the transaction aborts,
3628  * there would be a prunable tuple in the newbuf; but for now we choose
3629  * not to optimize for aborts. Note that heap_xlog_update must be kept in
3630  * sync if this decision changes.
3631  */
3632  PageSetPrunable(page, xid);
3633 
3634  if (use_hot_update)
3635  {
3636  /* Mark the old tuple as HOT-updated */
3637  HeapTupleSetHotUpdated(&oldtup);
3638  /* And mark the new tuple as heap-only */
3639  HeapTupleSetHeapOnly(heaptup);
3640  /* Mark the caller's copy too, in case different from heaptup */
3641  HeapTupleSetHeapOnly(newtup);
3642  }
3643  else
3644  {
3645  /* Make sure tuples are correctly marked as not-HOT */
3646  HeapTupleClearHotUpdated(&oldtup);
3647  HeapTupleClearHeapOnly(heaptup);
3648  HeapTupleClearHeapOnly(newtup);
3649  }
3650 
3651  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3652 
3653 
3654  /* Clear obsolete visibility flags, possibly set by ourselves above... */
3655  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3656  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3657  /* ... and store info about transaction updating this tuple */
3658  Assert(TransactionIdIsValid(xmax_old_tuple));
3659  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3660  oldtup.t_data->t_infomask |= infomask_old_tuple;
3661  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3662  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3663 
3664  /* record address of new tuple in t_ctid of old one */
3665  oldtup.t_data->t_ctid = heaptup->t_self;
3666 
3667  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3668  if (PageIsAllVisible(BufferGetPage(buffer)))
3669  {
3670  all_visible_cleared = true;
3671  PageClearAllVisible(BufferGetPage(buffer));
3672  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3673  vmbuffer, VISIBILITYMAP_VALID_BITS);
3674  }
3675  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3676  {
3677  all_visible_cleared_new = true;
3678  PageClearAllVisible(BufferGetPage(newbuf));
3679  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3680  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3681  }
3682 
3683  if (newbuf != buffer)
3684  MarkBufferDirty(newbuf);
3685  MarkBufferDirty(buffer);
3686 
3687  /* XLOG stuff */
3688  if (RelationNeedsWAL(relation))
3689  {
3690  XLogRecPtr recptr;
3691 
3692  /*
3693  * For logical decoding we need combocids to properly decode the
3694  * catalog.
3695  */
3696  if (RelationIsAccessibleInLogicalDecoding(relation))
3697  {
3698  log_heap_new_cid(relation, &oldtup);
3699  log_heap_new_cid(relation, heaptup);
3700  }
3701 
3702  recptr = log_heap_update(relation, buffer,
3703  newbuf, &oldtup, heaptup,
3704  old_key_tuple,
3705  all_visible_cleared,
3706  all_visible_cleared_new);
3707  if (newbuf != buffer)
3708  {
3709  PageSetLSN(BufferGetPage(newbuf), recptr);
3710  }
3711  PageSetLSN(BufferGetPage(buffer), recptr);
3712  }
3713 
3714  END_CRIT_SECTION();
3715 
3716  if (newbuf != buffer)
3717  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3718  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3719 
3720  /*
3721  * Mark old tuple for invalidation from system caches at next command
3722  * boundary, and mark the new tuple for invalidation in case we abort. We
3723  * have to do this before releasing the buffer because oldtup is in the
3724  * buffer. (heaptup is all in local memory, but it's necessary to process
3725  * both tuple versions in one call to inval.c so we can avoid redundant
3726  * sinval messages.)
3727  */
3728  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3729 
3730  /* Now we can release the buffer(s) */
3731  if (newbuf != buffer)
3732  ReleaseBuffer(newbuf);
3733  ReleaseBuffer(buffer);
3734  if (BufferIsValid(vmbuffer_new))
3735  ReleaseBuffer(vmbuffer_new);
3736  if (BufferIsValid(vmbuffer))
3737  ReleaseBuffer(vmbuffer);
3738 
3739  /*
3740  * Release the lmgr tuple lock, if we had it.
3741  */
3742  if (have_tuple_lock)
3743  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3744 
3745  pgstat_count_heap_update(relation, use_hot_update);
3746 
3747  /*
3748  * If heaptup is a private copy, release it. Don't forget to copy t_self
3749  * back to the caller's image, too.
3750  */
3751  if (heaptup != newtup)
3752  {
3753  newtup->t_self = heaptup->t_self;
3754  heap_freetuple(heaptup);
3755  }
3756 
3757  if (old_key_tuple != NULL && old_key_copied)
3758  heap_freetuple(old_key_tuple);
3759 
3760  bms_free(hot_attrs);
3761  bms_free(key_attrs);
3762  bms_free(id_attrs);
3763  bms_free(modified_attrs);
3764  bms_free(interesting_attrs);
3765 
3766  return TM_Ok;
3767 }
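/*
 * [Editorial sketch -- not part of heapam.c.]  heap_update() above avoids
 * deadlock between backends that need content locks on two pages by always
 * taking the lower-numbered page's lock first (relying on
 * RelationGetBufferForTuple to honor the same rule).  A minimal,
 * self-contained illustration of that ordering rule; demo_page_lock[] and
 * lock_two_pages() are hypothetical stand-ins for buffer content locks, and
 * each mutex would need pthread_mutex_init() before use.
 */
#include <pthread.h>

#define DEMO_N_PAGES 1024

static pthread_mutex_t demo_page_lock[DEMO_N_PAGES];

/* Acquire content locks on two pages, lower-numbered page first. */
static void
lock_two_pages(int page_a, int page_b)
{
	int		lo = (page_a < page_b) ? page_a : page_b;
	int		hi = (page_a < page_b) ? page_b : page_a;

	pthread_mutex_lock(&demo_page_lock[lo]);
	if (hi != lo)
		pthread_mutex_lock(&demo_page_lock[hi]);
}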
3768 
3769 /*
3770  * Check if the specified attribute's value is same in both given tuples.
3771  * Subroutine for HeapDetermineModifiedColumns.
3772  */
3773 static bool
3774 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
3775  HeapTuple tup1, HeapTuple tup2)
3776 {
3777  Datum value1,
3778  value2;
3779  bool isnull1,
3780  isnull2;
3781  Form_pg_attribute att;
3782 
3783  /*
3784  * If it's a whole-tuple reference, say "not equal". It's not really
3785  * worth supporting this case, since it could only succeed after a no-op
3786  * update, which is hardly a case worth optimizing for.
3787  */
3788  if (attrnum == 0)
3789  return false;
3790 
3791  /*
3792  * Likewise, automatically say "not equal" for any system attribute other
3793  * than tableOID; we cannot expect these to be consistent in a HOT chain,
3794  * or even to be set correctly yet in the new tuple.
3795  */
3796  if (attrnum < 0)
3797  {
3798  if (attrnum != TableOidAttributeNumber)
3799  return false;
3800  }
3801 
3802  /*
3803  * Extract the corresponding values. XXX this is pretty inefficient if
3804  * there are many indexed columns. Should HeapDetermineModifiedColumns do
3805  * a single heap_deform_tuple call on each tuple, instead? But that
3806  * doesn't work for system columns ...
3807  */
3808  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
3809  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
3810 
3811  /*
3812  * If one value is NULL and other is not, then they are certainly not
3813  * equal
3814  */
3815  if (isnull1 != isnull2)
3816  return false;
3817 
3818  /*
3819  * If both are NULL, they can be considered equal.
3820  */
3821  if (isnull1)
3822  return true;
3823 
3824  /*
3825  * We do simple binary comparison of the two datums. This may be overly
3826  * strict because there can be multiple binary representations for the
3827  * same logical value. But we should be OK as long as there are no false
3828  * positives. Using a type-specific equality operator is messy because
3829  * there could be multiple notions of equality in different operator
3830  * classes; furthermore, we cannot safely invoke user-defined functions
3831  * while holding exclusive buffer lock.
3832  */
3833  if (attrnum <= 0)
3834  {
3835  /* The only allowed system columns are OIDs, so do this */
3836  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3837  }
3838  else
3839  {
3840  Assert(attrnum <= tupdesc->natts);
3841  att = TupleDescAttr(tupdesc, attrnum - 1);
3842  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3843  }
3844 }
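/*
 * [Editorial sketch -- not part of heapam.c.]  heap_tuple_attr_equals() above
 * deliberately compares raw bytes rather than invoking type-specific equality
 * operators.  The hypothetical helper below shows that style of comparison in
 * isolation: pass-by-value data is compared as an integer, pass-by-reference
 * data with memcmp() over a known length (no varlena/TOAST handling).
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct SimpleDatum
{
	bool		byval;			/* stored inline rather than by pointer? */
	size_t		len;			/* storage length in bytes */
	uintptr_t	value;			/* the value itself, or a pointer to it */
} SimpleDatum;

static bool
simple_datum_is_equal(const SimpleDatum *a, const SimpleDatum *b)
{
	if (a->byval != b->byval || a->len != b->len)
		return false;
	if (a->byval)
		return a->value == b->value;	/* compare the bits directly */
	/* pass-by-reference: compare the pointed-to bytes */
	return memcmp((const void *) a->value, (const void *) b->value,
				  a->len) == 0;
}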
3845 
3846 /*
3847  * Check which columns are being updated.
3848  *
3849  * Given an updated tuple, determine (and return into the output bitmapset),
3850  * from those listed as interesting, the set of columns that changed.
3851  *
3852  * The input bitmapset is destructively modified; that is OK since this is
3853  * invoked at most once in heap_update.
3854  */
3855 static Bitmapset *
3856 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
3857  HeapTuple oldtup, HeapTuple newtup)
3858 {
3859  int attnum;
3860  Bitmapset *modified = NULL;
3861 
3862  while ((attnum = bms_first_member(interesting_cols)) >= 0)
3863  {
3864  attnum += FirstLowInvalidHeapAttributeNumber;
3865 
3866  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
3867  attnum, oldtup, newtup))
3868  modified = bms_add_member(modified,
3869  attnum - FirstLowInvalidHeapAttributeNumber);
3870  }
3871 
3872  return modified;
3873 }
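/*
 * [Editorial sketch -- not part of heapam.c.]  The same idea as
 * HeapDetermineModifiedColumns() with plain C types: walk the caller's set of
 * "interesting" columns and report the ones whose old and new values differ.
 * columns_modified(), the 32-column limit and the int rows are hypothetical
 * simplifications of Bitmapset and heap tuples.
 */
#include <stdint.h>

/* Return a bitmask of the interesting columns (0..31) whose values changed. */
static uint32_t
columns_modified(uint32_t interesting, const int *oldrow, const int *newrow)
{
	uint32_t	modified = 0;

	for (int col = 0; col < 32; col++)
	{
		if ((interesting & ((uint32_t) 1 << col)) == 0)
			continue;			/* caller does not care about this column */
		if (oldrow[col] != newrow[col])
			modified |= (uint32_t) 1 << col;
	}
	return modified;
}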
3874 
3875 /*
3876  * simple_heap_update - replace a tuple
3877  *
3878  * This routine may be used to update a tuple when concurrent updates of
3879  * the target tuple are not expected (for example, because we have a lock
3880  * on the relation associated with the tuple). Any failure is reported
3881  * via ereport().
3882  */
3883 void
3884 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
3885 {
3886  TM_Result result;
3887  TM_FailureData tmfd;
3888  LockTupleMode lockmode;
3889 
3890  result = heap_update(relation, otid, tup,
3891  GetCurrentCommandId(true), InvalidSnapshot,
3892  true /* wait for commit */ ,
3893  &tmfd, &lockmode);
3894  switch (result)
3895  {
3896  case TM_SelfModified:
3897  /* Tuple was already updated in current command? */
3898  elog(ERROR, "tuple already updated by self");
3899  break;
3900 
3901  case TM_Ok:
3902  /* done successfully */
3903  break;
3904 
3905  case TM_Updated:
3906  elog(ERROR, "tuple concurrently updated");
3907  break;
3908 
3909  case TM_Deleted:
3910  elog(ERROR, "tuple concurrently deleted");
3911  break;
3912 
3913  default:
3914  elog(ERROR, "unrecognized heap_update status: %u", result);
3915  break;
3916  }
3917 }
3918 
3919 
3920 /*
3921  * Return the MultiXactStatus corresponding to the given tuple lock mode.
3922  */
3923 static MultiXactStatus
3924 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
3925 {
3926  int retval;
3927 
3928  if (is_update)
3929  retval = tupleLockExtraInfo[mode].updstatus;
3930  else
3931  retval = tupleLockExtraInfo[mode].lockstatus;
3932 
3933  if (retval == -1)
3934  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
3935  is_update ? "true" : "false");
3936 
3937  return (MultiXactStatus) retval;
3938 }
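/*
 * [Editorial sketch -- not part of heapam.c.]  get_mxact_status_for_lock()
 * above is a table lookup with -1 as the "no such combination" sentinel.  A
 * stripped-down version of that pattern; the demo_* enum, table and helper
 * are hypothetical, not the real tupleLockExtraInfo contents.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum demo_mode { MODE_KEY_SHARE, MODE_SHARE, MODE_NO_KEY_EXCL, MODE_EXCL };

static const struct
{
	int			lockstatus;		/* status when merely locking */
	int			updstatus;		/* status when updating; -1 if not allowed */
} demo_lock_info[] = {
	{0, -1},					/* key share: never an update */
	{1, -1},					/* share: never an update */
	{2, 4},						/* no-key exclusive */
	{3, 5},						/* exclusive */
};

static int
demo_status_for_lock(enum demo_mode mode, bool is_update)
{
	int			status = is_update ? demo_lock_info[mode].updstatus :
		demo_lock_info[mode].lockstatus;

	if (status == -1)
	{
		fprintf(stderr, "invalid lock mode %d/%d\n", (int) mode, (int) is_update);
		abort();
	}
	return status;
}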
3939 
3940 /*
3941  * heap_lock_tuple - lock a tuple in shared or exclusive mode
3942  *
3943  * Note that this acquires a buffer pin, which the caller must release.
3944  *
3945  * Input parameters:
3946  * relation: relation containing tuple (caller must hold suitable lock)
3947  * tid: TID of tuple to lock
3948  * cid: current command ID (used for visibility test, and stored into
3949  * tuple's cmax if lock is successful)
3950  * mode: indicates if shared or exclusive tuple lock is desired
3951  * wait_policy: what to do if tuple lock is not available
3952  * follow_updates: if true, follow the update chain to also lock descendant
3953  * tuples.
3954  *
3955  * Output parameters:
3956  * *tuple: all fields filled in
3957  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
3958  * *tmfd: filled in failure cases (see below)
3959  *
3960  * Function results are the same as the ones for table_tuple_lock().
3961  *
3962  * In the failure cases other than TM_Invisible, the routine fills
3963  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
3964  * if necessary), and t_cmax (the last only for TM_SelfModified,
3965  * since we cannot obtain cmax from a combocid generated by another
3966  * transaction).
3967  * See comments for struct TM_FailureData for additional info.
3968  *
3969  * See README.tuplock for a thorough explanation of this mechanism.
3970  */
3971 TM_Result
3972 heap_lock_tuple(Relation relation, HeapTuple tuple,
3973  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
3974  bool follow_updates,
3975  Buffer *buffer, TM_FailureData *tmfd)
3976 {
3977  TM_Result result;
3978  ItemPointer tid = &(tuple->t_self);
3979  ItemId lp;
3980  Page page;
3981  Buffer vmbuffer = InvalidBuffer;
3982  BlockNumber block;
3983  TransactionId xid,
3984  xmax;
3985  uint16 old_infomask,
3986  new_infomask,
3987  new_infomask2;
3988  bool first_time = true;
3989  bool skip_tuple_lock = false;
3990  bool have_tuple_lock = false;
3991  bool cleared_all_frozen = false;
3992 
3993  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3994  block = ItemPointerGetBlockNumber(tid);
3995 
3996  /*
3997  * Before locking the buffer, pin the visibility map page if it appears to
3998  * be necessary. Since we haven't got the lock yet, someone else might be
3999  * in the middle of changing this, so we'll need to recheck after we have
4000  * the lock.
4001  */
4002  if (PageIsAllVisible(BufferGetPage(*buffer)))
4003  visibilitymap_pin(relation, block, &vmbuffer);
4004 
4005  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4006 
4007  page = BufferGetPage(*buffer);
4008  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4009  Assert(ItemIdIsNormal(lp));
4010 
4011  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4012  tuple->t_len = ItemIdGetLength(lp);
4013  tuple->t_tableOid = RelationGetRelid(relation);
4014 
4015 l3:
4016  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4017 
4018  if (result == TM_Invisible)
4019  {
4020  /*
4021  * This is possible, but only when locking a tuple for ON CONFLICT
4022  * UPDATE. We return this value here rather than throwing an error in
4023  * order to give that case the opportunity to throw a more specific
4024  * error.
4025  */
4026  result = TM_Invisible;
4027  goto out_locked;
4028  }
4029  else if (result == TM_BeingModified ||
4030  result == TM_Updated ||
4031  result == TM_Deleted)
4032  {
4033  TransactionId xwait;
4034  uint16 infomask;
4035  uint16 infomask2;
4036  bool require_sleep;
4037  ItemPointerData t_ctid;
4038 
4039  /* must copy state data before unlocking buffer */
4040  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4041  infomask = tuple->t_data->t_infomask;
4042  infomask2 = tuple->t_data->t_infomask2;
4043  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4044 
4045  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4046 
4047  /*
4048  * If any subtransaction of the current top transaction already holds
4049  * a lock as strong as or stronger than what we're requesting, we
4050  * effectively hold the desired lock already. We *must* succeed
4051  * without trying to take the tuple lock, else we will deadlock
4052  * against anyone wanting to acquire a stronger lock.
4053  *
4054  * Note we only do this the first time we loop on the HTSU result;
4055  * there is no point in testing in subsequent passes, because
4056  * evidently our own transaction cannot have acquired a new lock after
4057  * the first time we checked.
4058  */
4059  if (first_time)
4060  {
4061  first_time = false;
4062 
4063  if (infomask & HEAP_XMAX_IS_MULTI)
4064  {
4065  int i;
4066  int nmembers;
4067  MultiXactMember *members;
4068 
4069  /*
4070  * We don't need to allow old multixacts here; if that had
4071  * been the case, HeapTupleSatisfiesUpdate would have returned
4072  * MayBeUpdated and we wouldn't be here.
4073  */
4074  nmembers =
4075  GetMultiXactIdMembers(xwait, &members, false,
4076  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4077 
4078  for (i = 0; i < nmembers; i++)
4079  {
4080  /* only consider members of our own transaction */
4081  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4082  continue;
4083 
4084  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4085  {
4086  pfree(members);
4087  result = TM_Ok;
4088  goto out_unlocked;
4089  }
4090  else
4091  {
4092  /*
4093  * Disable acquisition of the heavyweight tuple lock.
4094  * Otherwise, when promoting a weaker lock, we might
4095  * deadlock with another locker that has acquired the
4096  * heavyweight tuple lock and is waiting for our
4097  * transaction to finish.
4098  *
4099  * Note that in this case we still need to wait for
4100  * the multixact if required, to avoid acquiring
4101  * conflicting locks.
4102  */
4103  skip_tuple_lock = true;
4104  }
4105  }
4106 
4107  if (members)
4108  pfree(members);
4109  }
4110  else if (TransactionIdIsCurrentTransactionId(xwait))
4111  {
4112  switch (mode)
4113  {
4114  case LockTupleKeyShare:
4115  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4116  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4117  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4118  result = TM_Ok;
4119  goto out_unlocked;
4120  case LockTupleShare:
4121  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4122  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4123  {
4124  result = TM_Ok;
4125  goto out_unlocked;
4126  }
4127  break;
4128  case LockTupleNoKeyExclusive:
4129  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4130  {
4131  result = TM_Ok;
4132  goto out_unlocked;
4133  }
4134  break;
4135  case LockTupleExclusive:
4136  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4137  infomask2 & HEAP_KEYS_UPDATED)
4138  {
4139  result = TM_Ok;
4140  goto out_unlocked;
4141  }
4142  break;
4143  }
4144  }
4145  }
4146 
4147  /*
4148  * Initially assume that we will have to wait for the locking
4149  * transaction(s) to finish. We check various cases below in which
4150  * this can be turned off.
4151  */
4152  require_sleep = true;
4153  if (mode == LockTupleKeyShare)
4154  {
4155  /*
4156  * If we're requesting KeyShare, and there's no update present, we
4157  * don't need to wait. Even if there is an update, we can still
4158  * continue if the key hasn't been modified.
4159  *
4160  * However, if there are updates, we need to walk the update chain
4161  * to mark future versions of the row as locked, too. That way,
4162  * if somebody deletes that future version, we're protected
4163  * against the key going away. This locking of future versions
4164  * could block momentarily, if a concurrent transaction is
4165  * deleting a key; or it could return a value to the effect that
4166  * the transaction deleting the key has already committed. So we
4167  * do this before re-locking the buffer; otherwise this would be
4168  * prone to deadlocks.
4169  *
4170  * Note that the TID we're locking was grabbed before we unlocked
4171  * the buffer. For it to change while we're not looking, the
4172  * other properties we're testing for below after re-locking the
4173  * buffer would also change, in which case we would restart this
4174  * loop above.
4175  */
4176  if (!(infomask2 & HEAP_KEYS_UPDATED))
4177  {
4178  bool updated;
4179 
4180  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4181 
4182  /*
4183  * If there are updates, follow the update chain; bail out if
4184  * that cannot be done.
4185  */
4186  if (follow_updates && updated)
4187  {
4188  TM_Result res;
4189 
4190  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4191  GetCurrentTransactionId(),
4192  mode);
4193  if (res != TM_Ok)
4194  {
4195  result = res;
4196  /* recovery code expects to have buffer lock held */
4197  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4198  goto failed;
4199  }
4200  }
4201 
4202  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4203 
4204  /*
4205  * Make sure it's still an appropriate lock, else start over.
4206  * Also, if it wasn't updated before we released the lock, but
4207  * is updated now, we start over too; the reason is that we
4208  * now need to follow the update chain to lock the new
4209  * versions.
4210  */
4211  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4212  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4213  !updated))
4214  goto l3;
4215 
4216  /* Things look okay, so we can skip sleeping */
4217  require_sleep = false;
4218 
4219  /*
4220  * Note we allow Xmax to change here; other updaters/lockers
4221  * could have modified it before we grabbed the buffer lock.
4222  * However, this is not a problem, because with the recheck we
4223  * just did we ensure that they still don't conflict with the
4224  * lock we want.
4225  */
4226  }
4227  }
4228  else if (mode == LockTupleShare)
4229  {
4230  /*
4231  * If we're requesting Share, we can similarly avoid sleeping if
4232  * there's no update and no exclusive lock present.
4233  */
4234  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4235  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4236  {
4237  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4238 
4239  /*
4240  * Make sure it's still an appropriate lock, else start over.
4241  * See above about allowing xmax to change.
4242  */
4243  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4244  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4245  goto l3;
4246  require_sleep = false;
4247  }
4248  }
4249  else if (mode == LockTupleNoKeyExclusive)
4250  {
4251  /*
4252  * If we're requesting NoKeyExclusive, we might also be able to
4253  * avoid sleeping; just ensure that there is no conflicting lock
4254  * already acquired.
4255  */
4256  if (infomask & HEAP_XMAX_IS_MULTI)
4257  {
4258  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4259  mode, NULL))
4260  {
4261  /*
4262  * No conflict, but if the xmax changed under us in the
4263  * meantime, start over.
4264  */
4265  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4266  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4267  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4268  xwait))
4269  goto l3;
4270 
4271  /* otherwise, we're good */
4272  require_sleep = false;
4273  }
4274  }
4275  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4276  {
4277  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4278 
4279  /* if the xmax changed in the meantime, start over */
4280  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4281  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4282  xwait))
4284  goto l3;
4285  /* otherwise, we're good */
4286  require_sleep = false;
4287  }
4288  }
4289 
4290  /*
4291  * As a check independent from those above, we can also avoid sleeping
4292  * if the current transaction is the sole locker of the tuple. Note
4293  * that the strength of the lock already held is irrelevant; this is
4294  * not about recording the lock in Xmax (which will be done regardless
4295  * of this optimization, below). Also, note that the cases where we
4296  * hold a lock stronger than we are requesting are already handled
4297  * above by not doing anything.
4298  *
4299  * Note we only deal with the non-multixact case here; MultiXactIdWait
4300  * is well equipped to deal with this situation on its own.
4301  */
4302  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4303  TransactionIdIsCurrentTransactionId(xwait))
4304  {
4305  /* ... but if the xmax changed in the meantime, start over */
4306  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4307  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4308  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4309  xwait))
4310  goto l3;
4312  require_sleep = false;
4313  }
4314 
4315  /*
4316  * Time to sleep on the other transaction/multixact, if necessary.
4317  *
4318  * If the other transaction is an update/delete that's already
4319  * committed, then sleeping cannot possibly do any good: if we're
4320  * required to sleep, get out to raise an error instead.
4321  *
4322  * By here, we either have already acquired the buffer exclusive lock,
4323  * or we must wait for the locking transaction or multixact; so below
4324  * we ensure that we grab buffer lock after the sleep.
4325  */
4326  if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4327  {
4328  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4329  goto failed;
4330  }
4331  else if (require_sleep)
4332  {
4333  /*
4334  * Acquire tuple lock to establish our priority for the tuple, or
4335  * die trying. LockTuple will release us when we are next-in-line
4336  * for the tuple. We must do this even if we are share-locking,
4337  * but not if we already have a weaker lock on the tuple.
4338  *
4339  * If we are forced to "start over" below, we keep the tuple lock;
4340  * this arranges that we stay at the head of the line while
4341  * rechecking tuple state.
4342  */
4343  if (!skip_tuple_lock &&
4344  !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4345  &have_tuple_lock))
4346  {
4347  /*
4348  * This can only happen if wait_policy is Skip and the lock
4349  * couldn't be obtained.
4350  */
4351  result = TM_WouldBlock;
4352  /* recovery code expects to have buffer lock held */
4353  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4354  goto failed;
4355  }
4356 
4357  if (infomask & HEAP_XMAX_IS_MULTI)
4358  {
4359  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4360 
4361  /* We only ever lock tuples, never update them */
4362  if (status >= MultiXactStatusNoKeyUpdate)
4363  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4364 
4365  /* wait for multixact to end, or die trying */
4366  switch (wait_policy)
4367  {
4368  case LockWaitBlock:
4369  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4370  relation, &tuple->t_self, XLTW_Lock, NULL);
4371  break;
4372  case LockWaitSkip:
4373  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4374  status, infomask, relation,
4375  NULL))
4376  {
4377  result = TM_WouldBlock;
4378  /* recovery code expects to have buffer lock held */
4379  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4380  goto failed;
4381  }
4382  break;
4383  case LockWaitError:
4384  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4385  status, infomask, relation,
4386  NULL))
4387  ereport(ERROR,
4388  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4389  errmsg("could not obtain lock on row in relation \"%s\"",
4390  RelationGetRelationName(relation))));
4391 
4392  break;
4393  }
4394 
4395  /*
4396  * Of course, the multixact might not be done here: if we're
4397  * requesting a light lock mode, other transactions with light
4398  * locks could still be alive, as well as locks owned by our
4399  * own xact or other subxacts of this backend. We need to
4400  * preserve the surviving MultiXact members. Note that it
4401  * isn't absolutely necessary in the latter case, but doing so
4402  * is simpler.
4403  */
4404  }
4405  else
4406  {
4407  /* wait for regular transaction to end, or die trying */
4408  switch (wait_policy)
4409  {
4410  case LockWaitBlock:
4411  XactLockTableWait(xwait, relation, &tuple->t_self,
4412  XLTW_Lock);
4413  break;
4414  case LockWaitSkip:
4415  if (!ConditionalXactLockTableWait(xwait))
4416  {
4417  result = TM_WouldBlock;
4418  /* recovery code expects to have buffer lock held */
4419  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4420  goto failed;
4421  }
4422  break;
4423  case LockWaitError:
4424  if (!ConditionalXactLockTableWait(xwait))
4425  ereport(ERROR,
4426  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4427  errmsg("could not obtain lock on row in relation \"%s\"",
4428  RelationGetRelationName(relation))));
4429  break;
4430  }
4431  }
4432 
4433  /* if there are updates, follow the update chain */
4434  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4435  {
4436  TM_Result res;
4437 
4438  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4439  GetCurrentTransactionId(),
4440  mode);
4441  if (res != TM_Ok)
4442  {
4443  result = res;
4444  /* recovery code expects to have buffer lock held */
4445  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4446  goto failed;
4447  }
4448  }
4449 
4450  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4451 
4452  /*
4453  * xwait is done, but if xwait had just locked the tuple then some
4454  * other xact could update this tuple before we get to this point.
4455  * Check for xmax change, and start over if so.
4456  */
4457  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4458  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4459  xwait))
4460  goto l3;
4461 
4462  if (!(infomask & HEAP_XMAX_IS_MULTI))
4463  {
4464  /*
4465  * Otherwise check if it committed or aborted. Note we cannot
4466  * be here if the tuple was only locked by somebody who didn't
4467  * conflict with us; that would have been handled above. So
4468  * that transaction must necessarily be gone by now. But
4469  * don't check for this in the multixact case, because some
4470  * locker transactions might still be running.
4471  */
4472  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4473  }
4474  }
4475 
4476  /* By here, we're certain that we hold buffer exclusive lock again */
4477 
4478  /*
4479  * We may lock if previous xmax aborted, or if it committed but only
4480  * locked the tuple without updating it; or if we didn't have to wait
4481  * at all for whatever reason.
4482  */
4483  if (!require_sleep ||
4484  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4485  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4486  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4487  result = TM_Ok;
4488  else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid) ||
4489  HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data))
4490  result = TM_Updated;
4491  else
4492  result = TM_Deleted;
4493  }
4494 
4495 failed:
4496  if (result != TM_Ok)
4497  {
4498  Assert(result == TM_SelfModified || result == TM_Updated ||
4499  result == TM_Deleted || result == TM_WouldBlock);
4500  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4501  Assert(result != TM_Updated ||
4502  !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4503  tmfd->ctid = tuple->t_data->t_ctid;
4504  tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4505  if (result == TM_SelfModified)
4506  tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4507  else
4508  tmfd->cmax = InvalidCommandId;
4509  goto out_locked;
4510  }
4511 
4512  /*
4513  * If we didn't pin the visibility map page and the page has become all
4514  * visible while we were busy locking the buffer, or during some
4515  * subsequent window during which we had it unlocked, we'll have to unlock
4516  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4517  * unfortunate, especially since we'll now have to recheck whether the
4518  * tuple has been locked or updated under us, but hopefully it won't
4519  * happen very often.
4520  */
4521  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4522  {
4523  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4524  visibilitymap_pin(relation, block, &vmbuffer);
4525  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4526  goto l3;
4527  }
4528 
4529  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4530  old_infomask = tuple->t_data->t_infomask;
4531 
4532  /*
4533  * If this is the first possibly-multixact-able operation in the current
4534  * transaction, set my per-backend OldestMemberMXactId setting. We can be
4535  * certain that the transaction will never become a member of any older
4536  * MultiXactIds than that. (We have to do this even if we end up just
4537  * using our own TransactionId below, since some other backend could
4538  * incorporate our XID into a MultiXact immediately afterwards.)
4539  */
4540  MultiXactIdSetOldestMember();
4541 
4542  /*
4543  * Compute the new xmax and infomask to store into the tuple. Note we do
4544  * not modify the tuple just yet, because that would leave it in the wrong
4545  * state if multixact.c elogs.
4546  */
4547  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4548  GetCurrentTransactionId(), mode, false,
4549  &xid, &new_infomask, &new_infomask2);
4550 
4551  START_CRIT_SECTION();
4552 
4553  /*
4554  * Store transaction information of xact locking the tuple.
4555  *
4556  * Note: Cmax is meaningless in this context, so don't set it; this avoids
4557  * possibly generating a useless combo CID. Moreover, if we're locking a
4558  * previously updated tuple, it's important to preserve the Cmax.
4559  *
4560  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4561  * we would break the HOT chain.
4562  */
4563  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4564  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4565  tuple->t_data->t_infomask |= new_infomask;
4566  tuple->t_data->t_infomask2 |= new_infomask2;
4567  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4568  HeapTupleHeaderClearHotUpdated(tuple->t_data);
4569  HeapTupleHeaderSetXmax(tuple->t_data, xid);
4570 
4571  /*
4572  * Make sure there is no forward chain link in t_ctid. Note that in the
4573  * cases where the tuple has been updated, we must not overwrite t_ctid,
4574  * because it was set by the updater. Moreover, if the tuple has been
4575  * updated, we need to follow the update chain to lock the new versions of
4576  * the tuple as well.
4577  */
4578  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4579  tuple->t_data->t_ctid = *tid;
4580 
4581  /* Clear only the all-frozen bit on visibility map if needed */
4582  if (PageIsAllVisible(page) &&
4583  visibilitymap_clear(relation, block, vmbuffer,
4584  VISIBILITYMAP_ALL_FROZEN))
4585  cleared_all_frozen = true;
4586 
4587 
4588  MarkBufferDirty(*buffer);
4589 
4590  /*
4591  * XLOG stuff. You might think that we don't need an XLOG record because
4592  * there is no state change worth restoring after a crash. You would be
4593  * wrong however: we have just written either a TransactionId or a
4594  * MultiXactId that may never have been seen on disk before, and we need
4595  * to make sure that there are XLOG entries covering those ID numbers.
4596  * Else the same IDs might be re-used after a crash, which would be
4597  * disastrous if this page made it to disk before the crash. Essentially
4598  * we have to enforce the WAL log-before-data rule even in this case.
4599  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4600  * entries for everything anyway.)
4601  */
4602  if (RelationNeedsWAL(relation))
4603  {
4604  xl_heap_lock xlrec;
4605  XLogRecPtr recptr;
4606 
4607  XLogBeginInsert();
4608  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4609 
4610  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4611  xlrec.locking_xid = xid;
4612  xlrec.infobits_set = compute_infobits(new_infomask,
4613  tuple->t_data->t_infomask2);
4614  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4615  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4616 
4617  /* we don't decode row locks atm, so no need to log the origin */
4618 
4619  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4620 
4621  PageSetLSN(page, recptr);
4622  }
4623 
4624  END_CRIT_SECTION();
4625 
4626  result = TM_Ok;
4627 
4628 out_locked:
4629  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4630 
4631 out_unlocked:
4632  if (BufferIsValid(vmbuffer))
4633  ReleaseBuffer(vmbuffer);
4634 
4635  /*
4636  * Don't update the visibility map here. Locking a tuple doesn't change
4637  * visibility info.
4638  */
4639 
4640  /*
4641  * Now that we have successfully marked the tuple as locked, we can
4642  * release the lmgr tuple lock, if we had it.
4643  */
4644  if (have_tuple_lock)
4645  UnlockTupleTuplock(relation, tid, mode);
4646 
4647  return result;
4648 }
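/*
 * [Editorial sketch -- not part of heapam.c.]  heap_lock_tuple() above keeps
 * releasing the buffer lock to sleep, then re-locks and jumps back to l3
 * whenever xmax or the infomask changed underneath it.  The loop below shows
 * that "snapshot state, wait unlocked, recheck, retry" shape in a
 * self-contained form; demo_xmax, demo_content_lock and demo_wait_for() are
 * hypothetical stand-ins for the tuple's xmax, the buffer content lock and
 * XactLockTableWait().
 */
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t demo_content_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t demo_xmax;		/* protected by demo_content_lock */

static void
demo_wait_for(uint64_t observed)
{
	(void) observed;			/* stand-in for waiting on another transaction */
}

static void
demo_lock_with_recheck(void)
{
	for (;;)
	{
		uint64_t	observed;

		pthread_mutex_lock(&demo_content_lock);
		observed = demo_xmax;	/* copy state before unlocking */
		pthread_mutex_unlock(&demo_content_lock);

		demo_wait_for(observed);	/* sleep without holding the content lock */

		pthread_mutex_lock(&demo_content_lock);
		if (demo_xmax == observed)
			break;				/* nothing changed: proceed with lock held */
		pthread_mutex_unlock(&demo_content_lock);	/* changed: start over */
	}
	/* ... do the work that required a stable xmax ... */
	pthread_mutex_unlock(&demo_content_lock);
}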
4649 
4650 /*
4651  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4652  * its normal, Xmax-based tuple lock.
4653  *
4654  * have_tuple_lock is an input and output parameter: on input, it indicates
4655  * whether the lock has previously been acquired (and this function does
4656  * nothing in that case). If this function returns success, have_tuple_lock
4657  * has been flipped to true.
4658  *
4659  * Returns false if it was unable to obtain the lock; this can only happen if
4660  * wait_policy is Skip.
4661  */
4662 static bool
4663 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4664  LockWaitPolicy wait_policy, bool *have_tuple_lock)
4665 {
4666  if (*have_tuple_lock)
4667  return true;
4668 
4669  switch (wait_policy)
4670  {
4671  case LockWaitBlock:
4672  LockTupleTuplock(relation, tid, mode);
4673  break;
4674 
4675  case LockWaitSkip:
4676  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4677  return false;
4678  break;
4679 
4680  case LockWaitError:
4681  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4682  ereport(ERROR,
4683  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4684  errmsg("could not obtain lock on row in relation \"%s\"",
4685  RelationGetRelationName(relation))));
4686  break;
4687  }
4688  *have_tuple_lock = true;
4689 
4690  return true;
4691 }
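/*
 * [Editorial sketch -- not part of heapam.c.]  The three LockWaitPolicy cases
 * handled above map onto blocking lock, try-lock, and try-lock-or-error.  A
 * self-contained version of that switch using a pthread mutex; the demo_*
 * names are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum demo_wait_policy { DEMO_WAIT_BLOCK, DEMO_WAIT_SKIP, DEMO_WAIT_ERROR };

/* Returns true if the lock was obtained; false only for DEMO_WAIT_SKIP. */
static bool
demo_acquire(pthread_mutex_t *lock, enum demo_wait_policy policy)
{
	switch (policy)
	{
		case DEMO_WAIT_BLOCK:
			pthread_mutex_lock(lock);
			return true;
		case DEMO_WAIT_SKIP:
			return pthread_mutex_trylock(lock) == 0;
		case DEMO_WAIT_ERROR:
			if (pthread_mutex_trylock(lock) != 0)
			{
				fprintf(stderr, "could not obtain lock\n");
				exit(1);
			}
			return true;
	}
	return false;				/* not reached */
}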
4692 
4693 /*
4694  * Given an original set of Xmax and infomask, and a transaction (identified by
4695  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4696  * corresponding infomasks to use on the tuple.
4697  *
4698  * Note that this might have side effects such as creating a new MultiXactId.
4699  *
4700  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4701  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4702  * but it was not running anymore. There is a race condition, which is that the
4703  * MultiXactId may have finished since then, but that uncommon case is handled
4704  * either here, or within MultiXactIdExpand.
4705  *
4706  * There is a similar race condition possible when the old xmax was a regular
4707  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4708  * window, but it's still possible to end up creating an unnecessary
4709  * MultiXactId. Fortunately this is harmless.
4710  */
4711 static void
4712 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4713  uint16 old_infomask2, TransactionId add_to_xmax,
4714  LockTupleMode mode, bool is_update,
4715  TransactionId *result_xmax, uint16 *result_infomask,
4716  uint16 *result_infomask2)
4717 {
4718  TransactionId new_xmax;
4719  uint16 new_infomask,
4720  new_infomask2;
4721 
4721 
4722  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4723 
4724 l5:
4725  new_infomask = 0;
4726  new_infomask2 = 0;
4727  if (old_infomask & HEAP_XMAX_INVALID)
4728  {
4729  /*
4730  * No previous locker; we just insert our own TransactionId.
4731  *
4732  * Note that it's critical that this case be the first one checked,
4733  * because there are several blocks below that come back to this one
4734  * to implement certain optimizations; old_infomask might contain
4735  * other dirty bits in those cases, but we don't really care.
4736  */
4737  if (is_update)
4738  {
4739  new_xmax = add_to_xmax;
4740  if (mode == LockTupleExclusive)
4741  new_infomask2 |= HEAP_KEYS_UPDATED;
4742  }
4743  else
4744  {
4745  new_infomask |= HEAP_XMAX_LOCK_ONLY;
4746  switch (mode)
4747  {
4748  case LockTupleKeyShare:
4749  new_xmax = add_to_xmax;
4750  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4751  break;
4752  case LockTupleShare:
4753  new_xmax = add_to_xmax;
4754  new_infomask |= HEAP_XMAX_SHR_LOCK;
4755  break;
4756  case LockTupleNoKeyExclusive:
4757  new_xmax = add_to_xmax;
4758  new_infomask |= HEAP_XMAX_EXCL_LOCK;
4759  break;
4760  case LockTupleExclusive:
4761  new_xmax = add_to_xmax;
4762  new_infomask |= HEAP_XMAX_EXCL_LOCK;
4763  new_infomask2 |= HEAP_KEYS_UPDATED;
4764  break;
4765  default:
4766  new_xmax = InvalidTransactionId; /* silence compiler */
4767  elog(ERROR, "invalid lock mode");
4768  }
4769  }
4770  }
4771  else if (old_infomask & HEAP_XMAX_IS_MULTI)
4772  {
4773  MultiXactStatus new_status;
4774 
4775  /*
4776  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4777  * cross-check.
4778  */
4779  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4780 
4781  /*
4782  * A multixact together with LOCK_ONLY set but neither lock bit set
4783  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4784  * anymore. This check is critical for databases upgraded by
4785  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4786  * that such multis are never passed.
4787  */
4788  if (HEAP_LOCKED_UPGRADED(old_infomask))
4789  {
4790  old_infomask &= ~HEAP_XMAX_IS_MULTI;
4791  old_infomask |= HEAP_XMAX_INVALID;
4792  goto l5;
4793  }
4794 
4795  /*
4796  * If the XMAX is already a MultiXactId, then we need to expand it to
4797  * include add_to_xmax; but if all the members were lockers and are
4798  * all gone, we can do away with the IS_MULTI bit and just set
4799  * add_to_xmax as the only locker/updater. If all lockers are gone
4800  * and we have an updater that aborted, we can also do without a
4801  * multi.
4802  *
4803  * The cost of doing GetMultiXactIdMembers would be paid by
4804  * MultiXactIdExpand if we weren't to do this, so this check is not
4805  * incurring extra work anyhow.
4806  */
4807  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4808  {
4809  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4810  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
4811  old_infomask)))
4812  {
4813  /*
4814  * Reset these bits and restart; otherwise fall through to
4815  * create a new multi below.
4816  */
4817  old_infomask &= ~HEAP_XMAX_IS_MULTI;
4818  old_infomask |= HEAP_XMAX_INVALID;
4819  goto l5;
4820  }
4821  }
4822 
4823  new_status = get_mxact_status_for_lock(mode, is_update);
4824 
4825  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
4826  new_status);
4827  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4828  }
4829  else if (old_infomask & HEAP_XMAX_COMMITTED)
4830  {
4831  /*
4832  * It's a committed update, so we need to preserve him as updater of
4833  * the tuple.
4834  */
4835  MultiXactStatus status;
4836  MultiXactStatus new_status;
4837 
4838  if (old_infomask2 & HEAP_KEYS_UPDATED)
4839  status = MultiXactStatusUpdate;
4840  else
4841  status = MultiXactStatusNoKeyUpdate;
4842 
4843  new_status = get_mxact_status_for_lock(mode, is_update);
4844 
4845  /*
4846  * since it's not running, it's obviously impossible for the old
4847  * updater to be identical to the current one, so we need not check
4848  * for that case as we do in the block above.
4849  */
4850  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4851  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4852  }
4853  else if (TransactionIdIsInProgress(xmax))
4854  {
4855  /*
4856  * If the XMAX is a valid, in-progress TransactionId, then we need to
4857  * create a new MultiXactId that includes both the old locker or
4858  * updater and our own TransactionId.
4859  */
4860  MultiXactStatus new_status;
4861  MultiXactStatus old_status;
4862  LockTupleMode old_mode;
4863 
4864  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
4865  {
4866  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
4867  old_status = MultiXactStatusForKeyShare;
4868  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
4869  old_status = MultiXactStatusForShare;
4870  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
4871  {
4872  if (old_infomask2 & HEAP_KEYS_UPDATED)
4873  old_status = MultiXactStatusForUpdate;
4874  else
4875  old_status = MultiXactStatusForNoKeyUpdate;
4876  }
4877  else
4878  {
4879  /*
4880  * LOCK_ONLY can be present alone only when a page has been
4881  * upgraded by pg_upgrade. But in that case,
4882  * TransactionIdIsInProgress() should have returned false. We
4883  * assume it's no longer locked in this case.
4884  */
4885  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
4886  old_infomask |= HEAP_XMAX_INVALID;
4887  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
4888  goto l5;
4889  }
4890  }
4891  else
4892  {
4893  /* it's an update, but which kind? */
4894  if (old_infomask2 & HEAP_KEYS_UPDATED)
4895  old_status = MultiXactStatusUpdate;
4896  else
4897  old_status = MultiXactStatusNoKeyUpdate;
4898  }
4899 
4900  old_mode = TUPLOCK_from_mxstatus(old_status);
4901 
4902  /*
4903  * If the lock to be acquired is for the same TransactionId as the
4904  * existing lock, there's an optimization possible: consider only the
4905  * strongest of both locks as the only one present, and restart.
4906  */
4907  if (xmax == add_to_xmax)
4908  {
4909  /*
4910  * Note that it's not possible for the original tuple to be
4911  * updated: we wouldn't be here because the tuple would have been
4912  * invisible and we wouldn't try to update it. As a subtlety,
4913  * this code can also run when traversing an update chain to lock
4914  * future versions of a tuple. But we wouldn't be here either,
4915  * because the add_to_xmax would be different from the original
4916  * updater.
4917  */
4918  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
4919 
4920  /* acquire the strongest of both */
4921  if (mode < old_mode)
4922  mode = old_mode;
4923  /* mustn't touch is_update */
4924 
4925  old_infomask |= HEAP_XMAX_INVALID;
4926  goto l5;
4927  }
4928 
4929  /* otherwise, just fall back to creating a new multixact */
4930  new_status = get_mxact_status_for_lock(mode, is_update);
4931  new_xmax = MultiXactIdCreate(xmax, old_status,
4932  add_to_xmax, new_status);
4933  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4934  }
4935  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
4936  TransactionIdDidCommit(xmax))
4937  {
4938  /*
4939  * It's a committed update, so we gotta preserve him as updater of the
4940  * tuple.
4941  */
4942  MultiXactStatus status;
4943  MultiXactStatus new_status;
4944 
4945  if (old_infomask2 & HEAP_KEYS_UPDATED)
4946  status = MultiXactStatusUpdate;
4947  else
4948  status = MultiXactStatusNoKeyUpdate;
4949 
4950  new_status = get_mxact_status_for_lock(mode, is_update);
4951 
4952  /*
4953  * since it's not running, it's obviously impossible for the old
4954  * updater to be identical to the current one, so we need not check
4955  * for that case as we do in the block above.
4956  */
4957  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4958  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4959  }
4960  else
4961  {
4962  /*
4963  * Can get here iff the locking/updating transaction was running when
4964  * the infomask was extracted from the tuple, but finished before
4965  * TransactionIdIsInProgress got to run. Deal with it as if there was
4966  * no locker at all in the first place.
4967  */
4968  old_infomask |= HEAP_XMAX_INVALID;
4969  goto l5;
4970  }
4971 
4972  *result_infomask = new_infomask;
4973  *result_infomask2 = new_infomask2;
4974  *result_xmax = new_xmax;
4975 }
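/*
 * [Editorial sketch -- not part of heapam.c.]  compute_new_xmax_infomask()
 * above never touches the tuple itself; it hands back a new xmax plus
 * infomask bits, and the caller applies them by clearing the old lock bits
 * and OR-ing in the new ones.  A tiny version of that clear-then-set pattern
 * with hypothetical flag values (not the real HEAP_XMAX_* constants).
 */
#include <stdint.h>

#define DEMO_XMAX_LOCK_BITS		0x00F0	/* all lock-strength bits */
#define DEMO_XMAX_SHR_LOCK		0x0010
#define DEMO_XMAX_EXCL_LOCK		0x0040

static void
demo_apply_new_lock(uint16_t *infomask, uint16_t new_bits)
{
	*infomask &= (uint16_t) ~DEMO_XMAX_LOCK_BITS;	/* drop stale lock bits */
	*infomask |= new_bits;		/* install the freshly computed ones */
}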
4976 
4977 /*
4978  * Subroutine for heap_lock_updated_tuple_rec.
4979  *
4980  * Given a hypothetical multixact status held by the transaction identified
4981  * with the given xid, does the current transaction need to wait, fail, or can
4982  * it continue if it wanted to acquire a lock of the given mode? "needwait"
4983  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
4984  * returned. If the lock is already held by the current transaction, return
4985  * TM_SelfModified. In case of a conflict with another transaction, a
4986  * different HeapTupleSatisfiesUpdate return code is returned.
4987  *
4988  * The held status is said to be hypothetical because it might correspond to a
4989  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
4990  * way for simplicity of API.
4991  */
4992 static TM_Result
4993 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
4994  LockTupleMode mode, HeapTuple tup,
4995  bool *needwait)
4996 {
4997  MultiXactStatus wantedstatus;
4998 
4999  *needwait = false;
5000  wantedstatus = get_mxact_status_for_lock(mode, false);
5001 
5002  /*
5003  * Note: we *must* check TransactionIdIsInProgress before
5004  * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5005  * for an explanation.
5006  */
5007  if (TransactionIdIsCurrentTransactionId(xid))
5008  {
5009  /*
5010  * The tuple has already been locked by our own transaction. This is
5011  * very rare but can happen if multiple transactions are trying to
5012  * lock an ancient version of the same tuple.
5013  */
5014  return TM_SelfModified;
5015  }
5016  else if (TransactionIdIsInProgress(xid))
5017  {
5018  /*
5019  * If the locking transaction is running, what we do depends on
5020  * whether the lock modes conflict: if they do, then we must wait for
5021  * it to finish; otherwise we can fall through to lock this tuple
5022  * version without waiting.
5023  */
5024  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5025  LOCKMODE_from_mxstatus(wantedstatus)))
5026  {
5027  *needwait = true;
5028  }
5029 
5030  /*
5031  * If we set needwait above, then this value doesn't matter;
5032  * otherwise, this value signals to caller that it's okay to proceed.
5033  */
5034  return TM_Ok;
5035  }
5036  else if (TransactionIdDidAbort(xid))
5037  return TM_Ok;
5038  else if (TransactionIdDidCommit(xid))
5039  {
5040  /*
5041  * The other transaction committed. If it was only a locker, then the
5042  * lock is completely gone now and we can return success; but if it
5043  * was an update, then what we do depends on whether the two lock
5044  * modes conflict. If they conflict, then we must report error to
5045  * caller. But if they don't, we can fall through to allow the current
5046  * transaction to lock the tuple.
5047  *
5048  * Note: the reason we worry about ISUPDATE here is because as soon as
5049  * a transaction ends, all its locks are gone and meaningless, and
5050  * thus we can ignore them; whereas its updates persist. In the
5051  * TransactionIdIsInProgress case, above, we don't need to check
5052  * because we know the lock is still "alive" and thus a conflict needs
5053  * always be checked.
5054  */
5055  if (!ISUPDATE_from_mxstatus(status))
5056  return TM_Ok;
5057 
5058  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5059  LOCKMODE_from_mxstatus(wantedstatus)))
5060  {
5061  /* bummer */
5062  if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid) ||
5063  HeapTupleHeaderIndicatesMovedPartitions(tup->t_data))
5064  return TM_Updated;
5065  else
5066  return TM_Deleted;
5067  }
5068 
5069  return TM_Ok;
5070  }
5071 
5072  /* Not in progress, not aborted, not committed -- must have crashed */
5073  return TM_Ok;
5074 }
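/*
 * [Editorial sketch -- not part of heapam.c.]  The conflict decision above
 * ultimately reduces to a symmetric table indexed by the held and wanted lock
 * strengths, much like DoLockModesConflict() does for heavyweight locks.  The
 * 4x4 table below mirrors the documented row-level lock conflicts (key share
 * conflicts only with exclusive, and so on); the demo_* names are
 * hypothetical.
 */
#include <stdbool.h>

enum demo_tuple_lock { DEMO_KEY_SHARE, DEMO_SHARE, DEMO_NO_KEY_EXCL, DEMO_EXCL };

static const bool demo_conflicts[4][4] = {
	/*               keyshr share  nokeyx excl */
	/* keyshr */	{false, false, false, true},
	/* share  */	{false, false, true, true},
	/* nokeyx */	{false, true, true, true},
	/* excl   */	{true, true, true, true},
};

static bool
demo_lock_conflicts(enum demo_tuple_lock held, enum demo_tuple_lock wanted)
{
	return demo_conflicts[held][wanted];
}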
5075 
5076 
5077 /*
5078  * Recursive part of heap_lock_updated_tuple
5079  *
5080  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5081  * xid with the given mode; if this tuple is updated, recurse to lock the new
5082  * version as well.
5083  */
5084 static TM_Result
5085 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5086  LockTupleMode mode)
5087 {
5088  TM_Result result;
5089  ItemPointerData tupid;
5090  HeapTupleData mytup;
5091  Buffer buf;
5092  uint16 new_infomask,
5093  new_infomask2,
5094  old_infomask,
5095  old_infomask2;
5096  TransactionId xmax,
5097  new_xmax;
5098  TransactionId priorXmax = InvalidTransactionId;
5099  bool cleared_all_frozen = false;
5100  bool pinned_desired_page;
5101  Buffer vmbuffer = InvalidBuffer;
5102  BlockNumber block;
5103 
5104  ItemPointerCopy(tid, &tupid);
5105 
5106  for (;;)
5107  {
5108  new_infomask = 0;
5109  new_xmax = InvalidTransactionId;
5110  block = ItemPointerGetBlockNumber(&tupid);
5111  ItemPointerCopy(&tupid, &(mytup.t_self));
5112 
5113  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5114  {
5115  /*
5116  * if we fail to find the updated version of the tuple, it's
5117  * because it was vacuumed/pruned away after its creator
5118  * transaction aborted. So behave as if we got to the end of the
5119  * chain, and there's no further tuple to lock: return success to
5120  * caller.
5121  */
5122  result = TM_Ok;
5123  goto out_unlocked;
5124  }
5125 
5126 l4:
5127  CHECK_FOR_INTERRUPTS();
5128 
5129  /*
5130  * Before locking the buffer, pin the visibility map page if it
5131  * appears to be necessary. Since we haven't got the lock yet,
5132  * someone else might be in the middle of changing this, so we'll need
5133  * to recheck after we have the lock.
5134  */
5135  if (PageIsAllVisible(BufferGetPage(buf)))
5136  {
5137  visibilitymap_pin(rel, block, &vmbuffer);
5138  pinned_desired_page = true;
5139  }
5140  else
5141  pinned_desired_page = false;
5142 
5143  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5144 
5145  /*
5146  * If we didn't pin the visibility map page and the page has become
5147  * all visible while we were busy locking the buffer, we'll have to
5148  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5149  * That's a bit unfortunate, but hopefully shouldn't happen often.
5150  *
5151  * Note: in some paths through this function, we will reach here
5152  * holding a pin on a vm page that may or may not be the one matching
5153  * this page. If this page isn't all-visible, we won't use the vm
5154  * page, but we hold onto such a pin till the end of the function.
5155  */
5156  if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5157  {
5158  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5159  visibilitymap_pin(rel, block, &vmbuffer);
5160  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5161  }
5162 
5163  /*
5164  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5165  * end of the chain, we're done, so return success.
5166  */
5167  if (TransactionIdIsValid(priorXmax) &&
5168  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5169  priorXmax))
5170  {
5171  result = TM_Ok;
5172  goto out_locked;
5173  }
5174 
5175  /*
5176  * Also check Xmin: if this tuple was created by an aborted
5177  * (sub)transaction, then we already locked the last live one in the
5178  * chain, thus we're done, so return success.
5179  */
5180  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5181  {
5182  result = TM_Ok;
5183  goto out_locked;
5184  }
5185 
5186  old_infomask = mytup.t_data->t_infomask;
5187  old_infomask2 = mytup.t_data->t_infomask2;
5188  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5189 
5190  /*
5191  * If this tuple version has been updated or locked by some concurrent
5192  * transaction(s), what we do depends on whether our lock mode
5193  * conflicts with what those other transactions hold, and also on the
5194  * status of them.
5195  */
5196  if (!(old_infomask & HEAP_XMAX_INVALID))
5197  {
5198  TransactionId rawxmax;
5199  bool needwait;
5200 
5201  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5202  if (old_infomask & HEAP_XMAX_IS_MULTI)
5203  {
5204  int nmembers;
5205  int i;
5206  MultiXactMember *members;
5207 
5208  /*
5209  * We don't need a test for pg_upgrade'd tuples: this is only
5210  * applied to tuples after the first in an update chain. Said
5211  * first tuple in the chain may well be locked-in-9.2-and-
5212  * pg_upgraded, but that one was already locked by our caller,
5213  * not us; and any subsequent ones cannot be because our
5214  * caller must necessarily have obtained a snapshot later than
5215  * the pg_upgrade itself.
5216  */
5217  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5218 
5219  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5220  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5221  for (i = 0; i < nmembers; i++)
5222  {
5223  result = test_lockmode_for_conflict(members[i].status,
5224  members[i].xid,
5225  mode,
5226  &mytup,
5227  &needwait);
5228 
5229  /*
5230  * If the tuple was already locked by ourselves in a
5231  * previous iteration of this (say heap_lock_tuple was
5232  * forced to restart the locking loop because of a change
5233  * in xmax), then we hold the lock already on this tuple
5234  * version and we don't need to do anything; and this is
5235  * not an error condition either. We just need to skip
5236  * this tuple and continue locking the next version in the
5237  * update chain.
5238  */
5239  if (result == TM_SelfModified)
5240  {
5241  pfree(members);
5242  goto next;
5243  }
5244 
5245  if (needwait)
5246  {
5247  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5248  XactLockTableWait(members[i].xid, rel,
5249  &mytup.t_self,
5250  XLTW_LockUpdated);
5251  pfree(members);
5252  goto l4;
5253  }
5254  if (result != TM_Ok)
5255  {
5256  pfree(members);
5257  goto out_locked;
5258  }
5259  }
5260  if (members)
5261  pfree(members);
5262  }
5263  else
5264  {
5265  MultiXactStatus status;
5266 
5267  /*
5268  * For a non-multi Xmax, we first need to compute the
5269  * corresponding MultiXactStatus by using the infomask bits.
5270  */
5271  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5272  {
5273  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5274  status = MultiXactStatusForKeyShare;
5275  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5276  status = MultiXactStatusForShare;
5277  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5278  {
5279  if (old_infomask2 & HEAP_KEYS_UPDATED)
5280  status = MultiXactStatusForUpdate;
5281  else
5282  status = MultiXactStatusForNoKeyUpdate;
5283  }
5284  else
5285  {
5286  /*
5287  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5288  * as share-locked in the old cluster) shouldn't be
5289  * seen in the middle of an update chain.
5290  */
5291  elog(ERROR, "invalid lock status in tuple");
5292  }
5293  }
5294  else
5295  {
5296  /* it's an update, but which kind? */
5297  if (old_infomask2 & HEAP_KEYS_UPDATED)
5298  status = MultiXactStatusUpdate;
5299  else
5300  status = MultiXactStatusNoKeyUpdate;
5301  }
5302 
5303  result = test_lockmode_for_conflict(status, rawxmax, mode,
5304  &mytup, &needwait);
5305 
5306  /*
5307  * If the tuple was already locked by ourselves in a previous
5308  * iteration of this (say heap_lock_tuple was forced to
5309  * restart the locking loop because of a change in xmax), then
5310  * we hold the lock already on this tuple version and we don't
5311  * need to do anything; and this is not an error condition
5312  * either. We just need to skip this tuple and continue
5313  * locking the next version in the update chain.
5314  */
5315  if (result == TM_SelfModified)
5316  goto next;
5317 
5318  if (needwait)
5319  {
5320  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5321  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5322  XLTW_LockUpdated);
5323  goto l4;
5324  }
5325  if (result != TM_Ok)
5326  {
5327  goto out_locked;
5328  }
5329  }
5330  }
5331 
5332  /* compute the new Xmax and infomask values for the tuple ... */
5333  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5334  xid, mode, false,
5335  &new_xmax, &new_infomask, &new_infomask2);
5336 
5337  if (PageIsAllVisible(BufferGetPage(buf)) &&
5338  visibilitymap_clear(rel, block, vmbuffer,
5339  VISIBILITYMAP_ALL_FROZEN))
5340  cleared_all_frozen = true;
5341 
5343 
5344  /* ... and set them */
5345  HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5346  mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5347  mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5348  mytup.t_data->t_infomask |= new_infomask;
5349  mytup.t_data->t_infomask2 |= new_infomask2;
5350 
5351  MarkBufferDirty(buf);
5352 
5353  /* XLOG stuff */
5354  if (RelationNeedsWAL(rel))
5355  {
5356  xl_heap_lock_updated xlrec;
5357  XLogRecPtr recptr;
5358  Page page = BufferGetPage(buf);
5359 
5360  XLogBeginInsert();
5361  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5362 
5363  xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5364  xlrec.xmax = new_xmax;
5365  xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5366  xlrec.flags =
5367  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5368 
5369  XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5370 
5371  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5372 
5373  PageSetLSN(page, recptr);
5374  }
5375 
5376  END_CRIT_SECTION();
5377 
5378 next:
5379  /* if we find the end of update chain, we're done. */
5380  if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5381  HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5382  ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5383  HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5384  {
5385  result = TM_Ok;
5386  goto out_locked;
5387  }
5388 
5389  /* tail recursion */
5390  priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5391  ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5392  UnlockReleaseBuffer(buf);
5393  }
5394 
5395  result = TM_Ok;
5396 
5397 out_locked:
5398  UnlockReleaseBuffer(buf);
5399 
5400 out_unlocked:
5401  if (vmbuffer != InvalidBuffer)
5402  ReleaseBuffer(vmbuffer);
5403 
5404  return result;
5405 }
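/*
 * Editor's illustrative sketch (not part of heapam.c): the end-of-chain test
 * performed at the "next:" label above, factored into a helper for clarity.
 * The helper name is invented for this sketch; the macros and functions it
 * uses are the real ones from access/htup_details.h, storage/itemptr.h and
 * access/heapam.h.  The chain ends when no updater is recorded, when the row
 * was moved to another partition, when t_ctid points back at the tuple
 * itself, or when xmax is merely a locker rather than an updater.
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "storage/itemptr.h"

static inline bool
update_chain_continues(HeapTupleHeader htup, ItemPointer self)
{
	if (htup->t_infomask & HEAP_XMAX_INVALID)
		return false;			/* xmax invalid: nobody updated this version */
	if (HeapTupleHeaderIndicatesMovedPartitions(htup))
		return false;			/* effectively a delete; no next version here */
	if (ItemPointerEquals(self, &htup->t_ctid))
		return false;			/* t_ctid points at itself: latest version */
	if (HeapTupleHeaderIsOnlyLocked(htup))
		return false;			/* xmax is only a locker, not an updater */
	return true;				/* follow t_ctid to the next version */
}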
5406 
5407 /*
5408  * heap_lock_updated_tuple
5409  * Follow update chain when locking an updated tuple, acquiring locks (row
5410  * marks) on the updated versions.
5411  *
5412  * The initial tuple is assumed to be already locked.
5413  *
5414  * This function doesn't check visibility, it just unconditionally marks the
5415  * tuple(s) as locked. If any tuple in the updated chain is being deleted
5416  * concurrently (or updated with the key being modified), sleep until the
5417  * transaction doing it is finished.
5418  *
5419  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5420  * when we have to wait for other transactions to release them, as opposed to
5421  * what heap_lock_tuple does. The reason is that having more than one
5422  * transaction walking the chain is probably uncommon enough that risk of
5423  * starvation is not likely: one of the preconditions for being here is that
5424  * the snapshot in use predates the update that created this tuple (because we
5425  * started at an earlier version of the tuple), but at the same time such a
5426  * transaction cannot be using repeatable read or serializable isolation
5427  * levels, because that would lead to a serializability failure.
5428  */
5429 static TM_Result
5430 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5431  TransactionId xid, LockTupleMode mode)
5432 {
5433  /*
5434  * If the tuple has not been updated, or has moved into another partition
5435  * (effectively a delete) stop here.
5436  */
5437  if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5438  !ItemPointerEquals(&tuple->t_self, ctid))
5439  {
5440  /*
5441  * If this is the first possibly-multixact-able operation in the
5442  * current transaction, set my per-backend OldestMemberMXactId
5443  * setting. We can be certain that the transaction will never become a
5444  * member of any older MultiXactIds than that. (We have to do this
5445  * even if we end up just using our own TransactionId below, since
5446  * some other backend could incorporate our XID into a MultiXact
5447  * immediately afterwards.)
5448  */
5449  MultiXactIdSetOldestMember();
5450 
5451  return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5452  }
5453 
5454  /* nothing to lock */
5455  return TM_Ok;
5456 }
5457 
5458 /*
5459  * heap_finish_speculative - mark speculative insertion as successful
5460  *
5461  * To successfully finish a speculative insertion we have to clear speculative
5462  * token from tuple. To do so the t_ctid field, which will contain a
5463  * speculative token value, is modified in place to point to the tuple itself,
5464  * which is characteristic of a newly inserted ordinary tuple.
5465  *
5466  * NB: It is not ok to commit without either finishing or aborting a
5467  * speculative insertion. We could treat speculative tuples of committed
5468  * transactions implicitly as completed, but then we would have to be prepared
5469  * to deal with speculative tokens on committed tuples. That wouldn't be
5470  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5471  * but clearing the token at completion isn't very expensive either.
5472  * An explicit confirmation WAL record also makes logical decoding simpler.
5473  */
5474 void
5475 heap_finish_speculative(Relation relation, ItemPointer tid)
5476 {
5477  Buffer buffer;
5478  Page page;
5479  OffsetNumber offnum;
5480  ItemId lp = NULL;
5481  HeapTupleHeader htup;
5482 
5483  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5484  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5485  page = (Page) BufferGetPage(buffer);
5486 
5487  offnum = ItemPointerGetOffsetNumber(tid);
5488  if (PageGetMaxOffsetNumber(page) >= offnum)
5489  lp = PageGetItemId(page, offnum);
5490 
5491  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5492  elog(ERROR, "invalid lp");
5493 
5494  htup = (HeapTupleHeader) PageGetItem(page, lp);
5495 
5496  /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5497  StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5498  "invalid speculative token constant");
5499 
5500  /* NO EREPORT(ERROR) from here till changes are logged */
5501  START_CRIT_SECTION();
5502 
5503  Assert(HeapTupleHeaderIsSpeculative(htup));
5504 
5505  MarkBufferDirty(buffer);
5506 
5507  /*
5508  * Replace the speculative insertion token with a real t_ctid, pointing to
5509  * itself like it does on regular tuples.
5510  */
5511  htup->t_ctid = *tid;
5512 
5513  /* XLOG stuff */
5514  if (RelationNeedsWAL(relation))
5515  {
5516  xl_heap_confirm xlrec;
5517  XLogRecPtr recptr;
5518 
5519  xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5520 
5521  XLogBeginInsert();
5522 
5523  /* We want the same filtering on this as on a plain insert */
5524  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5525 
5526  XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5527  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5528 
5529  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5530 
5531  PageSetLSN(page, recptr);
5532  }
5533 
5534  END_CRIT_SECTION();
5535 
5536  UnlockReleaseBuffer(buffer);
5537 }
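/*
 * Editor's illustrative sketch (not part of heapam.c): until
 * heap_finish_speculative() overwrites t_ctid with the tuple's own TID, the
 * t_ctid field carries the speculative token.  A backend holding the
 * appropriate buffer lock could distinguish the two states roughly as below;
 * the function name is invented for this sketch, while the macros are the
 * real ones from access/htup_details.h.
 */
#include "postgres.h"

#include "access/htup_details.h"

static bool
tuple_is_pending_speculative(HeapTupleHeader htup, uint32 *token)
{
	if (HeapTupleHeaderIsSpeculative(htup))
	{
		/* t_ctid still carries the token: insertion not yet confirmed */
		*token = HeapTupleHeaderGetSpeculativeToken(htup);
		return true;
	}

	/* heap_finish_speculative() already made t_ctid point at the tuple */
	return false;
}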
5538 
5539 /*
5540  * heap_abort_speculative - kill a speculatively inserted tuple
5541  *
5542  * Marks a tuple that was speculatively inserted in the same command as dead,
5543  * by setting its xmin as invalid. That makes it immediately appear as dead
5544  * to all transactions, including our own. In particular, it makes
5545  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5546  * inserting a duplicate key value won't unnecessarily wait for our whole
5547  * transaction to finish (it'll just wait for our speculative insertion to
5548  * finish).
5549  *
5550  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5551  * that arise due to a mutual dependency that is not user visible. By
5552  * definition, unprincipled deadlocks cannot be prevented by the user
5553  * reordering lock acquisition in client code, because the implementation level
5554  * lock acquisitions are not under the user's direct control. If speculative
5555  * inserters did not take this precaution, then under high concurrency they
5556  * could deadlock with each other, which would not be acceptable.
5557  *
5558  * This is somewhat redundant with heap_delete, but we prefer to have a
5559  * dedicated routine with stripped down requirements. Note that this is also
5560  * used to delete the TOAST tuples created during speculative insertion.
5561  *
5562  * This routine does not affect logical decoding as it only looks at
5563  * confirmation records.
5564  */
5565 void
5566 heap_abort_speculative(Relation relation, ItemPointer tid)
5567 {
5568  TransactionId xid = GetCurrentTransactionId();
5569  ItemId lp;
5570  HeapTupleData tp;
5571  Page page;
5572  BlockNumber block;
5573  Buffer buffer;
5574 
5575  Assert(ItemPointerIsValid(tid));
5576 
5577  block = ItemPointerGetBlockNumber(tid);
5578  buffer = ReadBuffer(relation, block);
5579  page = BufferGetPage(buffer);
5580 
5581  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5582 
5583  /*
5584  * Page can't be all visible, we just inserted into it, and are still
5585  * running.
5586  */
5587  Assert(!PageIsAllVisible(page));
5588 
5589  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5590  Assert(ItemIdIsNormal(lp));
5591 
5592  tp.t_tableOid = RelationGetRelid(relation);
5593  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5594  tp.t_len = ItemIdGetLength(lp);
5595  tp.t_self = *tid;
5596 
5597  /*
5598  * Sanity check that the tuple really is a speculatively inserted tuple,
5599  * inserted by us.
5600  */
5601  if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5602  elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5603  if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5604  elog(ERROR, "attempted to kill a non-speculative tuple");
5605  Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5606 
5607  /*
5608  * No need to check for serializable conflicts here. There is never a
5609  * need for a combocid, either. No need to extract replica identity, or
5610  * do anything special with infomask bits.
5611  */
5612 
5613  START_CRIT_SECTION();
5614 
5615  /*
5616  * The tuple will become DEAD immediately. Flag that this page
5617  * immediately is a candidate for pruning by setting xmin to
5618  * RecentGlobalXmin. That's not pretty, but it doesn't seem worth
5619  * inventing a nicer API for this.
5620  */
5621  Assert(TransactionIdIsValid(RecentGlobalXmin));
5622  PageSetPrunable(page, RecentGlobalXmin);
5623 
5624  /* store transaction information of xact deleting the tuple */
5625  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5626  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5627 
5628  /*
5629  * Set the tuple header xmin to InvalidTransactionId. This makes the
5630  * tuple immediately invisible to everyone. (In particular, to any
5631  * transactions waiting on the speculative token, woken up later.)
5632  */
5633  HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5634 
5635  /* Clear the speculative insertion token too */
5636  tp.t_data->t_ctid = tp.t_self;
5637 
5638  MarkBufferDirty(buffer);
5639 
5640  /*
5641  * XLOG stuff
5642  *
5643  * The WAL records generated here match heap_delete(). The same recovery
5644  * routines are used.
5645  */
5646  if (RelationNeedsWAL(relation))
5647  {
5648  xl_heap_delete xlrec;
5649  XLogRecPtr recptr;
5650 
5651  xlrec.flags = XLH_DELETE_IS_SUPER;
5652  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5653  tp.t_data->t_infomask2);
5654  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5655  xlrec.xmax = xid;
5656 
5657  XLogBeginInsert();
5658  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5659  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5660 
5661  /* No replica identity & replication origin logged */
5662 
5663  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5664 
5665  PageSetLSN(page, recptr);
5666  }
5667 
5668  END_CRIT_SECTION();
5669 
5670  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5671 
5672  if (HeapTupleHasExternal(&tp))
5673  {
5674  Assert(!IsToastRelation(relation));
5675  toast_delete(relation, &tp, true);
5676  }
5677 
5678  /*
5679  * Never need to mark tuple for invalidation, since catalogs don't support
5680  * speculative insertion
5681  */
5682 
5683  /* Now we can release the buffer */
5684  ReleaseBuffer(buffer);
5685 
5686  /* count deletion, as we counted the insertion too */
5687  pgstat_count_heap_delete(relation);
5688 }
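/*
 * Editor's illustrative sketch (not part of heapam.c): the caller-side
 * protocol around heap_finish_speculative()/heap_abort_speculative(),
 * modelled loosely on how the executor and the heap table AM drive
 * INSERT ... ON CONFLICT.  Slot management, error handling and the actual
 * index insertions are omitted; index_insert_ok and the function name are
 * stand-ins invented for this sketch.
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/xact.h"
#include "storage/lmgr.h"

static void
speculative_insert_sketch(Relation rel, HeapTuple tup, bool index_insert_ok)
{
	TransactionId xid = GetCurrentTransactionId();
	uint32		specToken;

	/* Take the speculative-insertion lock and stamp its token into t_ctid */
	specToken = SpeculativeInsertionLockAcquire(xid);
	HeapTupleHeaderSetSpeculativeToken(tup->t_data, specToken);

	/* Insert the tuple, marked as speculative */
	heap_insert(rel, tup, GetCurrentCommandId(true),
				HEAP_INSERT_SPECULATIVE, NULL);

	/* ... the caller now tries the index insertions; on conflict it backs out ... */
	if (index_insert_ok)
		heap_finish_speculative(rel, &tup->t_self);	/* replace token with real ctid */
	else
		heap_abort_speculative(rel, &tup->t_self);	/* mark the tuple dead */

	/* Wake up anyone who was sleeping on the speculative token */
	SpeculativeInsertionLockRelease(xid);
}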
5689 
5690 /*
5691  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5692  *
5693  * Overwriting violates both MVCC and transactional safety, so the uses
5694  * of this function in Postgres are extremely limited. Nonetheless we
5695  * find some places to use it.
5696  *
5697  * The tuple cannot change size, and therefore it's reasonable to assume
5698  * that its null bitmap (if any) doesn't change either. So we just
5699  * overwrite the data portion of the tuple without touching the null
5700  * bitmap or any of the header fields.
5701  *
5702  * tuple is an in-memory tuple structure containing the data to be written
5703  * over the target tuple. Also, tuple->t_self identifies the target tuple.
5704  */
5705 void
5706 heap_inplace_update(Relation relation, HeapTuple tuple)
5707 {
5708  Buffer buffer;
5709  Page page;
5710  OffsetNumber offnum;
5711  ItemId lp = NULL;
5712  HeapTupleHeader htup;
5713  uint32 oldlen;
5714  uint32 newlen;
5715 
5716  /*
5717  * For now, parallel operations are required to be strictly read-only.
5718  * Unlike a regular update, this should never create a combo CID, so it
5719  * might be possible to relax this restriction, but not without more
5720  * thought and testing. It's not clear that it would be useful, anyway.
5721  */
5722  if (IsInParallelMode())
5723  ereport(ERROR,
5724  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5725  errmsg("cannot update tuples during a parallel operation")));
5726 
5727  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
5728  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5729  page = (Page) BufferGetPage(buffer);
5730 
5731  offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
5732  if (PageGetMaxOffsetNumber(page) >= offnum)
5733  lp = PageGetItemId(page, offnum);
5734 
5735  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5736  elog(ERROR, "invalid lp");
5737 
5738  htup = (HeapTupleHeader) PageGetItem(page, lp);
5739 
5740  oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5741  newlen = tuple->t_len - tuple->t_data->t_hoff;
5742  if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5743  elog(ERROR, "wrong tuple length");
5744 
5745  /* NO EREPORT(ERROR) from here till changes are logged */
5746  START_CRIT_SECTION();
5747 
5748  memcpy((char *) htup + htup->t_hoff,
5749  (char *) tuple->t_data + tuple->t_data->t_hoff,
5750  newlen);
5751 
5752  MarkBufferDirty(buffer);
5753 
5754  /* XLOG stuff */
5755  if (RelationNeedsWAL(relation))
5756  {
5757  xl_heap_inplace xlrec;
5758  XLogRecPtr recptr;
5759 
5760  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5761 
5762  XLogBeginInsert();
5763  XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5764 
5765  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5766  XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5767 
5768  /* inplace updates aren't decoded atm, don't log the origin */
5769 
5770  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5771 
5772  PageSetLSN(page, recptr);
5773  }
5774 
5775  END_CRIT_SECTION();
5776 
5777  UnlockReleaseBuffer(buffer);
5778 
5779  /*
5780  * Send out shared cache inval if necessary. Note that because we only
5781  * pass the new version of the tuple, this mustn't be used for any
5782  * operations that could change catcache lookup keys. But we aren't
5783  * bothering with index updates either, so that's true a fortiori.
5784  */
5785  if (!IsBootstrapProcessingMode())
5786  CacheInvalidateHeapTuple(relation, tuple, NULL);
5787 }
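/*
 * Editor's illustrative sketch (not part of heapam.c): the typical caller
 * pattern for heap_inplace_update(), modelled loosely on how VACUUM
 * refreshes pg_class statistics (see vac_update_relstats in
 * src/backend/commands/vacuum.c).  Only fixed-width fields of a copied
 * catalog tuple are modified, so the tuple cannot change size; the function
 * name and the relid/num_pages/num_tuples inputs are stand-ins for this
 * sketch.
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/table.h"
#include "catalog/pg_class.h"
#include "storage/lockdefs.h"
#include "utils/syscache.h"

static void
inplace_update_relstats_sketch(Oid relid, int32 num_pages, float4 num_tuples)
{
	Relation	rd;
	HeapTuple	ctup;
	Form_pg_class pgcform;

	rd = table_open(RelationRelationId, RowExclusiveLock);

	/* Fetch a copy of the pg_class tuple we intend to overwrite */
	ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished", relid);
	pgcform = (Form_pg_class) GETSTRUCT(ctup);

	/* Overwrite fixed-width fields only; the tuple must keep its length */
	pgcform->relpages = num_pages;
	pgcform->reltuples = num_tuples;

	/* Write the modified copy back over the original, non-transactionally */
	heap_inplace_update(rd, ctup);

	heap_freetuple(ctup);
	table_close(rd, RowExclusiveLock);
}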
5788 
5789 #define FRM_NOOP 0x0001
5790 #define FRM_INVALIDATE_XMAX 0x0002
5791 #define FRM_RETURN_IS_XID 0x0004
5792 #define FRM_RETURN_IS_MULTI 0x0008
5793 #define FRM_MARK_COMMITTED 0x0010
5794 
5795 /*
5796  * FreezeMultiXactId
5797  * Determine what to do during freezing when a tuple is marked by a
5798  * MultiXactId.
5799  *