heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * heap_beginscan - begin relation scan
16  * heap_rescan - restart a relation scan
17  * heap_endscan - end relation scan
18  * heap_getnext - retrieve next tuple in scan
19  * heap_fetch - retrieve tuple with given tid
20  * heap_insert - insert tuple into a relation
21  * heap_multi_insert - insert multiple tuples into a relation
22  * heap_delete - delete a tuple from a relation
23  * heap_update - replace a tuple in a relation with another tuple
24  * heap_sync - sync heap, for when no WAL has been written
25  *
26  * NOTES
27  * This file contains the heap_ routines which implement
28  * the POSTGRES heap access method used for all POSTGRES
29  * relations.
30  *
31  *-------------------------------------------------------------------------
32  */
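/*
 * [Editorial example] A minimal sketch of how the interface routines listed
 * above are typically driven by a caller; illustrative only and not part of
 * the original file. It assumes "rel" has already been opened and locked
 * with table_open() and that an MVCC snapshot is active.
 *
 *		TableScanDesc sscan;
 *		HeapTuple	tuple;
 *		uint32		flags = SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
 *							SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
 *
 *		sscan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL, NULL, flags);
 *		while ((tuple = heap_getnext(sscan, ForwardScanDirection)) != NULL)
 *		{
 *			(process tuple; it remains valid only until the next fetch)
 *		}
 *		heap_endscan(sscan);
 */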
33 #include "postgres.h"
34 
35 #include "access/bufmask.h"
36 #include "access/genam.h"
37 #include "access/heapam.h"
38 #include "access/heapam_xlog.h"
39 #include "access/heaptoast.h"
40 #include "access/hio.h"
41 #include "access/multixact.h"
42 #include "access/parallel.h"
43 #include "access/relscan.h"
44 #include "access/sysattr.h"
45 #include "access/tableam.h"
46 #include "access/transam.h"
47 #include "access/valid.h"
48 #include "access/visibilitymap.h"
49 #include "access/xact.h"
50 #include "access/xlog.h"
51 #include "access/xloginsert.h"
52 #include "access/xlogutils.h"
53 #include "catalog/catalog.h"
54 #include "miscadmin.h"
55 #include "pgstat.h"
56 #include "port/atomics.h"
57 #include "storage/bufmgr.h"
58 #include "storage/freespace.h"
59 #include "storage/lmgr.h"
60 #include "storage/predicate.h"
61 #include "storage/procarray.h"
62 #include "storage/smgr.h"
63 #include "storage/spin.h"
64 #include "storage/standby.h"
65 #include "utils/datum.h"
66 #include "utils/inval.h"
67 #include "utils/lsyscache.h"
68 #include "utils/relcache.h"
69 #include "utils/snapmgr.h"
70 #include "utils/spccache.h"
71 
72 
73 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
74  TransactionId xid, CommandId cid, int options);
75 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
76  Buffer newbuf, HeapTuple oldtup,
77  HeapTuple newtup, HeapTuple old_key_tuple,
78  bool all_visible_cleared, bool new_all_visible_cleared);
79 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
80  Bitmapset *interesting_cols,
81  HeapTuple oldtup, HeapTuple newtup);
82 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
83  LockTupleMode mode, LockWaitPolicy wait_policy,
84  bool *have_tuple_lock);
85 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
86  uint16 old_infomask2, TransactionId add_to_xmax,
87  LockTupleMode mode, bool is_update,
88  TransactionId *result_xmax, uint16 *result_infomask,
89  uint16 *result_infomask2);
90 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
91  ItemPointer ctid, TransactionId xid,
92  LockTupleMode mode);
93 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
94  uint16 *new_infomask2);
95 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
96  uint16 t_infomask);
97 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
98  LockTupleMode lockmode, bool *current_is_member);
99 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
100  Relation rel, ItemPointer ctid, XLTW_Oper oper,
101  int *remaining);
102 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
103  uint16 infomask, Relation rel, int *remaining);
104 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
105 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
106  bool *copy);
107 
108 
109 /*
110  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
111  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
112  * update them). This table (and the macros below) helps us determine the
113  * heavyweight lock mode and MultiXactStatus values to use for any particular
114  * tuple lock strength.
115  *
116  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117  * instead.
118  */
119 static const struct
120 {
121  LOCKMODE hwlock;
122  int lockstatus;
123  int updstatus;
124 }
125 
126  tupleLockExtraInfo[MaxLockTupleMode + 1] =
127 {
128  { /* LockTupleKeyShare */
129  AccessShareLock,
130  MultiXactStatusForKeyShare,
131  -1 /* KeyShare does not allow updating tuples */
132  },
133  { /* LockTupleShare */
134  RowShareLock,
135  MultiXactStatusForShare,
136  -1 /* Share does not allow updating tuples */
137  },
138  { /* LockTupleNoKeyExclusive */
139  ExclusiveLock,
140  MultiXactStatusForNoKeyUpdate,
141  MultiXactStatusNoKeyUpdate
142  },
143  { /* LockTupleExclusive */
144  AccessExclusiveLock,
145  MultiXactStatusForUpdate,
146  MultiXactStatusUpdate
147  }
148 };
149 
150 /* Get the LOCKMODE for a given MultiXactStatus */
151 #define LOCKMODE_from_mxstatus(status) \
152  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153 
154 /*
155  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156  * This is more readable than having every caller translate it to lock.h's
157  * LOCKMODE.
158  */
159 #define LockTupleTuplock(rel, tup, mode) \
160  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161 #define UnlockTupleTuplock(rel, tup, mode) \
162  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163 #define ConditionalLockTupleTuplock(rel, tup, mode) \
164  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165 
166 #ifdef USE_PREFETCH
167 /*
168  * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
169  * this structure to coordinate prefetching activity.
170  */
171 typedef struct
172 {
173  BlockNumber cur_hblkno;
174  int next_item;
175  int nitems;
176  ItemPointerData *tids;
177 } XidHorizonPrefetchState;
178 #endif
179 
180 /*
181  * This table maps tuple lock strength values for each particular
182  * MultiXactStatus value.
183  */
184 static const LockTupleMode MultiXactStatusLock[MaxMultiXactStatus + 1] =
185 {
186  LockTupleKeyShare, /* ForKeyShare */
187  LockTupleShare, /* ForShare */
188  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
189  LockTupleExclusive, /* ForUpdate */
190  LockTupleNoKeyExclusive, /* NoKeyUpdate */
191  LockTupleExclusive /* Update */
192 };
193 
194 /* Get the LockTupleMode for a given MultiXactStatus */
195 #define TUPLOCK_from_mxstatus(status) \
196  (MultiXactStatusLock[(status)])
197 
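/*
 * [Editorial example] A hypothetical illustration of how the two mapping
 * tables above are meant to be consumed (not part of the original file);
 * "rel" is an open relation and "tid" a valid tuple TID.
 *
 *		LOCKMODE	hwlock = LOCKMODE_from_mxstatus(MultiXactStatusForShare);
 *		LockTupleMode mode = TUPLOCK_from_mxstatus(MultiXactStatusForShare);
 *
 *		LockTupleTuplock(rel, tid, mode);
 *		... work while holding the heavyweight tuple lock ...
 *		UnlockTupleTuplock(rel, tid, mode);
 */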
198 /* ----------------------------------------------------------------
199  * heap support routines
200  * ----------------------------------------------------------------
201  */
202 
203 /* ----------------
204  * initscan - scan code common to heap_beginscan and heap_rescan
205  * ----------------
206  */
207 static void
208 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
209 {
210  ParallelBlockTableScanDesc bpscan = NULL;
211  bool allow_strat;
212  bool allow_sync;
213 
214  /*
215  * Determine the number of blocks we have to scan.
216  *
217  * It is sufficient to do this once at scan start, since any tuples added
218  * while the scan is in progress will be invisible to my snapshot anyway.
219  * (That is not true when using a non-MVCC snapshot. However, we couldn't
220  * guarantee to return tuples added after scan start anyway, since they
221  * might go into pages we already scanned. To guarantee consistent
222  * results for a non-MVCC snapshot, the caller must hold some higher-level
223  * lock that ensures the interesting tuple(s) won't change.)
224  */
225  if (scan->rs_base.rs_parallel != NULL)
226  {
227  bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
228  scan->rs_nblocks = bpscan->phs_nblocks;
229  }
230  else
231  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
232 
233  /*
234  * If the table is large relative to NBuffers, use a bulk-read access
235  * strategy and enable synchronized scanning (see syncscan.c). Although
236  * the thresholds for these features could be different, we make them the
237  * same so that there are only two behaviors to tune rather than four.
238  * (However, some callers need to be able to disable one or both of these
239  * behaviors, independently of the size of the table; also there is a GUC
240  * variable that can disable synchronized scanning.)
241  *
242  * Note that table_block_parallelscan_initialize has a very similar test;
243  * if you change this, consider changing that one, too.
244  */
245  if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
246  scan->rs_nblocks > NBuffers / 4)
247  {
248  allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
249  allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
250  }
251  else
252  allow_strat = allow_sync = false;
253 
254  if (allow_strat)
255  {
256  /* During a rescan, keep the previous strategy object. */
257  if (scan->rs_strategy == NULL)
258  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
259  }
260  else
261  {
262  if (scan->rs_strategy != NULL)
263  FreeAccessStrategy(scan->rs_strategy);
264  scan->rs_strategy = NULL;
265  }
266 
267  if (scan->rs_base.rs_parallel != NULL)
268  {
269  /* For parallel scan, believe whatever ParallelTableScanDesc says. */
270  if (scan->rs_base.rs_parallel->phs_syncscan)
271  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
272  else
273  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
274  }
275  else if (keep_startblock)
276  {
277  /*
278  * When rescanning, we want to keep the previous startblock setting,
279  * so that rewinding a cursor doesn't generate surprising results.
280  * Reset the active syncscan setting, though.
281  */
282  if (allow_sync && synchronize_seqscans)
283  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
284  else
285  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
286  }
287  else if (allow_sync && synchronize_seqscans)
288  {
289  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
290  scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
291  }
292  else
293  {
294  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295  scan->rs_startblock = 0;
296  }
297 
298  scan->rs_numblocks = InvalidBlockNumber;
299  scan->rs_inited = false;
300  scan->rs_ctup.t_data = NULL;
301  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
302  scan->rs_cbuf = InvalidBuffer;
303  scan->rs_cblock = InvalidBlockNumber;
304 
305  /* page-at-a-time fields are always invalid when not rs_inited */
306 
307  /*
308  * copy the scan key, if appropriate
309  */
310  if (key != NULL)
311  memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
312 
313  /*
314  * Currently, we only have a stats counter for sequential heap scans (but
315  * e.g. for bitmap scans the underlying bitmap index scans will be counted,
316  * and for sample scans we update stats for tuple fetches).
317  */
318  if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
319  pgstat_count_heap_scan(scan->rs_base.rs_rd);
320 }
321 
322 /*
323  * heap_setscanlimits - restrict range of a heapscan
324  *
325  * startBlk is the page to start at
326  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
327  */
328 void
329 heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
330 {
331  HeapScanDesc scan = (HeapScanDesc) sscan;
332 
333  Assert(!scan->rs_inited); /* else too late to change */
334  /* else rs_startblock is significant */
335  Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
336 
337  /* Check startBlk is valid (but allow case of zero blocks...) */
338  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
339 
340  scan->rs_startblock = startBlk;
341  scan->rs_numblocks = numBlks;
342 }
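/*
 * [Editorial example] Illustrative, hypothetical caller of
 * heap_setscanlimits: the limits must be installed before the first tuple is
 * fetched, and only on a scan started without SO_ALLOW_SYNC.
 *
 *		TableScanDesc sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
 *											 SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE);
 *
 *		heap_setscanlimits(sscan, 42, 1);	(scan only block 42)
 */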
343 
344 /*
345  * heapgetpage - subroutine for heapgettup()
346  *
347  * This routine reads and pins the specified page of the relation.
348  * In page-at-a-time mode it performs additional work, namely determining
349  * which tuples on the page are visible.
350  */
351 void
352 heapgetpage(TableScanDesc sscan, BlockNumber page)
353 {
354  HeapScanDesc scan = (HeapScanDesc) sscan;
355  Buffer buffer;
356  Snapshot snapshot;
357  Page dp;
358  int lines;
359  int ntup;
360  OffsetNumber lineoff;
361  ItemId lpp;
362  bool all_visible;
363 
364  Assert(page < scan->rs_nblocks);
365 
366  /* release previous scan buffer, if any */
367  if (BufferIsValid(scan->rs_cbuf))
368  {
369  ReleaseBuffer(scan->rs_cbuf);
370  scan->rs_cbuf = InvalidBuffer;
371  }
372 
373  /*
374  * Be sure to check for interrupts at least once per page. Checks at
375  * higher code levels won't be able to stop a seqscan that encounters many
376  * pages' worth of consecutive dead tuples.
377  */
378  CHECK_FOR_INTERRUPTS();
379 
380  /* read page using selected strategy */
381  scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
382  RBM_NORMAL, scan->rs_strategy);
383  scan->rs_cblock = page;
384 
385  if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
386  return;
387 
388  buffer = scan->rs_cbuf;
389  snapshot = scan->rs_base.rs_snapshot;
390 
391  /*
392  * Prune and repair fragmentation for the whole page, if possible.
393  */
394  heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
395 
396  /*
397  * We must hold share lock on the buffer content while examining tuple
398  * visibility. Afterwards, however, the tuples we have found to be
399  * visible are guaranteed good as long as we hold the buffer pin.
400  */
401  LockBuffer(buffer, BUFFER_LOCK_SHARE);
402 
403  dp = BufferGetPage(buffer);
404  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
405  lines = PageGetMaxOffsetNumber(dp);
406  ntup = 0;
407 
408  /*
409  * If the all-visible flag indicates that all tuples on the page are
410  * visible to everyone, we can skip the per-tuple visibility tests.
411  *
412  * Note: In hot standby, a tuple that's already visible to all
413  * transactions in the master might still be invisible to a read-only
414  * transaction in the standby. We partly handle this problem by tracking
415  * the minimum xmin of visible tuples as the cut-off XID while marking a
416  * page all-visible on master and WAL log that along with the visibility
417  * map SET operation. In hot standby, we wait for (or abort) all
418  * transactions that potentially may not see one or more tuples on the
419  * page. That's how index-only scans work fine in hot standby. A crucial
420  * difference between index-only scans and heap scans is that the
421  * index-only scan completely relies on the visibility map whereas heap
422  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
423  * the page-level flag can be trusted in the same way, because it might
424  * get propagated somehow without being explicitly WAL-logged, e.g. via a
425  * full page write. Until we can prove that beyond doubt, let's check each
426  * tuple for visibility the hard way.
427  */
428  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
429 
430  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
431  lineoff <= lines;
432  lineoff++, lpp++)
433  {
434  if (ItemIdIsNormal(lpp))
435  {
436  HeapTupleData loctup;
437  bool valid;
438 
439  loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
440  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
441  loctup.t_len = ItemIdGetLength(lpp);
442  ItemPointerSet(&(loctup.t_self), page, lineoff);
443 
444  if (all_visible)
445  valid = true;
446  else
447  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
448 
449  CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
450  &loctup, buffer, snapshot);
451 
452  if (valid)
453  scan->rs_vistuples[ntup++] = lineoff;
454  }
455  }
456 
457  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
458 
459  Assert(ntup <= MaxHeapTuplesPerPage);
460  scan->rs_ntuples = ntup;
461 }
462 
463 /* ----------------
464  * heapgettup - fetch next heap tuple
465  *
466  * Initialize the scan if not already done; then advance to the next
467  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
468  * or set scan->rs_ctup.t_data = NULL if no more tuples.
469  *
470  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
471  * by scan->rs_ctup".
472  *
473  * Note: the reason nkeys/key are passed separately, even though they are
474  * kept in the scan descriptor, is that the caller may not want us to check
475  * the scankeys.
476  *
477  * Note: when we fall off the end of the scan in either direction, we
478  * reset rs_inited. This means that a further request with the same
479  * scan direction will restart the scan, which is a bit odd, but a
480  * request with the opposite scan direction will start a fresh scan
481  * in the proper direction. The latter is required behavior for cursors,
482  * while the former case is generally undefined behavior in Postgres
483  * so we don't care too much.
484  * ----------------
485  */
486 static void
487 heapgettup(HeapScanDesc scan,
488  ScanDirection dir,
489  int nkeys,
490  ScanKey key)
491 {
492  HeapTuple tuple = &(scan->rs_ctup);
493  Snapshot snapshot = scan->rs_base.rs_snapshot;
494  bool backward = ScanDirectionIsBackward(dir);
495  BlockNumber page;
496  bool finished;
497  Page dp;
498  int lines;
499  OffsetNumber lineoff;
500  int linesleft;
501  ItemId lpp;
502 
503  /*
504  * calculate next starting lineoff, given scan direction
505  */
506  if (ScanDirectionIsForward(dir))
507  {
508  if (!scan->rs_inited)
509  {
510  /*
511  * return null immediately if relation is empty
512  */
513  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
514  {
515  Assert(!BufferIsValid(scan->rs_cbuf));
516  tuple->t_data = NULL;
517  return;
518  }
519  if (scan->rs_base.rs_parallel != NULL)
520  {
521  ParallelBlockTableScanDesc pbscan =
522  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
523 
524  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525  pbscan);
526 
527  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528  pbscan);
529 
530  /* Other processes might have already finished the scan. */
531  if (page == InvalidBlockNumber)
532  {
533  Assert(!BufferIsValid(scan->rs_cbuf));
534  tuple->t_data = NULL;
535  return;
536  }
537  }
538  else
539  page = scan->rs_startblock; /* first page */
540  heapgetpage((TableScanDesc) scan, page);
541  lineoff = FirstOffsetNumber; /* first offnum */
542  scan->rs_inited = true;
543  }
544  else
545  {
546  /* continue from previously returned page/tuple */
547  page = scan->rs_cblock; /* current page */
548  lineoff = /* next offnum */
549  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
550  }
551 
552  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
553 
554  dp = BufferGetPage(scan->rs_cbuf);
555  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
556  lines = PageGetMaxOffsetNumber(dp);
557  /* page and lineoff now reference the physically next tid */
558 
559  linesleft = lines - lineoff + 1;
560  }
561  else if (backward)
562  {
563  /* backward parallel scan not supported */
564  Assert(scan->rs_base.rs_parallel == NULL);
565 
566  if (!scan->rs_inited)
567  {
568  /*
569  * return null immediately if relation is empty
570  */
571  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
572  {
573  Assert(!BufferIsValid(scan->rs_cbuf));
574  tuple->t_data = NULL;
575  return;
576  }
577 
578  /*
579  * Disable reporting to syncscan logic in a backwards scan; it's
580  * not very likely anyone else is doing the same thing at the same
581  * time, and much more likely that we'll just bollix things for
582  * forward scanners.
583  */
584  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
585  /* start from last page of the scan */
586  if (scan->rs_startblock > 0)
587  page = scan->rs_startblock - 1;
588  else
589  page = scan->rs_nblocks - 1;
590  heapgetpage((TableScanDesc) scan, page);
591  }
592  else
593  {
594  /* continue from previously returned page/tuple */
595  page = scan->rs_cblock; /* current page */
596  }
597 
598  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
599 
600  dp = BufferGetPage(scan->rs_cbuf);
601  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
602  lines = PageGetMaxOffsetNumber(dp);
603 
604  if (!scan->rs_inited)
605  {
606  lineoff = lines; /* final offnum */
607  scan->rs_inited = true;
608  }
609  else
610  {
611  lineoff = /* previous offnum */
612  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
613  }
614  /* page and lineoff now reference the physically previous tid */
615 
616  linesleft = lineoff;
617  }
618  else
619  {
620  /*
621  * ``no movement'' scan direction: refetch prior tuple
622  */
623  if (!scan->rs_inited)
624  {
625  Assert(!BufferIsValid(scan->rs_cbuf));
626  tuple->t_data = NULL;
627  return;
628  }
629 
630  page = ItemPointerGetBlockNumber(&(tuple->t_self));
631  if (page != scan->rs_cblock)
632  heapgetpage((TableScanDesc) scan, page);
633 
634  /* Since the tuple was previously fetched, needn't lock page here */
635  dp = BufferGetPage(scan->rs_cbuf);
636  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
637  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
638  lpp = PageGetItemId(dp, lineoff);
639  Assert(ItemIdIsNormal(lpp));
640 
641  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
642  tuple->t_len = ItemIdGetLength(lpp);
643 
644  return;
645  }
646 
647  /*
648  * advance the scan until we find a qualifying tuple or run out of stuff
649  * to scan
650  */
651  lpp = PageGetItemId(dp, lineoff);
652  for (;;)
653  {
654  while (linesleft > 0)
655  {
656  if (ItemIdIsNormal(lpp))
657  {
658  bool valid;
659 
660  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
661  tuple->t_len = ItemIdGetLength(lpp);
662  ItemPointerSet(&(tuple->t_self), page, lineoff);
663 
664  /*
665  * if current tuple qualifies, return it.
666  */
667  valid = HeapTupleSatisfiesVisibility(tuple,
668  snapshot,
669  scan->rs_cbuf);
670 
671  CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
672  tuple, scan->rs_cbuf,
673  snapshot);
674 
675  if (valid && key != NULL)
676  HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
677  nkeys, key, valid);
678 
679  if (valid)
680  {
681  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
682  return;
683  }
684  }
685 
686  /*
687  * otherwise move to the next item on the page
688  */
689  --linesleft;
690  if (backward)
691  {
692  --lpp; /* move back in this page's ItemId array */
693  --lineoff;
694  }
695  else
696  {
697  ++lpp; /* move forward in this page's ItemId array */
698  ++lineoff;
699  }
700  }
701 
702  /*
703  * if we get here, it means we've exhausted the items on this page and
704  * it's time to move to the next.
705  */
706  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
707 
708  /*
709  * advance to next/prior page and detect end of scan
710  */
711  if (backward)
712  {
713  finished = (page == scan->rs_startblock) ||
714  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
715  if (page == 0)
716  page = scan->rs_nblocks;
717  page--;
718  }
719  else if (scan->rs_base.rs_parallel != NULL)
720  {
721  ParallelBlockTableScanDesc pbscan =
722  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
723 
724  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
725  pbscan);
726  finished = (page == InvalidBlockNumber);
727  }
728  else
729  {
730  page++;
731  if (page >= scan->rs_nblocks)
732  page = 0;
733  finished = (page == scan->rs_startblock) ||
734  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
735 
736  /*
737  * Report our new scan position for synchronization purposes. We
738  * don't do that when moving backwards, however. That would just
739  * mess up any other forward-moving scanners.
740  *
741  * Note: we do this before checking for end of scan so that the
742  * final state of the position hint is back at the start of the
743  * rel. That's not strictly necessary, but otherwise when you run
744  * the same query multiple times the starting position would shift
745  * a little bit backwards on every invocation, which is confusing.
746  * We don't guarantee any specific ordering in general, though.
747  */
748  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
749  ss_report_location(scan->rs_base.rs_rd, page);
750  }
751 
752  /*
753  * return NULL if we've exhausted all the pages
754  */
755  if (finished)
756  {
757  if (BufferIsValid(scan->rs_cbuf))
758  ReleaseBuffer(scan->rs_cbuf);
759  scan->rs_cbuf = InvalidBuffer;
760  scan->rs_cblock = InvalidBlockNumber;
761  tuple->t_data = NULL;
762  scan->rs_inited = false;
763  return;
764  }
765 
766  heapgetpage((TableScanDesc) scan, page);
767 
768  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
769 
770  dp = BufferGetPage(scan->rs_cbuf);
771  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
772  lines = PageGetMaxOffsetNumber((Page) dp);
773  linesleft = lines;
774  if (backward)
775  {
776  lineoff = lines;
777  lpp = PageGetItemId(dp, lines);
778  }
779  else
780  {
781  lineoff = FirstOffsetNumber;
782  lpp = PageGetItemId(dp, FirstOffsetNumber);
783  }
784  }
785 }
786 
787 /* ----------------
788  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
789  *
790  * Same API as heapgettup, but used in page-at-a-time mode
791  *
792  * The internal logic is much the same as heapgettup's too, but there are some
793  * differences: we do not take the buffer content lock (that only needs to
794  * happen inside heapgetpage), and we iterate through just the tuples listed
795  * in rs_vistuples[] rather than all tuples on the page. Notice that
796  * lineindex is 0-based, where the corresponding loop variable lineoff in
797  * heapgettup is 1-based.
798  * ----------------
799  */
800 static void
801 heapgettup_pagemode(HeapScanDesc scan,
802  ScanDirection dir,
803  int nkeys,
804  ScanKey key)
805 {
806  HeapTuple tuple = &(scan->rs_ctup);
807  bool backward = ScanDirectionIsBackward(dir);
808  BlockNumber page;
809  bool finished;
810  Page dp;
811  int lines;
812  int lineindex;
813  OffsetNumber lineoff;
814  int linesleft;
815  ItemId lpp;
816 
817  /*
818  * calculate next starting lineindex, given scan direction
819  */
820  if (ScanDirectionIsForward(dir))
821  {
822  if (!scan->rs_inited)
823  {
824  /*
825  * return null immediately if relation is empty
826  */
827  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
828  {
829  Assert(!BufferIsValid(scan->rs_cbuf));
830  tuple->t_data = NULL;
831  return;
832  }
833  if (scan->rs_base.rs_parallel != NULL)
834  {
835  ParallelBlockTableScanDesc pbscan =
836  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
837 
838  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
839  pbscan);
840 
841  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
842  pbscan);
843 
844  /* Other processes might have already finished the scan. */
845  if (page == InvalidBlockNumber)
846  {
847  Assert(!BufferIsValid(scan->rs_cbuf));
848  tuple->t_data = NULL;
849  return;
850  }
851  }
852  else
853  page = scan->rs_startblock; /* first page */
854  heapgetpage((TableScanDesc) scan, page);
855  lineindex = 0;
856  scan->rs_inited = true;
857  }
858  else
859  {
860  /* continue from previously returned page/tuple */
861  page = scan->rs_cblock; /* current page */
862  lineindex = scan->rs_cindex + 1;
863  }
864 
865  dp = BufferGetPage(scan->rs_cbuf);
866  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
867  lines = scan->rs_ntuples;
868  /* page and lineindex now reference the next visible tid */
869 
870  linesleft = lines - lineindex;
871  }
872  else if (backward)
873  {
874  /* backward parallel scan not supported */
875  Assert(scan->rs_base.rs_parallel == NULL);
876 
877  if (!scan->rs_inited)
878  {
879  /*
880  * return null immediately if relation is empty
881  */
882  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
883  {
884  Assert(!BufferIsValid(scan->rs_cbuf));
885  tuple->t_data = NULL;
886  return;
887  }
888 
889  /*
890  * Disable reporting to syncscan logic in a backwards scan; it's
891  * not very likely anyone else is doing the same thing at the same
892  * time, and much more likely that we'll just bollix things for
893  * forward scanners.
894  */
895  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
896  /* start from last page of the scan */
897  if (scan->rs_startblock > 0)
898  page = scan->rs_startblock - 1;
899  else
900  page = scan->rs_nblocks - 1;
901  heapgetpage((TableScanDesc) scan, page);
902  }
903  else
904  {
905  /* continue from previously returned page/tuple */
906  page = scan->rs_cblock; /* current page */
907  }
908 
909  dp = BufferGetPage(scan->rs_cbuf);
910  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
911  lines = scan->rs_ntuples;
912 
913  if (!scan->rs_inited)
914  {
915  lineindex = lines - 1;
916  scan->rs_inited = true;
917  }
918  else
919  {
920  lineindex = scan->rs_cindex - 1;
921  }
922  /* page and lineindex now reference the previous visible tid */
923 
924  linesleft = lineindex + 1;
925  }
926  else
927  {
928  /*
929  * ``no movement'' scan direction: refetch prior tuple
930  */
931  if (!scan->rs_inited)
932  {
933  Assert(!BufferIsValid(scan->rs_cbuf));
934  tuple->t_data = NULL;
935  return;
936  }
937 
938  page = ItemPointerGetBlockNumber(&(tuple->t_self));
939  if (page != scan->rs_cblock)
940  heapgetpage((TableScanDesc) scan, page);
941 
942  /* Since the tuple was previously fetched, needn't lock page here */
943  dp = BufferGetPage(scan->rs_cbuf);
944  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
945  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
946  lpp = PageGetItemId(dp, lineoff);
947  Assert(ItemIdIsNormal(lpp));
948 
949  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
950  tuple->t_len = ItemIdGetLength(lpp);
951 
952  /* check that rs_cindex is in sync */
953  Assert(scan->rs_cindex < scan->rs_ntuples);
954  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
955 
956  return;
957  }
958 
959  /*
960  * advance the scan until we find a qualifying tuple or run out of stuff
961  * to scan
962  */
963  for (;;)
964  {
965  while (linesleft > 0)
966  {
967  lineoff = scan->rs_vistuples[lineindex];
968  lpp = PageGetItemId(dp, lineoff);
969  Assert(ItemIdIsNormal(lpp));
970 
971  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
972  tuple->t_len = ItemIdGetLength(lpp);
973  ItemPointerSet(&(tuple->t_self), page, lineoff);
974 
975  /*
976  * if current tuple qualifies, return it.
977  */
978  if (key != NULL)
979  {
980  bool valid;
981 
982  HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
983  nkeys, key, valid);
984  if (valid)
985  {
986  scan->rs_cindex = lineindex;
987  return;
988  }
989  }
990  else
991  {
992  scan->rs_cindex = lineindex;
993  return;
994  }
995 
996  /*
997  * otherwise move to the next item on the page
998  */
999  --linesleft;
1000  if (backward)
1001  --lineindex;
1002  else
1003  ++lineindex;
1004  }
1005 
1006  /*
1007  * if we get here, it means we've exhausted the items on this page and
1008  * it's time to move to the next.
1009  */
1010  if (backward)
1011  {
1012  finished = (page == scan->rs_startblock) ||
1013  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1014  if (page == 0)
1015  page = scan->rs_nblocks;
1016  page--;
1017  }
1018  else if (scan->rs_base.rs_parallel != NULL)
1019  {
1020  ParallelBlockTableScanDesc pbscan =
1021  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1022 
1023  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1024  pbscan);
1025  finished = (page == InvalidBlockNumber);
1026  }
1027  else
1028  {
1029  page++;
1030  if (page >= scan->rs_nblocks)
1031  page = 0;
1032  finished = (page == scan->rs_startblock) ||
1033  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1034 
1035  /*
1036  * Report our new scan position for synchronization purposes. We
1037  * don't do that when moving backwards, however. That would just
1038  * mess up any other forward-moving scanners.
1039  *
1040  * Note: we do this before checking for end of scan so that the
1041  * final state of the position hint is back at the start of the
1042  * rel. That's not strictly necessary, but otherwise when you run
1043  * the same query multiple times the starting position would shift
1044  * a little bit backwards on every invocation, which is confusing.
1045  * We don't guarantee any specific ordering in general, though.
1046  */
1047  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1048  ss_report_location(scan->rs_base.rs_rd, page);
1049  }
1050 
1051  /*
1052  * return NULL if we've exhausted all the pages
1053  */
1054  if (finished)
1055  {
1056  if (BufferIsValid(scan->rs_cbuf))
1057  ReleaseBuffer(scan->rs_cbuf);
1058  scan->rs_cbuf = InvalidBuffer;
1059  scan->rs_cblock = InvalidBlockNumber;
1060  tuple->t_data = NULL;
1061  scan->rs_inited = false;
1062  return;
1063  }
1064 
1065  heapgetpage((TableScanDesc) scan, page);
1066 
1067  dp = BufferGetPage(scan->rs_cbuf);
1068  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1069  lines = scan->rs_ntuples;
1070  linesleft = lines;
1071  if (backward)
1072  lineindex = lines - 1;
1073  else
1074  lineindex = 0;
1075  }
1076 }
1077 
1078 
1079 #if defined(DISABLE_COMPLEX_MACRO)
1080 /*
1081  * This is formatted so oddly so that the correspondence to the macro
1082  * definition in access/htup_details.h is maintained.
1083  */
1084 Datum
1085 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1086  bool *isnull)
1087 {
1088  return (
1089  (attnum) > 0 ?
1090  (
1091  (*(isnull) = false),
1092  HeapTupleNoNulls(tup) ?
1093  (
1094  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1095  (
1096  fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1097  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1098  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1099  )
1100  :
1101  nocachegetattr((tup), (attnum), (tupleDesc))
1102  )
1103  :
1104  (
1105  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1106  (
1107  (*(isnull) = true),
1108  (Datum) NULL
1109  )
1110  :
1111  (
1112  nocachegetattr((tup), (attnum), (tupleDesc))
1113  )
1114  )
1115  )
1116  :
1117  (
1118  (Datum) NULL
1119  )
1120  );
1121 }
1122 #endif /* defined(DISABLE_COMPLEX_MACRO) */
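/*
 * [Editorial example] Small usage sketch for fastgetattr (illustrative, not
 * part of the original file); "rel" is assumed to be the relation the tuple
 * was read from.
 *
 *		bool	isnull;
 *		Datum	d = fastgetattr(tuple, 1, RelationGetDescr(rel), &isnull);
 *
 *		if (!isnull)
 *			... convert d with the DatumGetXXX() macro matching the column type ...
 */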
1123 
1124 
1125 /* ----------------------------------------------------------------
1126  * heap access method interface
1127  * ----------------------------------------------------------------
1128  */
1129 
1130 
1131 TableScanDesc
1132 heap_beginscan(Relation relation, Snapshot snapshot,
1133  int nkeys, ScanKey key,
1134  ParallelTableScanDesc parallel_scan,
1135  uint32 flags)
1136 {
1137  HeapScanDesc scan;
1138 
1139  /*
1140  * increment relation ref count while scanning relation
1141  *
1142  * This is just to make really sure the relcache entry won't go away while
1143  * the scan has a pointer to it. Caller should be holding the rel open
1144  * anyway, so this is redundant in all normal scenarios...
1145  */
1146  RelationIncrementReferenceCount(relation);
1147 
1148  /*
1149  * allocate and initialize scan descriptor
1150  */
1151  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1152 
1153  scan->rs_base.rs_rd = relation;
1154  scan->rs_base.rs_snapshot = snapshot;
1155  scan->rs_base.rs_nkeys = nkeys;
1156  scan->rs_base.rs_flags = flags;
1157  scan->rs_base.rs_parallel = parallel_scan;
1158  scan->rs_strategy = NULL; /* set in initscan */
1159 
1160  /*
1161  * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1162  */
1163  if (!(snapshot && IsMVCCSnapshot(snapshot)))
1164  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1165 
1166  /*
1167  * For seqscan and sample scans in a serializable transaction, acquire a
1168  * predicate lock on the entire relation. This is required not only to
1169  * lock all the matching tuples, but also to conflict with new insertions
1170  * into the table. In an indexscan, we take page locks on the index pages
1171  * covering the range specified in the scan qual, but in a heap scan there
1172  * is nothing more fine-grained to lock. A bitmap scan is a different
1173  * story, there we have already scanned the index and locked the index
1174  * pages covering the predicate. But in that case we still have to lock
1175  * any matching heap tuples. For sample scan we could optimize the locking
1176  * to be at least page-level granularity, but we'd need to add per-tuple
1177  * locking for that.
1178  */
1179  if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1180  {
1181  /*
1182  * Ensure a missing snapshot is noticed reliably, even if the
1183  * isolation mode means predicate locking isn't performed (and
1184  * therefore the snapshot isn't used here).
1185  */
1186  Assert(snapshot);
1187  PredicateLockRelation(relation, snapshot);
1188  }
1189 
1190  /* we only need to set this up once */
1191  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1192 
1193  /*
1194  * we do this here instead of in initscan() because heap_rescan also calls
1195  * initscan() and we don't want to allocate memory again
1196  */
1197  if (nkeys > 0)
1198  scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1199  else
1200  scan->rs_base.rs_key = NULL;
1201 
1202  initscan(scan, key, false);
1203 
1204  return (TableScanDesc) scan;
1205 }
1206 
1207 void
1208 heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1209  bool allow_strat, bool allow_sync, bool allow_pagemode)
1210 {
1211  HeapScanDesc scan = (HeapScanDesc) sscan;
1212 
1213  if (set_params)
1214  {
1215  if (allow_strat)
1216  scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1217  else
1218  scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1219 
1220  if (allow_sync)
1221  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1222  else
1223  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1224 
1225  if (allow_pagemode && scan->rs_base.rs_snapshot &&
1226  IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1227  scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1228  else
1229  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1230  }
1231 
1232  /*
1233  * unpin scan buffers
1234  */
1235  if (BufferIsValid(scan->rs_cbuf))
1236  ReleaseBuffer(scan->rs_cbuf);
1237 
1238  /*
1239  * reinitialize scan descriptor
1240  */
1241  initscan(scan, key, true);
1242 }
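/*
 * [Editorial example] Hypothetical illustration: to rewind a scan (e.g. for a
 * cursor) while keeping its existing strategy/sync/pagemode settings, pass
 * set_params = false so the remaining flags are ignored:
 *
 *		heap_rescan(sscan, NULL, false, false, false, false);
 */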
1243 
1244 void
1245 heap_endscan(TableScanDesc sscan)
1246 {
1247  HeapScanDesc scan = (HeapScanDesc) sscan;
1248 
1249  /* Note: no locking manipulations needed */
1250 
1251  /*
1252  * unpin scan buffers
1253  */
1254  if (BufferIsValid(scan->rs_cbuf))
1255  ReleaseBuffer(scan->rs_cbuf);
1256 
1257  /*
1258  * decrement relation reference count and free scan descriptor storage
1259  */
1260  RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1261 
1262  if (scan->rs_base.rs_key)
1263  pfree(scan->rs_base.rs_key);
1264 
1265  if (scan->rs_strategy != NULL)
1266  FreeAccessStrategy(scan->rs_strategy);
1267 
1268  if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1269  UnregisterSnapshot(scan->rs_base.rs_snapshot);
1270 
1271  pfree(scan);
1272 }
1273 
1274 #ifdef HEAPDEBUGALL
1275 #define HEAPDEBUG_1 \
1276  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1277  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1278 #define HEAPDEBUG_2 \
1279  elog(DEBUG2, "heap_getnext returning EOS")
1280 #define HEAPDEBUG_3 \
1281  elog(DEBUG2, "heap_getnext returning tuple")
1282 #else
1283 #define HEAPDEBUG_1
1284 #define HEAPDEBUG_2
1285 #define HEAPDEBUG_3
1286 #endif /* !defined(HEAPDEBUGALL) */
1287 
1288 
1289 HeapTuple
1290 heap_getnext(TableScanDesc sscan, ScanDirection direction)
1291 {
1292  HeapScanDesc scan = (HeapScanDesc) sscan;
1293 
1294  /*
1295  * This is still widely used directly, without going through table AM, so
1296  * add a safety check. It's possible we should, at a later point,
1297  * downgrade this to an assert. The reason for checking the AM routine,
1298  * rather than the AM oid, is that this allows writing regression tests
1299  * that create another AM reusing the heap handler.
1300  */
1301  if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1302  ereport(ERROR,
1303  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1304  errmsg_internal("only heap AM is supported")));
1305 
1306  /* Note: no locking manipulations needed */
1307 
1308  HEAPDEBUG_1; /* heap_getnext( info ) */
1309 
1310  if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1311  heapgettup_pagemode(scan, direction,
1312  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1313  else
1314  heapgettup(scan, direction,
1315  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1316 
1317  if (scan->rs_ctup.t_data == NULL)
1318  {
1319  HEAPDEBUG_2; /* heap_getnext returning EOS */
1320  return NULL;
1321  }
1322 
1323  /*
1324  * if we get here it means we have a new current scan tuple, so point to
1325  * the proper return buffer and return the tuple.
1326  */
1327  HEAPDEBUG_3; /* heap_getnext returning tuple */
1328 
1329  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1330 
1331  return &scan->rs_ctup;
1332 }
1333 
1334 #ifdef HEAPAMSLOTDEBUGALL
1335 #define HEAPAMSLOTDEBUG_1 \
1336  elog(DEBUG2, "heapam_getnextslot([%s,nkeys=%d],dir=%d) called", \
1337  RelationGetRelationName(scan->rs_base.rs_rd), scan->rs_base.rs_nkeys, (int) direction)
1338 #define HEAPAMSLOTDEBUG_2 \
1339  elog(DEBUG2, "heapam_getnextslot returning EOS")
1340 #define HEAPAMSLOTDEBUG_3 \
1341  elog(DEBUG2, "heapam_getnextslot returning tuple")
1342 #else
1343 #define HEAPAMSLOTDEBUG_1
1344 #define HEAPAMSLOTDEBUG_2
1345 #define HEAPAMSLOTDEBUG_3
1346 #endif
1347 
1348 bool
1349 heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1350 {
1351  HeapScanDesc scan = (HeapScanDesc) sscan;
1352 
1353  /* Note: no locking manipulations needed */
1354 
1355  HEAPAMSLOTDEBUG_1; /* heap_getnextslot( info ) */
1356 
1357  if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1358  heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1359  else
1360  heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1361 
1362  if (scan->rs_ctup.t_data == NULL)
1363  {
1364  HEAPAMSLOTDEBUG_2; /* heap_getnextslot returning EOS */
1365  ExecClearTuple(slot);
1366  return false;
1367  }
1368 
1369  /*
1370  * if we get here it means we have a new current scan tuple, so point to
1371  * the proper return buffer and return the tuple.
1372  */
1373  HEAPAMSLOTDEBUG_3; /* heap_getnextslot returning tuple */
1374 
1375  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1376 
1377  ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1378  scan->rs_cbuf);
1379  return true;
1380 }
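/*
 * [Editorial example] An illustrative driver for heap_getnextslot (not part
 * of the original file). It assumes "rel" is already open; the slot is
 * created with the table AM's preferred slot type.
 *
 *		TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *		while (heap_getnextslot(sscan, ForwardScanDirection, slot))
 *		{
 *			(consume the slot; its contents are replaced by the next call)
 *		}
 *		ExecDropSingleTupleTableSlot(slot);
 */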
1381 
1382 /*
1383  * heap_fetch - retrieve tuple with given tid
1384  *
1385  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1386  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1387  * against the specified snapshot.
1388  *
1389  * If successful (tuple found and passes snapshot time qual), then *userbuf
1390  * is set to the buffer holding the tuple and true is returned. The caller
1391  * must unpin the buffer when done with the tuple.
1392  *
1393  * If the tuple is not found (ie, item number references a deleted slot),
1394  * then tuple->t_data is set to NULL and false is returned.
1395  *
1396  * If the tuple is found but fails the time qual check, then false is returned
1397  * but tuple->t_data is left pointing to the tuple.
1398  *
1399  * heap_fetch does not follow HOT chains: only the exact TID requested will
1400  * be fetched.
1401  *
1402  * It is somewhat inconsistent that we ereport() on invalid block number but
1403  * return false on invalid item number. There are a couple of reasons though.
1404  * One is that the caller can relatively easily check the block number for
1405  * validity, but cannot check the item number without reading the page
1406  * himself. Another is that when we are following a t_ctid link, we can be
1407  * reasonably confident that the page number is valid (since VACUUM shouldn't
1408  * truncate off the destination page without having killed the referencing
1409  * tuple first), but the item number might well not be good.
1410  */
1411 bool
1412 heap_fetch(Relation relation,
1413  Snapshot snapshot,
1414  HeapTuple tuple,
1415  Buffer *userbuf)
1416 {
1417  ItemPointer tid = &(tuple->t_self);
1418  ItemId lp;
1419  Buffer buffer;
1420  Page page;
1421  OffsetNumber offnum;
1422  bool valid;
1423 
1424  /*
1425  * Fetch and pin the appropriate page of the relation.
1426  */
1427  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1428 
1429  /*
1430  * Need share lock on buffer to examine tuple commit status.
1431  */
1432  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1433  page = BufferGetPage(buffer);
1434  TestForOldSnapshot(snapshot, relation, page);
1435 
1436  /*
1437  * We'd better check for out-of-range offnum in case of VACUUM since the
1438  * TID was obtained.
1439  */
1440  offnum = ItemPointerGetOffsetNumber(tid);
1441  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1442  {
1443  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1444  ReleaseBuffer(buffer);
1445  *userbuf = InvalidBuffer;
1446  tuple->t_data = NULL;
1447  return false;
1448  }
1449 
1450  /*
1451  * get the item line pointer corresponding to the requested tid
1452  */
1453  lp = PageGetItemId(page, offnum);
1454 
1455  /*
1456  * Must check for deleted tuple.
1457  */
1458  if (!ItemIdIsNormal(lp))
1459  {
1460  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1461  ReleaseBuffer(buffer);
1462  *userbuf = InvalidBuffer;
1463  tuple->t_data = NULL;
1464  return false;
1465  }
1466 
1467  /*
1468  * fill in *tuple fields
1469  */
1470  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1471  tuple->t_len = ItemIdGetLength(lp);
1472  tuple->t_tableOid = RelationGetRelid(relation);
1473 
1474  /*
1475  * check tuple visibility, then release lock
1476  */
1477  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1478 
1479  if (valid)
1480  PredicateLockTuple(relation, tuple, snapshot);
1481 
1482  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1483 
1484  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1485 
1486  if (valid)
1487  {
1488  /*
1489  * All checks passed, so return the tuple as valid. Caller is now
1490  * responsible for releasing the buffer.
1491  */
1492  *userbuf = buffer;
1493 
1494  return true;
1495  }
1496 
1497  /* Tuple failed time qual */
1498  ReleaseBuffer(buffer);
1499  *userbuf = InvalidBuffer;
1500 
1501  return false;
1502 }
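/*
 * [Editorial example] Minimal usage sketch for heap_fetch (illustrative).
 * The caller supplies the TID in tuple.t_self, e.g. one obtained from an
 * index, and must release the returned buffer when done.
 *
 *		HeapTupleData tuple;
 *		Buffer		userbuf;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(rel, snapshot, &tuple, &userbuf))
 *		{
 *			... examine tuple.t_data under the buffer pin ...
 *			ReleaseBuffer(userbuf);
 *		}
 */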
1503 
1504 /*
1505  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1506  *
1507  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1508  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1509  * for the first chain member satisfying the given snapshot. If one is
1510  * found, we update *tid to reference that tuple's offset number, and
1511  * return true. If no match, return false without modifying *tid.
1512  *
1513  * heapTuple is a caller-supplied buffer. When a match is found, we return
1514  * the tuple here, in addition to updating *tid. If no match is found, the
1515  * contents of this buffer on return are undefined.
1516  *
1517  * If all_dead is not NULL, we check non-visible tuples to see if they are
1518  * globally dead; *all_dead is set true if all members of the HOT chain
1519  * are vacuumable, false if not.
1520  *
1521  * Unlike heap_fetch, the caller must already have pin and (at least) share
1522  * lock on the buffer; it is still pinned/locked at exit. Also unlike
1523  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1524  */
1525 bool
1526 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1527  Snapshot snapshot, HeapTuple heapTuple,
1528  bool *all_dead, bool first_call)
1529 {
1530  Page dp = (Page) BufferGetPage(buffer);
1531  TransactionId prev_xmax = InvalidTransactionId;
1532  BlockNumber blkno;
1533  OffsetNumber offnum;
1534  bool at_chain_start;
1535  bool valid;
1536  bool skip;
1537 
1538  /* If this is not the first call, previous call returned a (live!) tuple */
1539  if (all_dead)
1540  *all_dead = first_call;
1541 
1542  blkno = ItemPointerGetBlockNumber(tid);
1543  offnum = ItemPointerGetOffsetNumber(tid);
1544  at_chain_start = first_call;
1545  skip = !first_call;
1546 
1547  Assert(TransactionIdIsValid(RecentGlobalXmin));
1548  Assert(BufferGetBlockNumber(buffer) == blkno);
1549 
1550  /* Scan through possible multiple members of HOT-chain */
1551  for (;;)
1552  {
1553  ItemId lp;
1554 
1555  /* check for bogus TID */
1556  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1557  break;
1558 
1559  lp = PageGetItemId(dp, offnum);
1560 
1561  /* check for unused, dead, or redirected items */
1562  if (!ItemIdIsNormal(lp))
1563  {
1564  /* We should only see a redirect at start of chain */
1565  if (ItemIdIsRedirected(lp) && at_chain_start)
1566  {
1567  /* Follow the redirect */
1568  offnum = ItemIdGetRedirect(lp);
1569  at_chain_start = false;
1570  continue;
1571  }
1572  /* else must be end of chain */
1573  break;
1574  }
1575 
1576  /*
1577  * Update heapTuple to point to the element of the HOT chain we're
1578  * currently investigating. Having t_self set correctly is important
1579  * because the SSI checks and the *Satisfies routine for historical
1580  * MVCC snapshots need the correct tid to decide about the visibility.
1581  */
1582  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1583  heapTuple->t_len = ItemIdGetLength(lp);
1584  heapTuple->t_tableOid = RelationGetRelid(relation);
1585  ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1586 
1587  /*
1588  * Shouldn't see a HEAP_ONLY tuple at chain start.
1589  */
1590  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1591  break;
1592 
1593  /*
1594  * The xmin should match the previous xmax value, else chain is
1595  * broken.
1596  */
1597  if (TransactionIdIsValid(prev_xmax) &&
1598  !TransactionIdEquals(prev_xmax,
1599  HeapTupleHeaderGetXmin(heapTuple->t_data)))
1600  break;
1601 
1602  /*
1603  * When first_call is true (and thus, skip is initially false) we'll
1604  * return the first tuple we find. But on later passes, heapTuple
1605  * will initially be pointing to the tuple we returned last time.
1606  * Returning it again would be incorrect (and would loop forever), so
1607  * we skip it and return the next match we find.
1608  */
1609  if (!skip)
1610  {
1611  /* If it's visible per the snapshot, we must return it */
1612  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1613  CheckForSerializableConflictOut(valid, relation, heapTuple,
1614  buffer, snapshot);
1615 
1616  if (valid)
1617  {
1618  ItemPointerSetOffsetNumber(tid, offnum);
1619  PredicateLockTuple(relation, heapTuple, snapshot);
1620  if (all_dead)
1621  *all_dead = false;
1622  return true;
1623  }
1624  }
1625  skip = false;
1626 
1627  /*
1628  * If we can't see it, maybe no one else can either. At caller
1629  * request, check whether all chain members are dead to all
1630  * transactions.
1631  *
1632  * Note: if you change the criterion here for what is "dead", fix the
1633  * planner's get_actual_variable_range() function to match.
1634  */
1635  if (all_dead && *all_dead &&
1636  !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
1637  *all_dead = false;
1638 
1639  /*
1640  * Check to see if HOT chain continues past this tuple; if so fetch
1641  * the next offnum and loop around.
1642  */
1643  if (HeapTupleIsHotUpdated(heapTuple))
1644  {
1645  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1646  blkno);
1647  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1648  at_chain_start = false;
1649  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1650  }
1651  else
1652  break; /* end of chain */
1653  }
1654 
1655  return false;
1656 }
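/*
 * [Editorial example] Hypothetical caller of heap_hot_search_buffer. The
 * caller must already hold a pin and at least share lock on the buffer that
 * contains the root of the HOT chain identified by "tid".
 *
 *		HeapTupleData heapTuple;
 *		bool		all_dead;
 *		bool		found;
 *
 *		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 *		found = heap_hot_search_buffer(&tid, rel, buffer, snapshot,
 *									   &heapTuple, &all_dead, true);
 *		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 */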
1657 
1658 /*
1659  * heap_get_latest_tid - get the latest tid of a specified tuple
1660  *
1661  * Actually, this gets the latest version that is visible according to the
1662  * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1663  * possibly uncommitted version.
1664  *
1665  * *tid is both an input and an output parameter: it is updated to
1666  * show the latest version of the row. Note that it will not be changed
1667  * if no version of the row passes the snapshot test.
1668  */
1669 void
1670 heap_get_latest_tid(TableScanDesc sscan,
1671  ItemPointer tid)
1672 {
1673  Relation relation = sscan->rs_rd;
1674  Snapshot snapshot = sscan->rs_snapshot;
1675  ItemPointerData ctid;
1676  TransactionId priorXmax;
1677 
1678  /*
1679  * table_get_latest_tid verified that the passed in tid is valid. Assume
1680  * that t_ctid links are valid however - there shouldn't be invalid ones
1681  * in the table.
1682  */
1683  Assert(ItemPointerIsValid(tid));
1684 
1685  /*
1686  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1687  * need to examine, and *tid is the TID we will return if ctid turns out
1688  * to be bogus.
1689  *
1690  * Note that we will loop until we reach the end of the t_ctid chain.
1691  * Depending on the snapshot passed, there might be at most one visible
1692  * version of the row, but we don't try to optimize for that.
1693  */
1694  ctid = *tid;
1695  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1696  for (;;)
1697  {
1698  Buffer buffer;
1699  Page page;
1700  OffsetNumber offnum;
1701  ItemId lp;
1702  HeapTupleData tp;
1703  bool valid;
1704 
1705  /*
1706  * Read, pin, and lock the page.
1707  */
1708  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1709  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1710  page = BufferGetPage(buffer);
1711  TestForOldSnapshot(snapshot, relation, page);
1712 
1713  /*
1714  * Check for bogus item number. This is not treated as an error
1715  * condition because it can happen while following a t_ctid link. We
1716  * just assume that the prior tid is OK and return it unchanged.
1717  */
1718  offnum = ItemPointerGetOffsetNumber(&ctid);
1719  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1720  {
1721  UnlockReleaseBuffer(buffer);
1722  break;
1723  }
1724  lp = PageGetItemId(page, offnum);
1725  if (!ItemIdIsNormal(lp))
1726  {
1727  UnlockReleaseBuffer(buffer);
1728  break;
1729  }
1730 
1731  /* OK to access the tuple */
1732  tp.t_self = ctid;
1733  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1734  tp.t_len = ItemIdGetLength(lp);
1735  tp.t_tableOid = RelationGetRelid(relation);
1736 
1737  /*
1738  * After following a t_ctid link, we might arrive at an unrelated
1739  * tuple. Check for XMIN match.
1740  */
1741  if (TransactionIdIsValid(priorXmax) &&
1742  !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1743  {
1744  UnlockReleaseBuffer(buffer);
1745  break;
1746  }
1747 
1748  /*
1749  * Check tuple visibility; if visible, set it as the new result
1750  * candidate.
1751  */
1752  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1753  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1754  if (valid)
1755  *tid = ctid;
1756 
1757  /*
1758  * If there's a valid t_ctid link, follow it, else we're done.
1759  */
1760  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1761  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1762  HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1763  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1764  {
1765  UnlockReleaseBuffer(buffer);
1766  break;
1767  }
1768 
1769  ctid = tp.t_data->t_ctid;
1770  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1771  UnlockReleaseBuffer(buffer);
1772  } /* end of loop */
1773 }
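/*
 * [Editorial example] Hypothetical illustration; callers normally reach this
 * through table_get_latest_tid(). The TID is updated in place when a newer
 * version visible to the scan's snapshot exists:
 *
 *		ItemPointerData tid = some_known_version_tid;
 *
 *		heap_get_latest_tid(sscan, &tid);
 *		(tid now names the latest visible version of that row)
 */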
1774 
1775 
1776 /*
1777  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1778  *
1779  * This is called after we have waited for the XMAX transaction to terminate.
1780  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1781  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1782  * hint bit if possible --- but beware that that may not yet be possible,
1783  * if the transaction committed asynchronously.
1784  *
1785  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1786  * even if it commits.
1787  *
1788  * Hence callers should look only at XMAX_INVALID.
1789  *
1790  * Note this is not allowed for tuples whose xmax is a multixact.
1791  */
1792 static void
1793 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1794 {
1795  Assert(TransactionIdIsValid(xid));
1796  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1797 
1798  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1799  {
1800  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1801  TransactionIdDidCommit(xid))
1802  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1803  xid);
1804  else
1805  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1806  InvalidTransactionId);
1807  }
1808 }
1809 
1810 
1811 /*
1812  * GetBulkInsertState - prepare status object for a bulk insert
1813  */
1814 BulkInsertState
1815 GetBulkInsertState(void)
1816 {
1817  BulkInsertState bistate;
1818 
1819  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1820  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1821  bistate->current_buf = InvalidBuffer;
1822  return bistate;
1823 }
1824 
1825 /*
1826  * FreeBulkInsertState - clean up after finishing a bulk insert
1827  */
1828 void
1829 FreeBulkInsertState(BulkInsertState bistate)
1830 {
1831  if (bistate->current_buf != InvalidBuffer)
1832  ReleaseBuffer(bistate->current_buf);
1833  FreeAccessStrategy(bistate->strategy);
1834  pfree(bistate);
1835 }
1836 
1837 /*
1838  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1839  */
1840 void
1841 ReleaseBulkInsertStatePin(BulkInsertState bistate)
1842 {
1843  if (bistate->current_buf != InvalidBuffer)
1844  ReleaseBuffer(bistate->current_buf);
1845  bistate->current_buf = InvalidBuffer;
1846 }
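/*
 * [Editorial example] A sketch of the bulk-insert pattern these helpers
 * support (illustrative, not part of the original file); "rel" and
 * "tuples[i]" are assumed to be prepared by the caller.
 *
 *		BulkInsertState bistate = GetBulkInsertState();
 *		CommandId	cid = GetCurrentCommandId(true);
 *
 *		for (i = 0; i < ntuples; i++)
 *			heap_insert(rel, tuples[i], cid, 0, bistate);
 *
 *		FreeBulkInsertState(bistate);
 */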
1847 
1848 
1849 /*
1850  * heap_insert - insert tuple into a heap
1851  *
1852  * The new tuple is stamped with current transaction ID and the specified
1853  * command ID.
1854  *
1855  * See table_tuple_insert for comments about most of the input flags, except
1856  * that this routine directly takes a tuple rather than a slot.
1857  *
1858  * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
1859  * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
1860  * implement table_tuple_insert_speculative().
1861  *
1862  * On return the header fields of *tup are updated to match the stored tuple;
1863  * in particular tup->t_self receives the actual TID where the tuple was
1864  * stored. But note that any toasting of fields within the tuple data is NOT
1865  * reflected into *tup.
1866  */
1867 void
1868 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1869  int options, BulkInsertState bistate)
1870 {
1871  TransactionId xid = GetCurrentTransactionId();
1872  HeapTuple heaptup;
1873  Buffer buffer;
1874  Buffer vmbuffer = InvalidBuffer;
1875  bool all_visible_cleared = false;
1876 
1877  /*
1878  * Fill in tuple header fields and toast the tuple if necessary.
1879  *
1880  * Note: below this point, heaptup is the data we actually intend to store
1881  * into the relation; tup is the caller's original untoasted data.
1882  */
1883  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1884 
1885  /*
1886  * Find buffer to insert this tuple into. If the page is all visible,
1887  * this will also pin the requisite visibility map page.
1888  */
1889  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1890  InvalidBuffer, options, bistate,
1891  &vmbuffer, NULL);
1892 
1893  /*
1894  * We're about to do the actual insert -- but check for conflict first, to
1895  * avoid possibly having to roll back work we've just done.
1896  *
1897  * This is safe without a recheck as long as there is no possibility of
1898  * another process scanning the page between this check and the insert
1899  * being visible to the scan (i.e., an exclusive buffer content lock is
1900  * continuously held from this point until the tuple insert is visible).
1901  *
1902  * For a heap insert, we only need to check for table-level SSI locks. Our
1903  * new tuple can't possibly conflict with existing tuple locks, and heap
1904  * page locks are only consolidated versions of tuple locks; they do not
1905  * lock "gaps" as index page locks do. So we don't need to specify a
1906  * buffer when making the call, which makes for a faster check.
1907  */
1908  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
1909 
1910  /* NO EREPORT(ERROR) from here till changes are logged */
1911  START_CRIT_SECTION();
1912 
1913  RelationPutHeapTuple(relation, buffer, heaptup,
1914  (options & HEAP_INSERT_SPECULATIVE) != 0);
1915 
1916  if (PageIsAllVisible(BufferGetPage(buffer)))
1917  {
1918  all_visible_cleared = true;
1919   PageClearAllVisible(BufferGetPage(buffer));
1920   visibilitymap_clear(relation,
1921  ItemPointerGetBlockNumber(&(heaptup->t_self)),
1922  vmbuffer, VISIBILITYMAP_VALID_BITS);
1923  }
1924 
1925  /*
1926  * XXX Should we set PageSetPrunable on this page ?
1927  *
1928  * The inserting transaction may eventually abort thus making this tuple
1929  * DEAD and hence available for pruning. Though we don't want to optimize
1930  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1931  * aborted tuple will never be pruned until next vacuum is triggered.
1932  *
1933  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1934  */
1935 
1936  MarkBufferDirty(buffer);
1937 
1938  /* XLOG stuff */
1939  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
1940  {
1941  xl_heap_insert xlrec;
1942  xl_heap_header xlhdr;
1943  XLogRecPtr recptr;
1944  Page page = BufferGetPage(buffer);
1945  uint8 info = XLOG_HEAP_INSERT;
1946  int bufflags = 0;
1947 
1948  /*
1949  * If this is a catalog, we need to transmit combocids to properly
1950  * decode, so log that as well.
1951  */
1952   if (RelationIsAccessibleInLogicalDecoding(relation))
1953    log_heap_new_cid(relation, heaptup);
1954 
1955  /*
1956   * If this is the first and only tuple on the page, we can reinit the
1957  * page instead of restoring the whole thing. Set flag, and hide
1958  * buffer references from XLogInsert.
1959  */
1960  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1961    PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1962   {
1963  info |= XLOG_HEAP_INIT_PAGE;
1964  bufflags |= REGBUF_WILL_INIT;
1965  }
1966 
1967  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1968  xlrec.flags = 0;
1969  if (all_visible_cleared)
1970    xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
1971   if (options & HEAP_INSERT_SPECULATIVE)
1972    xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
1973   Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
1974 
1975  /*
1976  * For logical decoding, we need the tuple even if we're doing a full
1977  * page write, so make sure it's included even if we take a full-page
1978  * image. (XXX We could alternatively store a pointer into the FPW).
1979  */
1980  if (RelationIsLogicallyLogged(relation) &&
1981  !(options & HEAP_INSERT_NO_LOGICAL))
1982  {
1983    xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
1984    bufflags |= REGBUF_KEEP_DATA;
1985  }
1986 
1987  XLogBeginInsert();
1988  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1989 
1990  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1991  xlhdr.t_infomask = heaptup->t_data->t_infomask;
1992  xlhdr.t_hoff = heaptup->t_data->t_hoff;
1993 
1994  /*
1995  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1996  * write the whole page to the xlog, we don't need to store
1997  * xl_heap_header in the xlog.
1998  */
1999  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2000  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2001  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2002   XLogRegisterBufData(0,
2003        (char *) heaptup->t_data + SizeofHeapTupleHeader,
2004  heaptup->t_len - SizeofHeapTupleHeader);
2005 
2006  /* filtering by origin on a row level is much more efficient */
2007   XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2008 
2009  recptr = XLogInsert(RM_HEAP_ID, info);
2010 
2011  PageSetLSN(page, recptr);
2012  }
2013 
2014  END_CRIT_SECTION();
2015 
2016  UnlockReleaseBuffer(buffer);
2017  if (vmbuffer != InvalidBuffer)
2018  ReleaseBuffer(vmbuffer);
2019 
2020  /*
2021  * If tuple is cachable, mark it for invalidation from the caches in case
2022  * we abort. Note it is OK to do this after releasing the buffer, because
2023  * the heaptup data structure is all in local memory, not in the shared
2024  * buffer.
2025  */
2026  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2027 
2028  /* Note: speculative insertions are counted too, even if aborted later */
2029  pgstat_count_heap_insert(relation, 1);
2030 
2031  /*
2032  * If heaptup is a private copy, release it. Don't forget to copy t_self
2033  * back to the caller's image, too.
2034  */
2035  if (heaptup != tup)
2036  {
2037  tup->t_self = heaptup->t_self;
2038  heap_freetuple(heaptup);
2039  }
2040 }
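A minimal, hypothetical sketch of the contract described in heap_insert's header comment: after the call returns, the caller's tuple carries the TID actually chosen in t_self, while toasting of attribute data is not reflected back. The helper name and value arrays are placeholders, not from this file.

/* Hypothetical helper: insert one tuple and report the block it landed in. */
static BlockNumber
insert_and_locate(Relation rel, Datum *values, bool *isnull)
{
    HeapTuple   tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);
    BlockNumber blk;

    heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

    /* t_self now carries the TID chosen by heap_insert(); any toasting of
     * the stored copy is not reflected back into this tuple. */
    blk = ItemPointerGetBlockNumber(&tup->t_self);
    heap_freetuple(tup);
    return blk;
}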
2041 
2042 /*
2043  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2044  * tuple header fields and toasts the tuple if necessary. Returns a toasted
2045  * version of the tuple if it was toasted, or the original tuple if not. Note
2046  * that in any case, the header fields are also set in the original tuple.
2047  */
2048 static HeapTuple
2049 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2050      CommandId cid, int options)
2051 {
2052  /*
2053  * Parallel operations are required to be strictly read-only in a parallel
2054  * worker. Parallel inserts are not safe even in the leader in the
2055  * general case, because group locking means that heavyweight locks for
2056  * relation extension or GIN page locks will not conflict between members
2057  * of a lock group, but we don't prohibit that case here because there are
2058  * useful special cases that we can safely allow, such as CREATE TABLE AS.
2059  */
2060  if (IsParallelWorker())
2061  ereport(ERROR,
2062  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2063  errmsg("cannot insert tuples in a parallel worker")));
2064 
2065  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2066  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2067  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2068  HeapTupleHeaderSetXmin(tup->t_data, xid);
2069  if (options & HEAP_INSERT_FROZEN)
2070   HeapTupleHeaderSetXminFrozen(tup->t_data);
2071 
2072  HeapTupleHeaderSetCmin(tup->t_data, cid);
2073  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2074  tup->t_tableOid = RelationGetRelid(relation);
2075 
2076  /*
2077  * If the new tuple is too big for storage or contains already toasted
2078  * out-of-line attributes from some other relation, invoke the toaster.
2079  */
2080  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2081  relation->rd_rel->relkind != RELKIND_MATVIEW)
2082  {
2083  /* toast table entries should never be recursively toasted */
2084   Assert(!HeapTupleHasExternal(tup));
2085   return tup;
2086  }
2087  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2088  return heap_toast_insert_or_update(relation, tup, NULL, options);
2089  else
2090  return tup;
2091 }
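A hedged sketch of the caller-side contract just described: when the returned tuple differs from the input, it is a toasted private copy that the caller must eventually free, exactly as the tail of heap_insert() above does. Variable names are taken from heap_insert's context for illustration only.

/* Hypothetical sketch of the heap_prepare_insert() contract. */
HeapTuple   stored = heap_prepare_insert(relation, tup, xid, cid, options);

if (stored != tup)
{
    /* ... put 'stored' on a page, then propagate the TID and free the copy ... */
    tup->t_self = stored->t_self;
    heap_freetuple(stored);
}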
2092 
2093 /*
2094  * heap_multi_insert - insert multiple tuples into a heap
2095  *
2096  * This is like heap_insert(), but inserts multiple tuples in one operation.
2097  * That's faster than calling heap_insert() in a loop, because when multiple
2098  * tuples can be inserted on a single page, we can write just a single WAL
2099  * record covering all of them, and only need to lock/unlock the page once.
2100  *
2101  * Note: this leaks memory into the current memory context. You can create a
2102  * temporary context before calling this, if that's a problem.
2103  */
2104 void
2105 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2106  CommandId cid, int options, BulkInsertState bistate)
2107 {
2108  TransactionId xid = GetCurrentTransactionId();
2109  HeapTuple *heaptuples;
2110  int i;
2111  int ndone;
2112  PGAlignedBlock scratch;
2113  Page page;
2114  bool needwal;
2115  Size saveFreeSpace;
2116  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2117  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2118 
2119  /* currently not needed (thus unsupported) for heap_multi_insert() */
2120  AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2121 
2122  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2123  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2124             HEAP_DEFAULT_FILLFACTOR);
2125 
2126  /* Toast and set header data in all the slots */
2127  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2128  for (i = 0; i < ntuples; i++)
2129  {
2130  HeapTuple tuple;
2131 
2132  tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2133  slots[i]->tts_tableOid = RelationGetRelid(relation);
2134  tuple->t_tableOid = slots[i]->tts_tableOid;
2135  heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2136  options);
2137  }
2138 
2139  /*
2140  * We're about to do the actual inserts -- but check for conflict first,
2141  * to minimize the possibility of having to roll back work we've just
2142  * done.
2143  *
2144  * A check here does not definitively prevent a serialization anomaly;
2145  * that check MUST be done at least past the point of acquiring an
2146  * exclusive buffer content lock on every buffer that will be affected,
2147  * and MAY be done after all inserts are reflected in the buffers and
2148  * those locks are released; otherwise there is a race condition. Since
2149  * multiple buffers can be locked and unlocked in the loop below, and it
2150  * would not be feasible to identify and lock all of those buffers before
2151  * the loop, we must do a final check at the end.
2152  *
2153  * The check here could be omitted with no loss of correctness; it is
2154  * present strictly as an optimization.
2155  *
2156  * For heap inserts, we only need to check for table-level SSI locks. Our
2157  * new tuples can't possibly conflict with existing tuple locks, and heap
2158  * page locks are only consolidated versions of tuple locks; they do not
2159  * lock "gaps" as index page locks do. So we don't need to specify a
2160  * buffer when making the call, which makes for a faster check.
2161  */
2162  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2163 
2164  ndone = 0;
2165  while (ndone < ntuples)
2166  {
2167  Buffer buffer;
2168  Buffer vmbuffer = InvalidBuffer;
2169  bool all_visible_cleared = false;
2170  int nthispage;
2171 
2172   CHECK_FOR_INTERRUPTS();
2173 
2174  /*
2175  * Find buffer where at least the next tuple will fit. If the page is
2176  * all-visible, this will also pin the requisite visibility map page.
2177  */
2178  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2179  InvalidBuffer, options, bistate,
2180  &vmbuffer, NULL);
2181  page = BufferGetPage(buffer);
2182 
2183  /* NO EREPORT(ERROR) from here till changes are logged */
2184   START_CRIT_SECTION();
2185 
2186  /*
2187  * RelationGetBufferForTuple has ensured that the first tuple fits.
2188  * Put that on the page, and then as many other tuples as fit.
2189  */
2190  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2191  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2192  {
2193  HeapTuple heaptup = heaptuples[ndone + nthispage];
2194 
2195  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2196  break;
2197 
2198  RelationPutHeapTuple(relation, buffer, heaptup, false);
2199 
2200  /*
2201  * We don't use heap_multi_insert for catalog tuples yet, but
2202  * better be prepared...
2203  */
2204  if (needwal && need_cids)
2205  log_heap_new_cid(relation, heaptup);
2206  }
2207 
2208  if (PageIsAllVisible(page))
2209  {
2210  all_visible_cleared = true;
2211  PageClearAllVisible(page);
2212  visibilitymap_clear(relation,
2213  BufferGetBlockNumber(buffer),
2214  vmbuffer, VISIBILITYMAP_VALID_BITS);
2215  }
2216 
2217  /*
2218  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2219  */
2220 
2221  MarkBufferDirty(buffer);
2222 
2223  /* XLOG stuff */
2224  if (needwal)
2225  {
2226  XLogRecPtr recptr;
2227  xl_heap_multi_insert *xlrec;
2228    uint8  info = XLOG_HEAP2_MULTI_INSERT;
2229    char  *tupledata;
2230  int totaldatalen;
2231  char *scratchptr = scratch.data;
2232  bool init;
2233  int bufflags = 0;
2234 
2235  /*
2236  * If the page was previously empty, we can reinit the page
2237  * instead of restoring the whole thing.
2238  */
2239  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2240  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2241 
2242  /* allocate xl_heap_multi_insert struct from the scratch area */
2243  xlrec = (xl_heap_multi_insert *) scratchptr;
2244  scratchptr += SizeOfHeapMultiInsert;
2245 
2246  /*
2247    * Allocate the offsets array, unless we're reinitializing the page;
2248    * in that case the tuples are stored in order starting at
2249    * FirstOffsetNumber and we don't need to store the offsets
2250    * explicitly.
2251  */
2252  if (!init)
2253  scratchptr += nthispage * sizeof(OffsetNumber);
2254 
2255  /* the rest of the scratch space is used for tuple data */
2256  tupledata = scratchptr;
2257 
2258  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2259  xlrec->ntuples = nthispage;
2260 
2261  /*
2262  * Write out an xl_multi_insert_tuple and the tuple data itself
2263  * for each tuple.
2264  */
2265  for (i = 0; i < nthispage; i++)
2266  {
2267  HeapTuple heaptup = heaptuples[ndone + i];
2268  xl_multi_insert_tuple *tuphdr;
2269  int datalen;
2270 
2271  if (!init)
2272  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2273  /* xl_multi_insert_tuple needs two-byte alignment. */
2274  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2275  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2276 
2277  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2278  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2279  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2280 
2281  /* write bitmap [+ padding] [+ oid] + data */
2282  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2283  memcpy(scratchptr,
2284  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2285  datalen);
2286  tuphdr->datalen = datalen;
2287  scratchptr += datalen;
2288  }
2289  totaldatalen = scratchptr - tupledata;
2290  Assert((scratchptr - scratch.data) < BLCKSZ);
2291 
2292  if (need_tuple_data)
2293    xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2294 
2295  /*
2296  * Signal that this is the last xl_heap_multi_insert record
2297  * emitted by this call to heap_multi_insert(). Needed for logical
2298    * decoding so it knows when to clean up temporary data.
2299  */
2300  if (ndone + nthispage == ntuples)
2301  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2302 
2303  if (init)
2304  {
2305  info |= XLOG_HEAP_INIT_PAGE;
2306  bufflags |= REGBUF_WILL_INIT;
2307  }
2308 
2309  /*
2310  * If we're doing logical decoding, include the new tuple data
2311  * even if we take a full-page image of the page.
2312  */
2313  if (need_tuple_data)
2314  bufflags |= REGBUF_KEEP_DATA;
2315 
2316  XLogBeginInsert();
2317  XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2318  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2319 
2320  XLogRegisterBufData(0, tupledata, totaldatalen);
2321 
2322  /* filtering by origin on a row level is much more efficient */
2323    XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2324 
2325  recptr = XLogInsert(RM_HEAP2_ID, info);
2326 
2327  PageSetLSN(page, recptr);
2328  }
2329 
2330  END_CRIT_SECTION();
2331 
2332  UnlockReleaseBuffer(buffer);
2333  if (vmbuffer != InvalidBuffer)
2334  ReleaseBuffer(vmbuffer);
2335 
2336  ndone += nthispage;
2337  }
2338 
2339  /*
2340  * We're done with the actual inserts. Check for conflicts again, to
2341  * ensure that all rw-conflicts in to these inserts are detected. Without
2342  * this final check, a sequential scan of the heap may have locked the
2343  * table after the "before" check, missing one opportunity to detect the
2344  * conflict, and then scanned the table before the new tuples were there,
2345  * missing the other chance to detect the conflict.
2346  *
2347  * For heap inserts, we only need to check for table-level SSI locks. Our
2348  * new tuples can't possibly conflict with existing tuple locks, and heap
2349  * page locks are only consolidated versions of tuple locks; they do not
2350  * lock "gaps" as index page locks do. So we don't need to specify a
2351  * buffer when making the call.
2352  */
2353  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2354 
2355  /*
2356  * If tuples are cachable, mark them for invalidation from the caches in
2357  * case we abort. Note it is OK to do this after releasing the buffer,
2358  * because the heaptuples data structure is all in local memory, not in
2359  * the shared buffer.
2360  */
2361  if (IsCatalogRelation(relation))
2362  {
2363  for (i = 0; i < ntuples; i++)
2364  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2365  }
2366 
2367  /* copy t_self fields back to the caller's slots */
2368  for (i = 0; i < ntuples; i++)
2369  slots[i]->tts_tid = heaptuples[i]->t_self;
2370 
2371  pgstat_count_heap_insert(relation, ntuples);
2372 }
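The header comment of heap_multi_insert() notes that it leaks memory into the current context and suggests a temporary context as the remedy. A hedged sketch of that pattern (the context name, helper name, and slot setup are hypothetical):

/* Hypothetical: confine heap_multi_insert()'s allocations to a short-lived context. */
static void
multi_insert_batch(Relation rel, TupleTableSlot **slots, int nslots)
{
    MemoryContext batchcxt = AllocSetContextCreate(CurrentMemoryContext,
                                                   "multi-insert batch",
                                                   ALLOCSET_DEFAULT_SIZES);
    MemoryContext oldcxt = MemoryContextSwitchTo(batchcxt);

    heap_multi_insert(rel, slots, nslots, GetCurrentCommandId(true), 0, NULL);

    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(batchcxt);  /* reclaims the per-call allocations */
}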
2373 
2374 /*
2375  * simple_heap_insert - insert a tuple
2376  *
2377  * Currently, this routine differs from heap_insert only in supplying
2378  * a default command ID and not allowing access to the speedup options.
2379  *
2380  * This should be used rather than using heap_insert directly in most places
2381  * where we are modifying system catalogs.
2382  */
2383 void
2384 simple_heap_insert(Relation relation, HeapTuple tup)
2385 {
2386  heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2387 }
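A hedged usage sketch: forming a tuple and inserting it with the default command ID, as simple_heap_insert() is intended to be used. In real catalog code the usual entry point is CatalogTupleInsert(), which additionally maintains the catalog's indexes; the helper name and value arrays below are placeholders.

/* Hypothetical: minimal insert through simple_heap_insert(). */
static void
insert_row(Relation rel, Datum *values, bool *isnull)
{
    HeapTuple   tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);

    simple_heap_insert(rel, tup);   /* same as heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL) */
    heap_freetuple(tup);
}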
2388 
2389 /*
2390  * Given infomask/infomask2, compute the bits that must be saved in the
2391  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2392  * xl_heap_lock_updated WAL records.
2393  *
2394  * See fix_infomask_from_infobits.
2395  */
2396 static uint8
2397 compute_infobits(uint16 infomask, uint16 infomask2)
2398 {
2399  return
2400  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2401  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2402  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2403  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2404  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2405  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2406  XLHL_KEYS_UPDATED : 0);
2407 }
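A worked example of the mapping above, using only bits visible in this function: a plain FOR UPDATE row lock (exclusive lock, lock-only) maps to the two corresponding XLHL flags. HEAP_XMAX_SHR_LOCK needs no bit of its own because it is the combination of the exclusive and key-share lock bits, which are each represented.

/* Worked example of compute_infobits(): a plain FOR UPDATE row lock. */
uint16  infomask  = HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_LOCK_ONLY;
uint16  infomask2 = 0;

Assert(compute_infobits(infomask, infomask2) ==
       (XLHL_XMAX_EXCL_LOCK | XLHL_XMAX_LOCK_ONLY));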
2408 
2409 /*
2410  * Given two versions of the same t_infomask for a tuple, compare them and
2411  * return whether the relevant status for a tuple Xmax has changed. This is
2412  * used after a buffer lock has been released and reacquired: we want to ensure
2413  * that the tuple state continues to be the same it was when we previously
2414  * examined it.
2415  *
2416  * Note the Xmax field itself must be compared separately.
2417  */
2418 static inline bool
2419 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2420 {
2421  const uint16 interesting =
2422   HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_LOCK_ONLY;
2423 
2424  if ((new_infomask & interesting) != (old_infomask & interesting))
2425  return true;
2426 
2427  return false;
2428 }
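A small illustration of the check above, under the assumption that the "interesting" mask covers only the multixact, lock-strength, and lock-only bits (as reconstructed above) and therefore excludes hint bits such as HEAP_XMAX_COMMITTED:

/* Illustration: only lock-related infomask bits matter to xmax_infomask_changed(). */
uint16  before = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;

/* Gaining a hint bit is not a relevant change... */
Assert(!xmax_infomask_changed(before | HEAP_XMAX_COMMITTED, before));
/* ...but gaining a stronger lock bit is. */
Assert(xmax_infomask_changed(before | HEAP_XMAX_EXCL_LOCK, before));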
2429 
2430 /*
2431  * heap_delete - delete a tuple
2432  *
2433  * See table_tuple_delete() for an explanation of the parameters, except that
2434  * this routine directly takes a tuple rather than a slot.
2435  *
2436  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2437  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2438  * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2439  * generated by another transaction).
2440  */
2441 TM_Result
2442 heap_delete(Relation relation, ItemPointer tid,
2443     CommandId cid, Snapshot crosscheck, bool wait,
2444  TM_FailureData *tmfd, bool changingPart)
2445 {
2446  TM_Result result;
2447  TransactionId xid = GetCurrentTransactionId();
2448  ItemId lp;
2449  HeapTupleData tp;
2450  Page page;
2451  BlockNumber block;
2452  Buffer buffer;
2453  Buffer vmbuffer = InvalidBuffer;
2454  TransactionId new_xmax;
2455  uint16 new_infomask,
2456  new_infomask2;
2457  bool have_tuple_lock = false;
2458  bool iscombo;
2459  bool all_visible_cleared = false;
2460  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2461  bool old_key_copied = false;
2462 
2463  Assert(ItemPointerIsValid(tid));
2464 
2465  /*
2466  * Forbid this during a parallel operation, lest it allocate a combocid.
2467  * Other workers might need that combocid for visibility checks, and we
2468  * have no provision for broadcasting it to them.
2469  */
2470  if (IsInParallelMode())
2471  ereport(ERROR,
2472  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2473  errmsg("cannot delete tuples during a parallel operation")));
2474 
2475  block = ItemPointerGetBlockNumber(tid);
2476  buffer = ReadBuffer(relation, block);
2477  page = BufferGetPage(buffer);
2478 
2479  /*
2480  * Before locking the buffer, pin the visibility map page if it appears to
2481  * be necessary. Since we haven't got the lock yet, someone else might be
2482  * in the middle of changing this, so we'll need to recheck after we have
2483  * the lock.
2484  */
2485  if (PageIsAllVisible(page))
2486  visibilitymap_pin(relation, block, &vmbuffer);
2487 
2488  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2489 
2490  /*
2491  * If we didn't pin the visibility map page and the page has become all
2492  * visible while we were busy locking the buffer, we'll have to unlock and
2493  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2494  * unfortunate, but hopefully shouldn't happen often.
2495  */
2496  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2497  {
2498  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2499  visibilitymap_pin(relation, block, &vmbuffer);
2500   LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2501  }
2502 
2503  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2504  Assert(ItemIdIsNormal(lp));
2505 
2506  tp.t_tableOid = RelationGetRelid(relation);
2507  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2508  tp.t_len = ItemIdGetLength(lp);
2509  tp.t_self = *tid;
2510 
2511 l1:
2512  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2513 
2514  if (result == TM_Invisible)
2515  {
2516  UnlockReleaseBuffer(buffer);
2517  ereport(ERROR,
2518  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2519  errmsg("attempted to delete invisible tuple")));
2520  }
2521  else if (result == TM_BeingModified && wait)
2522  {
2523  TransactionId xwait;
2524  uint16 infomask;
2525 
2526  /* must copy state data before unlocking buffer */
2527  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2528  infomask = tp.t_data->t_infomask;
2529 
2530  /*
2531  * Sleep until concurrent transaction ends -- except when there's a
2532  * single locker and it's our own transaction. Note we don't care
2533  * which lock mode the locker has, because we need the strongest one.
2534  *
2535  * Before sleeping, we need to acquire tuple lock to establish our
2536  * priority for the tuple (see heap_lock_tuple). LockTuple will
2537  * release us when we are next-in-line for the tuple.
2538  *
2539  * If we are forced to "start over" below, we keep the tuple lock;
2540  * this arranges that we stay at the head of the line while rechecking
2541  * tuple state.
2542  */
2543  if (infomask & HEAP_XMAX_IS_MULTI)
2544  {
2545  bool current_is_member = false;
2546 
2547  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2548  LockTupleExclusive, &current_is_member))
2549  {
2550  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2551 
2552  /*
2553  * Acquire the lock, if necessary (but skip it when we're
2554  * requesting a lock and already have one; avoids deadlock).
2555  */
2556  if (!current_is_member)
2557     heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2558          LockWaitBlock, &have_tuple_lock);
2559 
2560  /* wait for multixact */
2561     MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2562         relation, &(tp.t_self), XLTW_Delete,
2563         NULL);
2564     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2565 
2566  /*
2567  * If xwait had just locked the tuple then some other xact
2568  * could update this tuple before we get to this point. Check
2569  * for xmax change, and start over if so.
2570  */
2571  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2572      !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2573           xwait))
2574  goto l1;
2575  }
2576 
2577  /*
2578  * You might think the multixact is necessarily done here, but not
2579  * so: it could have surviving members, namely our own xact or
2580  * other subxacts of this backend. It is legal for us to delete
2581  * the tuple in either case, however (the latter case is
2582  * essentially a situation of upgrading our former shared lock to
2583  * exclusive). We don't bother changing the on-disk hint bits
2584  * since we are about to overwrite the xmax altogether.
2585  */
2586  }
2587  else if (!TransactionIdIsCurrentTransactionId(xwait))
2588  {
2589  /*
2590  * Wait for regular transaction to end; but first, acquire tuple
2591  * lock.
2592  */
2593  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2594    heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2595         LockWaitBlock, &have_tuple_lock);
2596  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2597    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2598 
2599  /*
2600  * xwait is done, but if xwait had just locked the tuple then some
2601  * other xact could update this tuple before we get to this point.
2602  * Check for xmax change, and start over if so.
2603  */
2604  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2605     !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2606          xwait))
2607  goto l1;
2608 
2609  /* Otherwise check if it committed or aborted */
2610  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2611  }
2612 
2613  /*
2614  * We may overwrite if previous xmax aborted, or if it committed but
2615  * only locked the tuple without updating it.
2616  */
2617  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2618    HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2619    HeapTupleHeaderIsOnlyLocked(tp.t_data))
2620    result = TM_Ok;
2621  else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid) ||
2622      HeapTupleHeaderIndicatesMovedPartitions(tp.t_data))
2623    result = TM_Updated;
2624  else
2625  result = TM_Deleted;
2626  }
2627 
2628  if (crosscheck != InvalidSnapshot && result == TM_Ok)
2629  {
2630  /* Perform additional check for transaction-snapshot mode RI updates */
2631  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2632  result = TM_Updated;
2633  }
2634 
2635  if (result != TM_Ok)
2636  {
2637  Assert(result == TM_SelfModified ||
2638  result == TM_Updated ||
2639  result == TM_Deleted ||
2640  result == TM_BeingModified);
2641   Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2642   Assert(result != TM_Updated ||
2643  !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2644  tmfd->ctid = tp.t_data->t_ctid;
2645   tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2646   if (result == TM_SelfModified)
2647  tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2648  else
2649  tmfd->cmax = InvalidCommandId;
2650  UnlockReleaseBuffer(buffer);
2651  if (have_tuple_lock)
2652  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2653  if (vmbuffer != InvalidBuffer)
2654  ReleaseBuffer(vmbuffer);
2655  return result;
2656  }
2657 
2658  /*
2659  * We're about to do the actual delete -- check for conflict first, to
2660  * avoid possibly having to roll back work we've just done.
2661  *
2662  * This is safe without a recheck as long as there is no possibility of
2663  * another process scanning the page between this check and the delete
2664  * being visible to the scan (i.e., an exclusive buffer content lock is
2665  * continuously held from this point until the tuple delete is visible).
2666  */
2667  CheckForSerializableConflictIn(relation, &tp, buffer);
2668 
2669  /* replace cid with a combo cid if necessary */
2670  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2671 
2672  /*
2673  * Compute replica identity tuple before entering the critical section so
2674  * we don't PANIC upon a memory allocation failure.
2675  */
2676  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2677 
2678  /*
2679  * If this is the first possibly-multixact-able operation in the current
2680  * transaction, set my per-backend OldestMemberMXactId setting. We can be
2681  * certain that the transaction will never become a member of any older
2682  * MultiXactIds than that. (We have to do this even if we end up just
2683  * using our own TransactionId below, since some other backend could
2684  * incorporate our XID into a MultiXact immediately afterwards.)
2685  */
2686  MultiXactIdSetOldestMember();
2687 
2688  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2689         tp.t_data->t_infomask, tp.t_data->t_infomask2,
2690         xid, LockTupleExclusive, true,
2691  &new_xmax, &new_infomask, &new_infomask2);
2692 
2693  START_CRIT_SECTION();
2694 
2695  /*
2696  * If this transaction commits, the tuple will become DEAD sooner or
2697  * later. Set flag that this page is a candidate for pruning once our xid
2698  * falls below the OldestXmin horizon. If the transaction finally aborts,
2699  * the subsequent page pruning will be a no-op and the hint will be
2700  * cleared.
2701  */
2702  PageSetPrunable(page, xid);
2703 
2704  if (PageIsAllVisible(page))
2705  {
2706  all_visible_cleared = true;
2707  PageClearAllVisible(page);
2708  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2709  vmbuffer, VISIBILITYMAP_VALID_BITS);
2710  }
2711 
2712  /* store transaction information of xact deleting the tuple */
2713  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2714  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2715  tp.t_data->t_infomask |= new_infomask;
2716  tp.t_data->t_infomask2 |= new_infomask2;
2717  HeapTupleHeaderClearHotUpdated(tp.t_data);
2718  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2719  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2720  /* Make sure there is no forward chain link in t_ctid */
2721  tp.t_data->t_ctid = tp.t_self;
2722 
2723  /* Signal that this is actually a move into another partition */
2724  if (changingPart)
2725   HeapTupleHeaderSetMovedPartitions(tp.t_data);
2726 
2727  MarkBufferDirty(buffer);
2728 
2729  /*
2730  * XLOG stuff
2731  *
2732  * NB: heap_abort_speculative() uses the same xlog record and replay
2733  * routines.
2734  */
2735  if (RelationNeedsWAL(relation))
2736  {
2737  xl_heap_delete xlrec;
2738  xl_heap_header xlhdr;
2739  XLogRecPtr recptr;
2740 
2741  /* For logical decode we need combocids to properly decode the catalog */
2742   if (RelationIsAccessibleInLogicalDecoding(relation))
2743    log_heap_new_cid(relation, &tp);
2744 
2745  xlrec.flags = 0;
2746  if (all_visible_cleared)
2747    xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2748   if (changingPart)
2749    xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2750   xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2751             tp.t_data->t_infomask2);
2752   xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
2753   xlrec.xmax = new_xmax;
2754 
2755  if (old_key_tuple != NULL)
2756  {
2757  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2758     xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
2759    else
2760     xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
2761   }
2762 
2763  XLogBeginInsert();
2764  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2765 
2766  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2767 
2768  /*
2769  * Log replica identity of the deleted tuple if there is one
2770  */
2771  if (old_key_tuple != NULL)
2772  {
2773  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2774  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2775  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2776 
2777  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2778  XLogRegisterData((char *) old_key_tuple->t_data
2779        + SizeofHeapTupleHeader,
2780        old_key_tuple->t_len
2781        - SizeofHeapTupleHeader);
2782   }
2783 
2784  /* filtering by origin on a row level is much more efficient */
2785   XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2786 
2787  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2788 
2789  PageSetLSN(page, recptr);
2790  }
2791 
2792  END_CRIT_SECTION();
2793 
2794  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2795 
2796  if (vmbuffer != InvalidBuffer)
2797  ReleaseBuffer(vmbuffer);
2798 
2799  /*
2800  * If the tuple has toasted out-of-line attributes, we need to delete
2801  * those items too. We have to do this before releasing the buffer
2802  * because we need to look at the contents of the tuple, but it's OK to
2803  * release the content lock on the buffer first.
2804  */
2805  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2806  relation->rd_rel->relkind != RELKIND_MATVIEW)
2807  {
2808  /* toast table entries should never be recursively toasted */
2809   Assert(!HeapTupleHasExternal(&tp));
2810  }
2811  else if (HeapTupleHasExternal(&tp))
2812  heap_toast_delete(relation, &tp, false);
2813 
2814  /*
2815  * Mark tuple for invalidation from system caches at next command
2816  * boundary. We have to do this before releasing the buffer because we
2817  * need to look at the contents of the tuple.
2818  */
2819  CacheInvalidateHeapTuple(relation, &tp, NULL);
2820 
2821  /* Now we can release the buffer */
2822  ReleaseBuffer(buffer);
2823 
2824  /*
2825  * Release the lmgr tuple lock, if we had it.
2826  */
2827  if (have_tuple_lock)
2828  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2829 
2830  pgstat_count_heap_delete(relation);
2831 
2832  if (old_key_tuple != NULL && old_key_copied)
2833  heap_freetuple(old_key_tuple);
2834 
2835  return TM_Ok;
2836 }
2837 
2838 /*
2839  * simple_heap_delete - delete a tuple
2840  *
2841  * This routine may be used to delete a tuple when concurrent updates of
2842  * the target tuple are not expected (for example, because we have a lock
2843  * on the relation associated with the tuple). Any failure is reported
2844  * via ereport().
2845  */
2846 void
2847 simple_heap_delete(Relation relation, ItemPointer tid)
2848 {
2849  TM_Result result;
2850  TM_FailureData tmfd;
2851 
2852  result = heap_delete(relation, tid,
2853        GetCurrentCommandId(true), InvalidSnapshot,
2854        true /* wait for commit */ ,
2855  &tmfd, false /* changingPart */ );
2856  switch (result)
2857  {
2858  case TM_SelfModified:
2859  /* Tuple was already updated in current command? */
2860  elog(ERROR, "tuple already updated by self");
2861  break;
2862 
2863  case TM_Ok:
2864  /* done successfully */
2865  break;
2866 
2867  case TM_Updated:
2868  elog(ERROR, "tuple concurrently updated");
2869  break;
2870 
2871  case TM_Deleted:
2872  elog(ERROR, "tuple concurrently deleted");
2873  break;
2874 
2875  default:
2876  elog(ERROR, "unrecognized heap_delete status: %u", result);
2877  break;
2878  }
2879 }
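A hedged sketch of typical caller-side usage: obtaining a TID from a catalog scan and deleting the matching rows in place. The catalog OID, scan key, and helper name are hypothetical placeholders; real catalog code normally goes through CatalogTupleDelete().

/* Hypothetical: delete every row a (placeholder) scan key matches. */
static void
delete_matching_rows(Oid catalogOid, ScanKeyData *skey)
{
    Relation    rel = table_open(catalogOid, RowExclusiveLock);
    SysScanDesc scan = systable_beginscan(rel, InvalidOid, false, NULL, 1, skey);
    HeapTuple   tup;

    while ((tup = systable_getnext(scan)) != NULL)
        simple_heap_delete(rel, &tup->t_self);

    systable_endscan(scan);
    table_close(rel, RowExclusiveLock);
}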
2880 
2881 /*
2882  * heap_update - replace a tuple
2883  *
2884  * See table_tuple_update() for an explanation of the parameters, except that
2885  * this routine directly takes a tuple rather than a slot.
2886  *
2887  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2888  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2889  * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2890  * generated by another transaction).
2891  */
2892 TM_Result
2893 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2894     CommandId cid, Snapshot crosscheck, bool wait,
2895  TM_FailureData *tmfd, LockTupleMode *lockmode)
2896 {
2897  TM_Result result;
2898  TransactionId xid = GetCurrentTransactionId();
2899  Bitmapset *hot_attrs;
2900  Bitmapset *key_attrs;
2901  Bitmapset *id_attrs;
2902  Bitmapset *interesting_attrs;
2903  Bitmapset *modified_attrs;
2904  ItemId lp;
2905  HeapTupleData oldtup;
2906  HeapTuple heaptup;
2907  HeapTuple old_key_tuple = NULL;
2908  bool old_key_copied = false;
2909  Page page;
2910  BlockNumber block;
2911  MultiXactStatus mxact_status;
2912  Buffer buffer,
2913  newbuf,
2914  vmbuffer = InvalidBuffer,
2915  vmbuffer_new = InvalidBuffer;
2916  bool need_toast;
2917  Size newtupsize,
2918  pagefree;
2919  bool have_tuple_lock = false;
2920  bool iscombo;
2921  bool use_hot_update = false;
2922  bool hot_attrs_checked = false;
2923  bool key_intact;
2924  bool all_visible_cleared = false;
2925  bool all_visible_cleared_new = false;
2926  bool checked_lockers;
2927  bool locker_remains;
2928  TransactionId xmax_new_tuple,
2929  xmax_old_tuple;
2930  uint16 infomask_old_tuple,
2931  infomask2_old_tuple,
2932  infomask_new_tuple,
2933  infomask2_new_tuple;
2934 
2935  Assert(ItemPointerIsValid(otid));
2936 
2937  /*
2938  * Forbid this during a parallel operation, lest it allocate a combocid.
2939  * Other workers might need that combocid for visibility checks, and we
2940  * have no provision for broadcasting it to them.
2941  */
2942  if (IsInParallelMode())
2943  ereport(ERROR,
2944  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2945  errmsg("cannot update tuples during a parallel operation")));
2946 
2947  /*
2948  * Fetch the list of attributes to be checked for various operations.
2949  *
2950  * For HOT considerations, this is wasted effort if we fail to update or
2951  * have to put the new tuple on a different page. But we must compute the
2952  * list before obtaining buffer lock --- in the worst case, if we are
2953  * doing an update on one of the relevant system catalogs, we could
2954  * deadlock if we try to fetch the list later. In any case, the relcache
2955  * caches the data so this is usually pretty cheap.
2956  *
2957  * We also need columns used by the replica identity and columns that are
2958  * considered the "key" of rows in the table.
2959  *
2960  * Note that we get copies of each bitmap, so we need not worry about
2961  * relcache flush happening midway through.
2962  */
2963  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
2964  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
2965  id_attrs = RelationGetIndexAttrBitmap(relation,
2966           INDEX_ATTR_BITMAP_IDENTITY_KEY);
2967 
2968 
2969  block = ItemPointerGetBlockNumber(otid);
2970  buffer = ReadBuffer(relation, block);
2971  page = BufferGetPage(buffer);
2972 
2973  interesting_attrs = NULL;
2974 
2975  /*
2976  * If the page is already full, there is hardly any chance of doing a HOT
2977  * update on this page. It might be wasteful effort to look for index
2978  * column updates only to later reject HOT updates for lack of space in
2979  * the same page. So we are conservative and only fetch hot_attrs if the
2980  * page is not already full. Since we are already holding a pin on the
2981  * buffer, there is no chance that the buffer can get cleaned up
2982  * concurrently and even if that was possible, in the worst case we lose a
2983  * chance to do a HOT update.
2984  */
2985  if (!PageIsFull(page))
2986  {
2987  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
2988  hot_attrs_checked = true;
2989  }
2990  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
2991  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
2992 
2993  /*
2994  * Before locking the buffer, pin the visibility map page if it appears to
2995  * be necessary. Since we haven't got the lock yet, someone else might be
2996  * in the middle of changing this, so we'll need to recheck after we have
2997  * the lock.
2998  */
2999  if (PageIsAllVisible(page))
3000  visibilitymap_pin(relation, block, &vmbuffer);
3001 
3002  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3003 
3004  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3005  Assert(ItemIdIsNormal(lp));
3006 
3007  /*
3008  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3009  * properly.
3010  */
3011  oldtup.t_tableOid = RelationGetRelid(relation);
3012  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3013  oldtup.t_len = ItemIdGetLength(lp);
3014  oldtup.t_self = *otid;
3015 
3016  /* the new tuple is ready, except for this: */
3017  newtup->t_tableOid = RelationGetRelid(relation);
3018 
3019  /* Determine columns modified by the update. */
3020  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3021  &oldtup, newtup);
3022 
3023  /*
3024  * If we're not updating any "key" column, we can grab a weaker lock type.
3025  * This allows for more concurrency when we are running simultaneously
3026  * with foreign key checks.
3027  *
3028  * Note that if a column gets detoasted while executing the update, but
3029  * the value ends up being the same, this test will fail and we will use
3030  * the stronger lock. This is acceptable; the important case to optimize
3031  * is updates that don't manipulate key columns, not those that
3032  * serendipitously arrive at the same key values.
3033  */
3034  if (!bms_overlap(modified_attrs, key_attrs))
3035  {
3036  *lockmode = LockTupleNoKeyExclusive;
3037  mxact_status = MultiXactStatusNoKeyUpdate;
3038  key_intact = true;
3039 
3040  /*
3041  * If this is the first possibly-multixact-able operation in the
3042  * current transaction, set my per-backend OldestMemberMXactId
3043  * setting. We can be certain that the transaction will never become a
3044  * member of any older MultiXactIds than that. (We have to do this
3045  * even if we end up just using our own TransactionId below, since
3046  * some other backend could incorporate our XID into a MultiXact
3047  * immediately afterwards.)
3048  */
3049   MultiXactIdSetOldestMember();
3050  }
3051  else
3052  {
3053  *lockmode = LockTupleExclusive;
3054  mxact_status = MultiXactStatusUpdate;
3055  key_intact = false;
3056  }
3057 
3058  /*
3059  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3060  * otid may very well point at newtup->t_self, which we will overwrite
3061  * with the new tuple's location, so there's great risk of confusion if we
3062  * use otid anymore.
3063  */
3064 
3065 l2:
3066  checked_lockers = false;
3067  locker_remains = false;
3068  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3069 
3070  /* see below about the "no wait" case */
3071  Assert(result != TM_BeingModified || wait);
3072 
3073  if (result == TM_Invisible)
3074  {
3075  UnlockReleaseBuffer(buffer);
3076  ereport(ERROR,
3077  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3078  errmsg("attempted to update invisible tuple")));
3079  }
3080  else if (result == TM_BeingModified && wait)
3081  {
3082  TransactionId xwait;
3083  uint16 infomask;
3084  bool can_continue = false;
3085 
3086  /*
3087  * XXX note that we don't consider the "no wait" case here. This
3088  * isn't a problem currently because no caller uses that case, but it
3089  * should be fixed if such a caller is introduced. It wasn't a
3090  * problem previously because this code would always wait, but now
3091  * that some tuple locks do not conflict with one of the lock modes we
3092  * use, it is possible that this case is interesting to handle
3093  * specially.
3094  *
3095  * This may cause failures with third-party code that calls
3096  * heap_update directly.
3097  */
3098 
3099  /* must copy state data before unlocking buffer */
3100  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3101  infomask = oldtup.t_data->t_infomask;
3102 
3103  /*
3104  * Now we have to do something about the existing locker. If it's a
3105  * multi, sleep on it; we might be awakened before it is completely
3106  * gone (or even not sleep at all in some cases); we need to preserve
3107  * it as locker, unless it is gone completely.
3108  *
3109  * If it's not a multi, we need to check for sleeping conditions
3110  * before actually going to sleep. If the update doesn't conflict
3111  * with the locks, we just continue without sleeping (but making sure
3112  * it is preserved).
3113  *
3114  * Before sleeping, we need to acquire tuple lock to establish our
3115  * priority for the tuple (see heap_lock_tuple). LockTuple will
3116  * release us when we are next-in-line for the tuple. Note we must
3117  * not acquire the tuple lock until we're sure we're going to sleep;
3118  * otherwise we're open for race conditions with other transactions
3119  * holding the tuple lock which sleep on us.
3120  *
3121  * If we are forced to "start over" below, we keep the tuple lock;
3122  * this arranges that we stay at the head of the line while rechecking
3123  * tuple state.
3124  */
3125  if (infomask & HEAP_XMAX_IS_MULTI)
3126  {
3127  TransactionId update_xact;
3128  int remain;
3129  bool current_is_member = false;
3130 
3131  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3132  *lockmode, &current_is_member))
3133  {
3134  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3135 
3136  /*
3137  * Acquire the lock, if necessary (but skip it when we're
3138  * requesting a lock and already have one; avoids deadlock).
3139  */
3140  if (!current_is_member)
3141  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3142  LockWaitBlock, &have_tuple_lock);
3143 
3144  /* wait for multixact */
3145  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3146  relation, &oldtup.t_self, XLTW_Update,
3147  &remain);
3148  checked_lockers = true;
3149  locker_remains = remain != 0;
3150     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3151 
3152  /*
3153  * If xwait had just locked the tuple then some other xact
3154  * could update this tuple before we get to this point. Check
3155  * for xmax change, and start over if so.
3156  */
3157     if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3158           infomask) ||
3159      !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3160           xwait))
3161  goto l2;
3162  }
3163 
3164  /*
3165  * Note that the multixact may not be done by now. It could have
3166  * surviving members; our own xact or other subxacts of this
3167  * backend, and also any other concurrent transaction that locked
3168  * the tuple with LockTupleKeyShare if we only got
3169  * LockTupleNoKeyExclusive. If this is the case, we have to be
3170  * careful to mark the updated tuple with the surviving members in
3171  * Xmax.
3172  *
3173  * Note that there could have been another update in the
3174  * MultiXact. In that case, we need to check whether it committed
3175  * or aborted. If it aborted we are safe to update it again;
3176  * otherwise there is an update conflict, and we have to return
3177  * TableTuple{Deleted, Updated} below.
3178  *
3179  * In the LockTupleExclusive case, we still need to preserve the
3180  * surviving members: those would include the tuple locks we had
3181  * before this one, which are important to keep in case this
3182  * subxact aborts.
3183  */
3184    if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3185     update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3186  else
3187  update_xact = InvalidTransactionId;
3188 
3189  /*
3190  * There was no UPDATE in the MultiXact; or it aborted. No
3191  * TransactionIdIsInProgress() call needed here, since we called
3192  * MultiXactIdWait() above.
3193  */
3194  if (!TransactionIdIsValid(update_xact) ||
3195  TransactionIdDidAbort(update_xact))
3196  can_continue = true;
3197  }
3198  else if (TransactionIdIsCurrentTransactionId(xwait))
3199  {
3200  /*
3201  * The only locker is ourselves; we can avoid grabbing the tuple
3202  * lock here, but must preserve our locking information.
3203  */
3204  checked_lockers = true;
3205  locker_remains = true;
3206  can_continue = true;
3207  }
3208  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3209  {
3210  /*
3211  * If it's just a key-share locker, and we're not changing the key
3212  * columns, we don't need to wait for it to end; but we need to
3213  * preserve it as locker.
3214  */
3215  checked_lockers = true;
3216  locker_remains = true;
3217  can_continue = true;
3218  }
3219  else
3220  {
3221  /*
3222  * Wait for regular transaction to end; but first, acquire tuple
3223  * lock.
3224  */
3225  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3226  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3227  LockWaitBlock, &have_tuple_lock);
3228  XactLockTableWait(xwait, relation, &oldtup.t_self,
3229  XLTW_Update);
3230  checked_lockers = true;
3231    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3232 
3233  /*
3234  * xwait is done, but if xwait had just locked the tuple then some
3235  * other xact could update this tuple before we get to this point.
3236  * Check for xmax change, and start over if so.
3237  */
3238  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3239  !TransactionIdEquals(xwait,
3240          HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3241     goto l2;
3242 
3243  /* Otherwise check if it committed or aborted */
3244  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3245  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3246  can_continue = true;
3247  }
3248 
3249  if (can_continue)
3250  result = TM_Ok;
3251  else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid) ||
3252     HeapTupleHeaderIndicatesMovedPartitions(oldtup.t_data))
3253    result = TM_Updated;
3254  else
3255  result = TM_Deleted;
3256  }
3257 
3258  if (crosscheck != InvalidSnapshot && result == TM_Ok)
3259  {
3260  /* Perform additional check for transaction-snapshot mode RI updates */
3261  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3262  {
3263  result = TM_Updated;
3264  Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3265  }
3266  }
3267 
3268  if (result != TM_Ok)
3269  {
3270  Assert(result == TM_SelfModified ||
3271  result == TM_Updated ||
3272  result == TM_Deleted ||
3273  result == TM_BeingModified);
3274  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3275  Assert(result != TM_Updated ||
3276  !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3277  tmfd->ctid = oldtup.t_data->t_ctid;
3278  tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3279  if (result == TM_SelfModified)
3280  tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3281  else
3282  tmfd->cmax = InvalidCommandId;
3283  UnlockReleaseBuffer(buffer);
3284  if (have_tuple_lock)
3285  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3286  if (vmbuffer != InvalidBuffer)
3287  ReleaseBuffer(vmbuffer);
3288  bms_free(hot_attrs);
3289  bms_free(key_attrs);
3290  bms_free(id_attrs);
3291  bms_free(modified_attrs);
3292  bms_free(interesting_attrs);
3293  return result;
3294  }
3295 
3296  /*
3297  * If we didn't pin the visibility map page and the page has become all
3298  * visible while we were busy locking the buffer, or during some
3299  * subsequent window during which we had it unlocked, we'll have to unlock
3300  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3301  * bit unfortunate, especially since we'll now have to recheck whether the
3302  * tuple has been locked or updated under us, but hopefully it won't
3303  * happen very often.
3304  */
3305  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3306  {
3307  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3308  visibilitymap_pin(relation, block, &vmbuffer);
3309   LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3310   goto l2;
3311  }
3312 
3313  /* Fill in transaction status data */
3314 
3315  /*
3316  * If the tuple we're updating is locked, we need to preserve the locking
3317  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3318  */
3319  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3320         oldtup.t_data->t_infomask,
3321  oldtup.t_data->t_infomask2,
3322  xid, *lockmode, true,
3323  &xmax_old_tuple, &infomask_old_tuple,
3324  &infomask2_old_tuple);
3325 
3326  /*
3327  * And also prepare an Xmax value for the new copy of the tuple. If there
3328  * was no xmax previously, or there was one but all lockers are now gone,
3329  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3330  * rare cases that might also be InvalidXid and yet not have the
3331  * HEAP_XMAX_INVALID bit set; that's fine.)
3332  */
3333  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3334   HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask) ||
3335   (checked_lockers && !locker_remains))
3336  xmax_new_tuple = InvalidTransactionId;
3337  else
3338  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3339 
3340  if (!TransactionIdIsValid(xmax_new_tuple))
3341  {
3342  infomask_new_tuple = HEAP_XMAX_INVALID;
3343  infomask2_new_tuple = 0;
3344  }
3345  else
3346  {
3347  /*
3348  * If we found a valid Xmax for the new tuple, then the infomask bits
3349  * to use on the new tuple depend on what was there on the old one.
3350  * Note that since we're doing an update, the only possibility is that
3351  * the lockers had FOR KEY SHARE lock.
3352  */
3353  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3354  {
3355  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3356  &infomask2_new_tuple);
3357  }
3358  else
3359  {
3360  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3361  infomask2_new_tuple = 0;
3362  }
3363  }
3364 
3365  /*
3366  * Prepare the new tuple with the appropriate initial values of Xmin and
3367  * Xmax, as well as initial infomask bits as computed above.
3368  */
3369  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3370  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3371  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3372  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3373  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3374  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3375  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3376 
3377  /*
3378  * Replace cid with a combo cid if necessary. Note that we already put
3379  * the plain cid into the new tuple.
3380  */
3381  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3382 
3383  /*
3384  * If the toaster needs to be activated, OR if the new tuple will not fit
3385  * on the same page as the old, then we need to release the content lock
3386  * (but not the pin!) on the old tuple's buffer while we are off doing
3387  * TOAST and/or table-file-extension work. We must mark the old tuple to
3388  * show that it's locked, else other processes may try to update it
3389  * themselves.
3390  *
3391  * We need to invoke the toaster if there are already any out-of-line
3392  * toasted values present, or if the new tuple is over-threshold.
3393  */
3394  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3395  relation->rd_rel->relkind != RELKIND_MATVIEW)
3396  {
3397  /* toast table entries should never be recursively toasted */
3398  Assert(!HeapTupleHasExternal(&oldtup));
3399  Assert(!HeapTupleHasExternal(newtup));
3400  need_toast = false;
3401  }
3402  else
3403  need_toast = (HeapTupleHasExternal(&oldtup) ||
3404  HeapTupleHasExternal(newtup) ||
3405  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3406 
3407  pagefree = PageGetHeapFreeSpace(page);
3408 
3409  newtupsize = MAXALIGN(newtup->t_len);
3410 
3411  if (need_toast || newtupsize > pagefree)
3412  {
3413  TransactionId xmax_lock_old_tuple;
3414  uint16 infomask_lock_old_tuple,
3415  infomask2_lock_old_tuple;
3416  bool cleared_all_frozen = false;
3417 
3418  /*
3419  * To prevent concurrent sessions from updating the tuple, we have to
3420  * temporarily mark it locked, while we release the page-level lock.
3421  *
3422   * To satisfy the rule that any xid potentially appearing in a buffer
3423   * written out to disk must first be WAL-logged, we unfortunately have to WAL log this
3424  * temporary modification. We can reuse xl_heap_lock for this
3425  * purpose. If we crash/error before following through with the
3426  * actual update, xmax will be of an aborted transaction, allowing
3427  * other sessions to proceed.
3428  */
3429 
3430  /*
3431  * Compute xmax / infomask appropriate for locking the tuple. This has
3432  * to be done separately from the combo that's going to be used for
3433  * updating, because the potentially created multixact would otherwise
3434  * be wrong.
3435  */
3436   compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3437          oldtup.t_data->t_infomask,
3438  oldtup.t_data->t_infomask2,
3439  xid, *lockmode, false,
3440  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3441  &infomask2_lock_old_tuple);
3442 
3443  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3444 
3445   START_CRIT_SECTION();
3446 
3447  /* Clear obsolete visibility flags ... */
3448  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3449  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3450  HeapTupleClearHotUpdated(&oldtup);
3451  /* ... and store info about transaction updating this tuple */
3452  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3453  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3454  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3455  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3456  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3457 
3458  /* temporarily make it look not-updated, but locked */
3459  oldtup.t_data->t_ctid = oldtup.t_self;
3460 
3461  /*
3462  * Clear all-frozen bit on visibility map if needed. We could
3463  * immediately reset ALL_VISIBLE, but given that the WAL logging
3464  * overhead would be unchanged, that doesn't seem necessarily
3465  * worthwhile.
3466  */
3467  if (PageIsAllVisible(BufferGetPage(buffer)) &&
3468  visibilitymap_clear(relation, block, vmbuffer,
3469         VISIBILITYMAP_ALL_FROZEN))
3470    cleared_all_frozen = true;
3471 
3472  MarkBufferDirty(buffer);
3473 
3474  if (RelationNeedsWAL(relation))
3475  {
3476  xl_heap_lock xlrec;
3477  XLogRecPtr recptr;
3478 
3479  XLogBeginInsert();
3480  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3481 
3482  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3483  xlrec.locking_xid = xmax_lock_old_tuple;
3484    xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3485              oldtup.t_data->t_infomask2);
3486  xlrec.flags =
3487  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3488  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3489  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3490  PageSetLSN(page, recptr);
3491  }
3492 
3493  END_CRIT_SECTION();
3494 
3495  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3496 
3497  /*
3498  * Let the toaster do its thing, if needed.
3499  *
3500  * Note: below this point, heaptup is the data we actually intend to
3501  * store into the relation; newtup is the caller's original untoasted
3502  * data.
3503  */
3504  if (need_toast)
3505  {
3506  /* Note we always use WAL and FSM during updates */
3507  heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3508  newtupsize = MAXALIGN(heaptup->t_len);
3509  }
3510  else
3511  heaptup = newtup;
3512 
3513  /*
3514  * Now, do we need a new page for the tuple, or not? This is a bit
3515  * tricky since someone else could have added tuples to the page while
3516  * we weren't looking. We have to recheck the available space after
3517  * reacquiring the buffer lock. But don't bother to do that if the
3518  * former amount of free space is still not enough; it's unlikely
3519  * there's more free now than before.
3520  *
3521  * What's more, if we need to get a new page, we will need to acquire
3522  * buffer locks on both old and new pages. To avoid deadlock against
3523  * some other backend trying to get the same two locks in the other
3524  * order, we must be consistent about the order we get the locks in.
3525  * We use the rule "lock the lower-numbered page of the relation
3526  * first". To implement this, we must do RelationGetBufferForTuple
3527  * while not holding the lock on the old page, and we must rely on it
3528  * to get the locks on both pages in the correct order.
3529  */
3530  if (newtupsize > pagefree)
3531  {
3532  /* Assume there's no chance to put heaptup on same page. */
3533  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3534  buffer, 0, NULL,
3535  &vmbuffer_new, &vmbuffer);
3536  }
3537  else
3538  {
3539  /* Re-acquire the lock on the old tuple's page. */
3540    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3541    /* Re-check using the up-to-date free space */
3542  pagefree = PageGetHeapFreeSpace(page);
3543  if (newtupsize > pagefree)
3544  {
3545  /*
3546  * Rats, it doesn't fit anymore. We must now unlock and
3547  * relock to avoid deadlock. Fortunately, this path should
3548  * seldom be taken.
3549  */
3550  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3551  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3552  buffer, 0, NULL,
3553  &vmbuffer_new, &vmbuffer);
3554  }
3555  else
3556  {
3557  /* OK, it fits here, so we're done. */
3558  newbuf = buffer;
3559  }
3560  }
3561  }
3562  else
3563  {
3564  /* No TOAST work needed, and it'll fit on same page */
3565  newbuf = buffer;
3566  heaptup = newtup;
3567  }
3568 
3569  /*
3570  * We're about to do the actual update -- check for conflict first, to
3571  * avoid possibly having to roll back work we've just done.
3572  *
3573  * This is safe without a recheck as long as there is no possibility of
3574  * another process scanning the pages between this check and the update
3575  * being visible to the scan (i.e., exclusive buffer content lock(s) are
3576  * continuously held from this point until the tuple update is visible).
3577  *
3578  * For the new tuple the only check needed is at the relation level, but
3579  * since both tuples are in the same relation and the check for oldtup
3580  * will include checking the relation level, there is no benefit to a
3581  * separate check for the new tuple.
3582  */
3583  CheckForSerializableConflictIn(relation, &oldtup, buffer);
3584 
3585  /*
3586  * At this point newbuf and buffer are both pinned and locked, and newbuf
3587  * has enough space for the new tuple. If they are the same buffer, only
3588  * one pin is held.
3589  */
3590 
3591  if (newbuf == buffer)
3592  {
3593  /*
3594  * Since the new tuple is going into the same page, we might be able
3595  * to do a HOT update. Check if any of the index columns have been
3596  * changed. If the page was already full, we may have skipped checking
3597  * for index columns, and also can't do a HOT update.
3598  */
3599  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3600  use_hot_update = true;
3601  }
3602  else
3603  {
3604  /* Set a hint that the old page could use prune/defrag */
3605  PageSetFull(page);
3606  }
3607 
3608  /*
3609  * Compute replica identity tuple before entering the critical section so
3610  * we don't PANIC upon a memory allocation failure.
3611  * ExtractReplicaIdentity() will return NULL if nothing needs to be
3612  * logged.
3613  */
3614  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3615  bms_overlap(modified_attrs, id_attrs),
3616  &old_key_copied);
3617 
3618  /* NO EREPORT(ERROR) from here till changes are logged */
3619  START_CRIT_SECTION();
3620 
3621  /*
3622  * If this transaction commits, the old tuple will become DEAD sooner or
3623  * later. Set flag that this page is a candidate for pruning once our xid
3624  * falls below the OldestXmin horizon. If the transaction finally aborts,
3625  * the subsequent page pruning will be a no-op and the hint will be
3626  * cleared.
3627  *
3628  * XXX Should we set hint on newbuf as well? If the transaction aborts,
3629  * there would be a prunable tuple in the newbuf; but for now we choose
3630  * not to optimize for aborts. Note that heap_xlog_update must be kept in
3631  * sync if this decision changes.
3632  */
3633  PageSetPrunable(page, xid);
3634 
3635  if (use_hot_update)
3636  {
3637  /* Mark the old tuple as HOT-updated */
3638  HeapTupleSetHotUpdated(&oldtup);
3639  /* And mark the new tuple as heap-only */
3640  HeapTupleSetHeapOnly(heaptup);
3641  /* Mark the caller's copy too, in case different from heaptup */
3642  HeapTupleSetHeapOnly(newtup);
3643  }
3644  else
3645  {
3646  /* Make sure tuples are correctly marked as not-HOT */
3647  HeapTupleClearHotUpdated(&oldtup);
3648  HeapTupleClearHeapOnly(heaptup);
3649  HeapTupleClearHeapOnly(newtup);
3650  }
3651 
3652  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3653 
3654 
3655  /* Clear obsolete visibility flags, possibly set by ourselves above... */
3656  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3657  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3658  /* ... and store info about transaction updating this tuple */
3659  Assert(TransactionIdIsValid(xmax_old_tuple));
3660  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3661  oldtup.t_data->t_infomask |= infomask_old_tuple;
3662  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3663  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3664 
3665  /* record address of new tuple in t_ctid of old one */
3666  oldtup.t_data->t_ctid = heaptup->t_self;
3667 
3668  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3669  if (PageIsAllVisible(BufferGetPage(buffer)))
3670  {
3671  all_visible_cleared = true;
3672  PageClearAllVisible(BufferGetPage(buffer));
3673  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3674  vmbuffer, VISIBILITYMAP_VALID_BITS);
3675  }
3676  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3677  {
3678  all_visible_cleared_new = true;
3679  PageClearAllVisible(BufferGetPage(newbuf));
3680  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3681  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3682  }
3683 
3684  if (newbuf != buffer)
3685  MarkBufferDirty(newbuf);
3686  MarkBufferDirty(buffer);
3687 
3688  /* XLOG stuff */
3689  if (RelationNeedsWAL(relation))
3690  {
3691  XLogRecPtr recptr;
3692 
3693  /*
3694  * For logical decoding we need combocids to properly decode the
3695  * catalog.
3696  */
3697  if (RelationIsAccessibleInLogicalDecoding(relation))
3698  {
3699  log_heap_new_cid(relation, &oldtup);
3700  log_heap_new_cid(relation, heaptup);
3701  }
3702 
3703  recptr = log_heap_update(relation, buffer,
3704  newbuf, &oldtup, heaptup,
3705  old_key_tuple,
3706  all_visible_cleared,
3707  all_visible_cleared_new);
3708  if (newbuf != buffer)
3709  {
3710  PageSetLSN(BufferGetPage(newbuf), recptr);
3711  }
3712  PageSetLSN(BufferGetPage(buffer), recptr);
3713  }
3714 
3715  END_CRIT_SECTION();
3716 
3717  if (newbuf != buffer)
3718  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3719  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3720 
3721  /*
3722  * Mark old tuple for invalidation from system caches at next command
3723  * boundary, and mark the new tuple for invalidation in case we abort. We
3724  * have to do this before releasing the buffer because oldtup is in the
3725  * buffer. (heaptup is all in local memory, but it's necessary to process
3726  * both tuple versions in one call to inval.c so we can avoid redundant
3727  * sinval messages.)
3728  */
3729  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3730 
3731  /* Now we can release the buffer(s) */
3732  if (newbuf != buffer)
3733  ReleaseBuffer(newbuf);
3734  ReleaseBuffer(buffer);
3735  if (BufferIsValid(vmbuffer_new))
3736  ReleaseBuffer(vmbuffer_new);
3737  if (BufferIsValid(vmbuffer))
3738  ReleaseBuffer(vmbuffer);
3739 
3740  /*
3741  * Release the lmgr tuple lock, if we had it.
3742  */
3743  if (have_tuple_lock)
3744  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3745 
3746  pgstat_count_heap_update(relation, use_hot_update);
3747 
3748  /*
3749  * If heaptup is a private copy, release it. Don't forget to copy t_self
3750  * back to the caller's image, too.
3751  */
3752  if (heaptup != newtup)
3753  {
3754  newtup->t_self = heaptup->t_self;
3755  heap_freetuple(heaptup);
3756  }
3757 
3758  if (old_key_tuple != NULL && old_key_copied)
3759  heap_freetuple(old_key_tuple);
3760 
3761  bms_free(hot_attrs);
3762  bms_free(key_attrs);
3763  bms_free(id_attrs);
3764  bms_free(modified_attrs);
3765  bms_free(interesting_attrs);
3766 
3767  return TM_Ok;
3768 }
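/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * A minimal, hypothetical caller of heap_update(), showing the calling
 * convention described above: on TM_Ok the caller's tuple has had its
 * t_self overwritten with the TID of the new version; on failure the
 * TM_FailureData describes the conflicting update.  "rel", "otid" and
 * "newtup" are assumed to be supplied by the caller, which must already
 * hold a suitable lock on the relation.
 */
#ifdef NOT_USED
static void
example_heap_update_caller(Relation rel, ItemPointer otid, HeapTuple newtup)
{
	TM_Result	result;
	TM_FailureData tmfd;
	LockTupleMode lockmode;

	result = heap_update(rel, otid, newtup,
						 GetCurrentCommandId(true), InvalidSnapshot,
						 true /* wait for concurrent updaters */ ,
						 &tmfd, &lockmode);

	if (result == TM_Ok)
	{
		/* newtup->t_self now identifies the newly stored tuple version */
	}
	else
	{
		/* tmfd.ctid / tmfd.xmax describe who beat us to the tuple */
	}
}
#endif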
3769 
3770 /*
3771  * Check if the specified attribute's value is same in both given tuples.
3772  * Subroutine for HeapDetermineModifiedColumns.
3773  */
3774 static bool
3775 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
3776  HeapTuple tup1, HeapTuple tup2)
3777 {
3778  Datum value1,
3779  value2;
3780  bool isnull1,
3781  isnull2;
3782  Form_pg_attribute att;
3783 
3784  /*
3785  * If it's a whole-tuple reference, say "not equal". It's not really
3786  * worth supporting this case, since it could only succeed after a no-op
3787  * update, which is hardly a case worth optimizing for.
3788  */
3789  if (attrnum == 0)
3790  return false;
3791 
3792  /*
3793  * Likewise, automatically say "not equal" for any system attribute other
3794  * than tableOID; we cannot expect these to be consistent in a HOT chain,
3795  * or even to be set correctly yet in the new tuple.
3796  */
3797  if (attrnum < 0)
3798  {
3799  if (attrnum != TableOidAttributeNumber)
3800  return false;
3801  }
3802 
3803  /*
3804  * Extract the corresponding values. XXX this is pretty inefficient if
3805  * there are many indexed columns. Should HeapDetermineModifiedColumns do
3806  * a single heap_deform_tuple call on each tuple, instead? But that
3807  * doesn't work for system columns ...
3808  */
3809  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
3810  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
3811 
3812  /*
3813  * If one value is NULL and other is not, then they are certainly not
3814  * equal
3815  */
3816  if (isnull1 != isnull2)
3817  return false;
3818 
3819  /*
3820  * If both are NULL, they can be considered equal.
3821  */
3822  if (isnull1)
3823  return true;
3824 
3825  /*
3826  * We do simple binary comparison of the two datums. This may be overly
3827  * strict because there can be multiple binary representations for the
3828  * same logical value. But we should be OK as long as there are no false
3829  * positives. Using a type-specific equality operator is messy because
3830  * there could be multiple notions of equality in different operator
3831  * classes; furthermore, we cannot safely invoke user-defined functions
3832  * while holding exclusive buffer lock.
3833  */
3834  if (attrnum <= 0)
3835  {
3836  /* The only allowed system columns are OIDs, so do this */
3837  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3838  }
3839  else
3840  {
3841  Assert(attrnum <= tupdesc->natts);
3842  att = TupleDescAttr(tupdesc, attrnum - 1);
3843  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3844  }
3845 }
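/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * What the binary comparison above amounts to for an ordinary pass-by-value
 * type: datumIsEqual() only needs the by-value flag and type length, and it
 * may report "not equal" for logically equal values that have different
 * binary representations, which is the safe direction for this caller.
 */
#ifdef NOT_USED
static void
example_datum_binary_compare(void)
{
	Datum		a = Int32GetDatum(42);
	Datum		b = Int32GetDatum(42);

	/* int4 is pass-by-value with a length of 4 bytes */
	Assert(datumIsEqual(a, b, true, sizeof(int32)));
}
#endif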
3846 
3847 /*
3848  * Check which columns are being updated.
3849  *
3850  * Given an updated tuple, determine (and return into the output bitmapset),
3851  * from those listed as interesting, the set of columns that changed.
3852  *
3853  * The input bitmapset is destructively modified; that is OK since this is
3854  * invoked at most once in heap_update.
3855  */
3856 static Bitmapset *
3857  HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
3858  HeapTuple oldtup, HeapTuple newtup)
3859 {
3860  int attnum;
3861  Bitmapset *modified = NULL;
3862 
3863  while ((attnum = bms_first_member(interesting_cols)) >= 0)
3864  {
3865  attnum += FirstLowInvalidHeapAttributeNumber;
3866 
3867  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
3868  attnum, oldtup, newtup))
3869  modified = bms_add_member(modified,
3870  attnum - FirstLowInvalidHeapAttributeNumber);
3871  }
3872 
3873  return modified;
3874 }
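/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * The "interesting" and "modified" bitmapsets above use the relcache
 * convention of offsetting attribute numbers by
 * FirstLowInvalidHeapAttributeNumber, so that system attributes (which are
 * negative) fit in a bitmapset.  A hypothetical caller would add a column
 * of interest like this:
 */
#ifdef NOT_USED
static Bitmapset *
example_add_interesting_column(Bitmapset *cols, AttrNumber attno)
{
	/* shift into the non-negative range used by these bitmapsets */
	return bms_add_member(cols, attno - FirstLowInvalidHeapAttributeNumber);
}
#endif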
3875 
3876 /*
3877  * simple_heap_update - replace a tuple
3878  *
3879  * This routine may be used to update a tuple when concurrent updates of
3880  * the target tuple are not expected (for example, because we have a lock
3881  * on the relation associated with the tuple). Any failure is reported
3882  * via ereport().
3883  */
3884 void
3885  simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
3886  {
3887  TM_Result result;
3888  TM_FailureData tmfd;
3889  LockTupleMode lockmode;
3890 
3891  result = heap_update(relation, otid, tup,
3892  GetCurrentCommandId(true), InvalidSnapshot,
3893  true /* wait for commit */ ,
3894  &tmfd, &lockmode);
3895  switch (result)
3896  {
3897  case TM_SelfModified:
3898  /* Tuple was already updated in current command? */
3899  elog(ERROR, "tuple already updated by self");
3900  break;
3901 
3902  case TM_Ok:
3903  /* done successfully */
3904  break;
3905 
3906  case TM_Updated:
3907  elog(ERROR, "tuple concurrently updated");
3908  break;
3909 
3910  case TM_Deleted:
3911  elog(ERROR, "tuple concurrently deleted");
3912  break;
3913 
3914  default:
3915  elog(ERROR, "unrecognized heap_update status: %u", result);
3916  break;
3917  }
3918 }
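/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * Typical use of simple_heap_update(): build a modified copy of an existing
 * tuple and write it back under the old TID.  "rel", "oldtup", "values",
 * "nulls" and "replace" are assumed caller-provided; note that index entries
 * are not maintained here -- catalog code normally goes through
 * CatalogTupleUpdate(), which handles that as well.
 */
#ifdef NOT_USED
static void
example_simple_update(Relation rel, HeapTuple oldtup,
					  Datum *values, bool *nulls, bool *replace)
{
	HeapTuple	newtup;

	newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
							   values, nulls, replace);
	simple_heap_update(rel, &oldtup->t_self, newtup);
	heap_freetuple(newtup);
}
#endif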
3919 
3920 
3921 /*
3922  * Return the MultiXactStatus corresponding to the given tuple lock mode.
3923  */
3924 static MultiXactStatus
3925  get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
3926  {
3927  int retval;
3928 
3929  if (is_update)
3930  retval = tupleLockExtraInfo[mode].updstatus;
3931  else
3932  retval = tupleLockExtraInfo[mode].lockstatus;
3933 
3934  if (retval == -1)
3935  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
3936  is_update ? "true" : "false");
3937 
3938  return (MultiXactStatus) retval;
3939 }
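/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * The mapping above is driven by tupleLockExtraInfo; for example, a plain
 * share lock corresponds to MultiXactStatusForShare, while an exclusive
 * lock taken for an update corresponds to MultiXactStatusUpdate.  Asking
 * for an update status in a share mode hits the -1 entry and elogs.
 */
#ifdef NOT_USED
static void
example_mxact_status_mapping(void)
{
	Assert(get_mxact_status_for_lock(LockTupleShare, false) ==
		   MultiXactStatusForShare);
	Assert(get_mxact_status_for_lock(LockTupleExclusive, true) ==
		   MultiXactStatusUpdate);
}
#endif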
3940 
3941 /*
3942  * heap_lock_tuple - lock a tuple in shared or exclusive mode
3943  *
3944  * Note that this acquires a buffer pin, which the caller must release.
3945  *
3946  * Input parameters:
3947  * relation: relation containing tuple (caller must hold suitable lock)
3948  * tid: TID of tuple to lock
3949  * cid: current command ID (used for visibility test, and stored into
3950  * tuple's cmax if lock is successful)
3951  * mode: indicates if shared or exclusive tuple lock is desired
3952  * wait_policy: what to do if tuple lock is not available
3953  * follow_updates: if true, follow the update chain to also lock descendant
3954  * tuples.
3955  *
3956  * Output parameters:
3957  * *tuple: all fields filled in
3958  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
3959  * *tmfd: filled in failure cases (see below)
3960  *
3961  * Function results are the same as the ones for table_tuple_lock().
3962  *
3963  * In the failure cases other than TM_Invisible, the routine fills
3964  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
3965  * if necessary), and t_cmax (the last only for TM_SelfModified,
3966  * since we cannot obtain cmax from a combocid generated by another
3967  * transaction).
3968  * See comments for struct TM_FailureData for additional info.
3969  *
3970  * See README.tuplock for a thorough explanation of this mechanism.
3971  */
3972 TM_Result
3973  heap_lock_tuple(Relation relation, HeapTuple tuple,
3974  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
3975  bool follow_updates,
3976  Buffer *buffer, TM_FailureData *tmfd)
3977 {
3978  TM_Result result;
3979  ItemPointer tid = &(tuple->t_self);
3980  ItemId lp;
3981  Page page;
3982  Buffer vmbuffer = InvalidBuffer;
3983  BlockNumber block;
3984  TransactionId xid,
3985  xmax;
3986  uint16 old_infomask,
3987  new_infomask,
3988  new_infomask2;
3989  bool first_time = true;
3990  bool skip_tuple_lock = false;
3991  bool have_tuple_lock = false;
3992  bool cleared_all_frozen = false;
3993 
3994  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3995  block = ItemPointerGetBlockNumber(tid);
3996 
3997  /*
3998  * Before locking the buffer, pin the visibility map page if it appears to
3999  * be necessary. Since we haven't got the lock yet, someone else might be
4000  * in the middle of changing this, so we'll need to recheck after we have
4001  * the lock.
4002  */
4003  if (PageIsAllVisible(BufferGetPage(*buffer)))
4004  visibilitymap_pin(relation, block, &vmbuffer);
4005 
4006  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4007 
4008  page = BufferGetPage(*buffer);
4009  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4010  Assert(ItemIdIsNormal(lp));
4011 
4012  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4013  tuple->t_len = ItemIdGetLength(lp);
4014  tuple->t_tableOid = RelationGetRelid(relation);
4015 
4016 l3:
4017  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4018 
4019  if (result == TM_Invisible)
4020  {
4021  /*
4022  * This is possible, but only when locking a tuple for ON CONFLICT
4023  * UPDATE. We return this value here rather than throwing an error in
4024  * order to give that case the opportunity to throw a more specific
4025  * error.
4026  */
4027  result = TM_Invisible;
4028  goto out_locked;
4029  }
4030  else if (result == TM_BeingModified ||
4031  result == TM_Updated ||
4032  result == TM_Deleted)
4033  {
4034  TransactionId xwait;
4035  uint16 infomask;
4036  uint16 infomask2;
4037  bool require_sleep;
4038  ItemPointerData t_ctid;
4039 
4040  /* must copy state data before unlocking buffer */
4041  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4042  infomask = tuple->t_data->t_infomask;
4043  infomask2 = tuple->t_data->t_infomask2;
4044  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4045 
4046  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4047 
4048  /*
4049  * If any subtransaction of the current top transaction already holds
4050  * a lock as strong as or stronger than what we're requesting, we
4051  * effectively hold the desired lock already. We *must* succeed
4052  * without trying to take the tuple lock, else we will deadlock
4053  * against anyone wanting to acquire a stronger lock.
4054  *
4055  * Note we only do this the first time we loop on the HTSU result;
4056  * there is no point in testing in subsequent passes, because
4057  * evidently our own transaction cannot have acquired a new lock after
4058  * the first time we checked.
4059  */
4060  if (first_time)
4061  {
4062  first_time = false;
4063 
4064  if (infomask & HEAP_XMAX_IS_MULTI)
4065  {
4066  int i;
4067  int nmembers;
4068  MultiXactMember *members;
4069 
4070  /*
4071  * We don't need to allow old multixacts here; if that had
4072  * been the case, HeapTupleSatisfiesUpdate would have returned
4073  * MayBeUpdated and we wouldn't be here.
4074  */
4075  nmembers =
4076  GetMultiXactIdMembers(xwait, &members, false,
4077  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4078 
4079  for (i = 0; i < nmembers; i++)
4080  {
4081  /* only consider members of our own transaction */
4082  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4083  continue;
4084 
4085  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4086  {
4087  pfree(members);
4088  result = TM_Ok;
4089  goto out_unlocked;
4090  }
4091  else
4092  {
4093  /*
4094  * Disable acquisition of the heavyweight tuple lock.
4095  * Otherwise, when promoting a weaker lock, we might
4096  * deadlock with another locker that has acquired the
4097  * heavyweight tuple lock and is waiting for our
4098  * transaction to finish.
4099  *
4100  * Note that in this case we still need to wait for
4101  * the multixact if required, to avoid acquiring
4102  * conflicting locks.
4103  */
4104  skip_tuple_lock = true;
4105  }
4106  }
4107 
4108  if (members)
4109  pfree(members);
4110  }
4111  else if (TransactionIdIsCurrentTransactionId(xwait))
4112  {
4113  switch (mode)
4114  {
4115  case LockTupleKeyShare:
4116  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4117  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4118  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4119  result = TM_Ok;
4120  goto out_unlocked;
4121  case LockTupleShare:
4122  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4123  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4124  {
4125  result = TM_Ok;
4126  goto out_unlocked;
4127  }
4128  break;
4129  case LockTupleNoKeyExclusive:
4130  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4131  {
4132  result = TM_Ok;
4133  goto out_unlocked;
4134  }
4135  break;
4136  case LockTupleExclusive:
4137  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4138  infomask2 & HEAP_KEYS_UPDATED)
4139  {
4140  result = TM_Ok;
4141  goto out_unlocked;
4142  }
4143  break;
4144  }
4145  }
4146  }
4147 
4148  /*
4149  * Initially assume that we will have to wait for the locking
4150  * transaction(s) to finish. We check various cases below in which
4151  * this can be turned off.
4152  */
4153  require_sleep = true;
4154  if (mode == LockTupleKeyShare)
4155  {
4156  /*
4157  * If we're requesting KeyShare, and there's no update present, we
4158  * don't need to wait. Even if there is an update, we can still
4159  * continue if the key hasn't been modified.
4160  *
4161  * However, if there are updates, we need to walk the update chain
4162  * to mark future versions of the row as locked, too. That way,
4163  * if somebody deletes that future version, we're protected
4164  * against the key going away. This locking of future versions
4165  * could block momentarily, if a concurrent transaction is
4166  * deleting a key; or it could return a value to the effect that
4167  * the transaction deleting the key has already committed. So we
4168  * do this before re-locking the buffer; otherwise this would be
4169  * prone to deadlocks.
4170  *
4171  * Note that the TID we're locking was grabbed before we unlocked
4172  * the buffer. For it to change while we're not looking, the
4173  * other properties we're testing for below after re-locking the
4174  * buffer would also change, in which case we would restart this
4175  * loop above.
4176  */
4177  if (!(infomask2 & HEAP_KEYS_UPDATED))
4178  {
4179  bool updated;
4180 
4181  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4182 
4183  /*
4184  * If there are updates, follow the update chain; bail out if
4185  * that cannot be done.
4186  */
4187  if (follow_updates && updated)
4188  {
4189  TM_Result res;
4190 
4191  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4192  GetCurrentTransactionId(),
4193  mode);
4194  if (res != TM_Ok)
4195  {
4196  result = res;
4197  /* recovery code expects to have buffer lock held */
4198  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4199  goto failed;
4200  }
4201  }
4202 
4203  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4204 
4205  /*
4206  * Make sure it's still an appropriate lock, else start over.
4207  * Also, if it wasn't updated before we released the lock, but
4208  * is updated now, we start over too; the reason is that we
4209  * now need to follow the update chain to lock the new
4210  * versions.
4211  */
4212  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4213  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4214  !updated))
4215  goto l3;
4216 
4217  /* Things look okay, so we can skip sleeping */
4218  require_sleep = false;
4219 
4220  /*
4221  * Note we allow Xmax to change here; other updaters/lockers
4222  * could have modified it before we grabbed the buffer lock.
4223  * However, this is not a problem, because with the recheck we
4224  * just did we ensure that they still don't conflict with the
4225  * lock we want.
4226  */
4227  }
4228  }
4229  else if (mode == LockTupleShare)
4230  {
4231  /*
4232  * If we're requesting Share, we can similarly avoid sleeping if
4233  * there's no update and no exclusive lock present.
4234  */
4235  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4236  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4237  {
4238  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4239 
4240  /*
4241  * Make sure it's still an appropriate lock, else start over.
4242  * See above about allowing xmax to change.
4243  */
4244  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4245  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4246  goto l3;
4247  require_sleep = false;
4248  }
4249  }
4250  else if (mode == LockTupleNoKeyExclusive)
4251  {
4252  /*
4253  * If we're requesting NoKeyExclusive, we might also be able to
4254  * avoid sleeping; just ensure that there is no conflicting lock
4255  * already acquired.
4256  */
4257  if (infomask & HEAP_XMAX_IS_MULTI)
4258  {
4259  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4260  mode, NULL))
4261  {
4262  /*
4263  * No conflict, but if the xmax changed under us in the
4264  * meantime, start over.
4265  */
4266  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4267  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4268  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4269  xwait))
4270  goto l3;
4271 
4272  /* otherwise, we're good */
4273  require_sleep = false;
4274  }
4275  }
4276  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4277  {
4278  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4279 
4280  /* if the xmax changed in the meantime, start over */
4281  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4282  !TransactionIdEquals(
4283  HeapTupleHeaderGetRawXmax(tuple->t_data),
4284  xwait))
4285  goto l3;
4286  /* otherwise, we're good */
4287  require_sleep = false;
4288  }
4289  }
4290 
4291  /*
4292  * As a check independent from those above, we can also avoid sleeping
4293  * if the current transaction is the sole locker of the tuple. Note
4294  * that the strength of the lock already held is irrelevant; this is
4295  * not about recording the lock in Xmax (which will be done regardless
4296  * of this optimization, below). Also, note that the cases where we
4297  * hold a lock stronger than we are requesting are already handled
4298  * above by not doing anything.
4299  *
4300  * Note we only deal with the non-multixact case here; MultiXactIdWait
4301  * is well equipped to deal with this situation on its own.
4302  */
4303  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4304  TransactionIdIsCurrentTransactionId(xwait))
4305  {
4306  /* ... but if the xmax changed in the meantime, start over */
4307  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4308  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4309  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4310  xwait))
4311  goto l3;
4313  require_sleep = false;
4314  }
4315 
4316  /*
4317  * Time to sleep on the other transaction/multixact, if necessary.
4318  *
4319  * If the other transaction is an update/delete that's already
4320  * committed, then sleeping cannot possibly do any good: if we're
4321  * required to sleep, get out to raise an error instead.
4322  *
4323  * By here, we either have already acquired the buffer exclusive lock,
4324  * or we must wait for the locking transaction or multixact; so below
4325  * we ensure that we grab buffer lock after the sleep.
4326  */
4327  if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4328  {
4329  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4330  goto failed;
4331  }
4332  else if (require_sleep)
4333  {
4334  /*
4335  * Acquire tuple lock to establish our priority for the tuple, or
4336  * die trying. LockTuple will release us when we are next-in-line
4337  * for the tuple. We must do this even if we are share-locking,
4338  * but not if we already have a weaker lock on the tuple.
4339  *
4340  * If we are forced to "start over" below, we keep the tuple lock;
4341  * this arranges that we stay at the head of the line while
4342  * rechecking tuple state.
4343  */
4344  if (!skip_tuple_lock &&
4345  !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4346  &have_tuple_lock))
4347  {
4348  /*
4349  * This can only happen if wait_policy is Skip and the lock
4350  * couldn't be obtained.
4351  */
4352  result = TM_WouldBlock;
4353  /* recovery code expects to have buffer lock held */
4354  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4355  goto failed;
4356  }
4357 
4358  if (infomask & HEAP_XMAX_IS_MULTI)
4359  {
4360  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4361 
4362  /* We only ever lock tuples, never update them */
4363  if (status >= MultiXactStatusNoKeyUpdate)
4364  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4365 
4366  /* wait for multixact to end, or die trying */
4367  switch (wait_policy)
4368  {
4369  case LockWaitBlock:
4370  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4371  relation, &tuple->t_self, XLTW_Lock, NULL);
4372  break;
4373  case LockWaitSkip:
4374  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4375  status, infomask, relation,
4376  NULL))
4377  {
4378  result = TM_WouldBlock;
4379  /* recovery code expects to have buffer lock held */
4380  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4381  goto failed;
4382  }
4383  break;
4384  case LockWaitError:
4385  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4386  status, infomask, relation,
4387  NULL))
4388  ereport(ERROR,
4389  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4390  errmsg("could not obtain lock on row in relation \"%s\"",
4391  RelationGetRelationName(relation))));
4392 
4393  break;
4394  }
4395 
4396  /*
4397  * Of course, the multixact might not be done here: if we're
4398  * requesting a light lock mode, other transactions with light
4399  * locks could still be alive, as well as locks owned by our
4400  * own xact or other subxacts of this backend. We need to
4401  * preserve the surviving MultiXact members. Note that it
4402  * isn't absolutely necessary in the latter case, but doing so
4403  * is simpler.
4404  */
4405  }
4406  else
4407  {
4408  /* wait for regular transaction to end, or die trying */
4409  switch (wait_policy)
4410  {
4411  case LockWaitBlock:
4412  XactLockTableWait(xwait, relation, &tuple->t_self,
4413  XLTW_Lock);
4414  break;
4415  case LockWaitSkip:
4416  if (!ConditionalXactLockTableWait(xwait))
4417  {
4418  result = TM_WouldBlock;
4419  /* recovery code expects to have buffer lock held */
4420  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4421  goto failed;
4422  }
4423  break;
4424  case LockWaitError:
4425  if (!ConditionalXactLockTableWait(xwait))
4426  ereport(ERROR,
4427  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4428  errmsg("could not obtain lock on row in relation \"%s\"",
4429  RelationGetRelationName(relation))));
4430  break;
4431  }
4432  }
4433 
4434  /* if there are updates, follow the update chain */
4435  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4436  {
4437  TM_Result res;
4438 
4439  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4440  GetCurrentTransactionId(),
4441  mode);
4442  if (res != TM_Ok)
4443  {
4444  result = res;
4445  /* recovery code expects to have buffer lock held */
4446  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4447  goto failed;
4448  }
4449  }
4450 
4451  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4452 
4453  /*
4454  * xwait is done, but if xwait had just locked the tuple then some
4455  * other xact could update this tuple before we get to this point.
4456  * Check for xmax change, and start over if so.
4457  */
4458  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4459  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4460  xwait))
4461  goto l3;
4462 
4463  if (!(infomask & HEAP_XMAX_IS_MULTI))
4464  {
4465  /*
4466  * Otherwise check if it committed or aborted. Note we cannot
4467  * be here if the tuple was only locked by somebody who didn't
4468  * conflict with us; that would have been handled above. So
4469  * that transaction must necessarily be gone by now. But
4470  * don't check for this in the multixact case, because some
4471  * locker transactions might still be running.
4472  */
4473  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4474  }
4475  }
4476 
4477  /* By here, we're certain that we hold buffer exclusive lock again */
4478 
4479  /*
4480  * We may lock if previous xmax aborted, or if it committed but only
4481  * locked the tuple without updating it; or if we didn't have to wait
4482  * at all for whatever reason.
4483  */
4484  if (!require_sleep ||
4485  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4486  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4487  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4488  result = TM_Ok;
4489  else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid) ||
4490  HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data))
4491  result = TM_Updated;
4492  else
4493  result = TM_Deleted;
4494  }
4495 
4496 failed:
4497  if (result != TM_Ok)
4498  {
4499  Assert(result == TM_SelfModified || result == TM_Updated ||
4500  result == TM_Deleted || result == TM_WouldBlock);
4501  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4502  Assert(result != TM_Updated ||
4503  !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4504  tmfd->ctid = tuple->t_data->t_ctid;
4505  tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4506  if (result == TM_SelfModified)
4507  tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4508  else
4509  tmfd->cmax = InvalidCommandId;
4510  goto out_locked;
4511  }
4512 
4513  /*
4514  * If we didn't pin the visibility map page and the page has become all
4515  * visible while we were busy locking the buffer, or during some
4516  * subsequent window during which we had it unlocked, we'll have to unlock
4517  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4518  * unfortunate, especially since we'll now have to recheck whether the
4519  * tuple has been locked or updated under us, but hopefully it won't
4520  * happen very often.
4521  */
4522  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4523  {
4524  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4525  visibilitymap_pin(relation, block, &vmbuffer);
4526  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4527  goto l3;
4528  }
4529 
4530  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4531  old_infomask = tuple->t_data->t_infomask;
4532 
4533  /*
4534  * If this is the first possibly-multixact-able operation in the current
4535  * transaction, set my per-backend OldestMemberMXactId setting. We can be
4536  * certain that the transaction will never become a member of any older
4537  * MultiXactIds than that. (We have to do this even if we end up just
4538  * using our own TransactionId below, since some other backend could
4539  * incorporate our XID into a MultiXact immediately afterwards.)
4540  */
4541  MultiXactIdSetOldestMember();
4542 
4543  /*
4544  * Compute the new xmax and infomask to store into the tuple. Note we do
4545  * not modify the tuple just yet, because that would leave it in the wrong
4546  * state if multixact.c elogs.
4547  */
4548  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4549  GetCurrentTransactionId(), mode, false,
4550  &xid, &new_infomask, &new_infomask2);
4551 
4552  START_CRIT_SECTION();
4553 
4554  /*
4555  * Store transaction information of xact locking the tuple.
4556  *
4557  * Note: Cmax is meaningless in this context, so don't set it; this avoids
4558  * possibly generating a useless combo CID. Moreover, if we're locking a
4559  * previously updated tuple, it's important to preserve the Cmax.
4560  *
4561  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4562  * we would break the HOT chain.
4563  */
4564  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4565  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4566  tuple->t_data->t_infomask |= new_infomask;
4567  tuple->t_data->t_infomask2 |= new_infomask2;
4568  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4569  HeapTupleHeaderClearHotUpdated(tuple->t_data);
4570  HeapTupleHeaderSetXmax(tuple->t_data, xid);
4571 
4572  /*
4573  * Make sure there is no forward chain link in t_ctid. Note that in the
4574  * cases where the tuple has been updated, we must not overwrite t_ctid,
4575  * because it was set by the updater. Moreover, if the tuple has been
4576  * updated, we need to follow the update chain to lock the new versions of
4577  * the tuple as well.
4578  */
4579  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4580  tuple->t_data->t_ctid = *tid;
4581 
4582  /* Clear only the all-frozen bit on visibility map if needed */
4583  if (PageIsAllVisible(page) &&
4584  visibilitymap_clear(relation, block, vmbuffer,
4585  VISIBILITYMAP_ALL_FROZEN))
4586  cleared_all_frozen = true;
4587 
4588 
4589  MarkBufferDirty(*buffer);
4590 
4591  /*
4592  * XLOG stuff. You might think that we don't need an XLOG record because
4593  * there is no state change worth restoring after a crash. You would be
4594  * wrong however: we have just written either a TransactionId or a
4595  * MultiXactId that may never have been seen on disk before, and we need
4596  * to make sure that there are XLOG entries covering those ID numbers.
4597  * Else the same IDs might be re-used after a crash, which would be
4598  * disastrous if this page made it to disk before the crash. Essentially
4599  * we have to enforce the WAL log-before-data rule even in this case.
4600  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4601  * entries for everything anyway.)
4602  */
4603  if (RelationNeedsWAL(relation))
4604  {
4605  xl_heap_lock xlrec;
4606  XLogRecPtr recptr;
4607 
4608  XLogBeginInsert();
4609  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4610 
4611  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4612  xlrec.locking_xid = xid;
4613  xlrec.infobits_set = compute_infobits(new_infomask,
4614  tuple->t_data->t_infomask2);
4615  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4616  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4617 
4618  /* we don't decode row locks atm, so no need to log the origin */
4619 
4620  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4621 
4622  PageSetLSN(page, recptr);
4623  }
4624 
4625  END_CRIT_SECTION();
4626 
4627  result = TM_Ok;
4628 
4629 out_locked:
4630  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4631 
4632 out_unlocked:
4633  if (BufferIsValid(vmbuffer))
4634  ReleaseBuffer(vmbuffer);
4635 
4636  /*
4637  * Don't update the visibility map here. Locking a tuple doesn't change
4638  * visibility info.
4639  */
4640 
4641  /*
4642  * Now that we have successfully marked the tuple as locked, we can
4643  * release the lmgr tuple lock, if we had it.
4644  */
4645  if (have_tuple_lock)
4646  UnlockTupleTuplock(relation, tid, mode);
4647 
4648  return result;
4649 }
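/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * A minimal, hypothetical caller of heap_lock_tuple(), roughly what the
 * table AM wrapper does: the caller supplies the TID in tuple->t_self and
 * must release the buffer pin that comes back.  "rel" and "tid" are assumed
 * inputs, and the relation is assumed to be already locked appropriately.
 */
#ifdef NOT_USED
static TM_Result
example_lock_one_row(Relation rel, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	TM_FailureData tmfd;
	TM_Result	result;

	tuple.t_self = *tid;
	result = heap_lock_tuple(rel, &tuple,
							 GetCurrentCommandId(true),
							 LockTupleExclusive, LockWaitBlock,
							 false /* don't follow the update chain */ ,
							 &buffer, &tmfd);

	/* the tuple's buffer comes back pinned but not locked; drop the pin */
	ReleaseBuffer(buffer);

	return result;
}
#endif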
4650 
4651 /*
4652  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4653  * its normal, Xmax-based tuple lock.
4654  *
4655  * have_tuple_lock is an input and output parameter: on input, it indicates
4656  * whether the lock has previously been acquired (and this function does
4657  * nothing in that case). If this function returns success, have_tuple_lock
4658  * has been flipped to true.
4659  *
4660  * Returns false if it was unable to obtain the lock; this can only happen if
4661  * wait_policy is Skip.
4662  */
4663 static bool
4664  heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4665  LockWaitPolicy wait_policy, bool *have_tuple_lock)
4666 {
4667  if (*have_tuple_lock)
4668  return true;
4669 
4670  switch (wait_policy)
4671  {
4672  case LockWaitBlock:
4673  LockTupleTuplock(relation, tid, mode);
4674  break;
4675 
4676  case LockWaitSkip:
4677  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4678  return false;
4679  break;
4680 
4681  case LockWaitError:
4682  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4683  ereport(ERROR,
4684  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4685  errmsg("could not obtain lock on row in relation \"%s\"",
4686  RelationGetRelationName(relation))));
4687  break;
4688  }
4689  *have_tuple_lock = true;
4690 
4691  return true;
4692 }
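/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * The calling pattern used by heap_lock_tuple above: the flag starts out
 * false, heap_acquire_tuplock() flips it on success, and the caller releases
 * the heavyweight lock once the tuple's Xmax has been set.  Parameter values
 * here are illustrative only.
 */
#ifdef NOT_USED
static void
example_tuplock_pattern(Relation rel, ItemPointer tid, LockTupleMode mode)
{
	bool		have_tuple_lock = false;

	if (!heap_acquire_tuplock(rel, tid, mode, LockWaitSkip,
							  &have_tuple_lock))
		return;					/* LockWaitSkip: give up without waiting */

	/* ... update the tuple's Xmax while holding the buffer lock ... */

	if (have_tuple_lock)
		UnlockTupleTuplock(rel, tid, mode);
}
#endif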
4693 
4694 /*
4695  * Given an original set of Xmax and infomask, and a transaction (identified by
4696  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4697  * corresponding infomasks to use on the tuple.
4698  *
4699  * Note that this might have side effects such as creating a new MultiXactId.
4700  *
4701  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4702  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4703  * but it was not running anymore. There is a race condition, which is that the
4704  * MultiXactId may have finished since then, but that uncommon case is handled
4705  * either here, or within MultiXactIdExpand.
4706  *
4707  * There is a similar race condition possible when the old xmax was a regular
4708  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4709  * window, but it's still possible to end up creating an unnecessary
4710  * MultiXactId. Fortunately this is harmless.
4711  */
4712 static void
4713  compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4714  uint16 old_infomask2, TransactionId add_to_xmax,
4715  LockTupleMode mode, bool is_update,
4716  TransactionId *result_xmax, uint16 *result_infomask,
4717  uint16 *result_infomask2)
4718 {
4719  TransactionId new_xmax;
4720  uint16 new_infomask,
4721  new_infomask2;
4722 
4723  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax) || is_update);
4724 
4725 l5:
4726  new_infomask = 0;
4727  new_infomask2 = 0;
4728  if (old_infomask & HEAP_XMAX_INVALID)
4729  {
4730  /*
4731  * No previous locker; we just insert our own TransactionId.
4732  *
4733  * Note that it's critical that this case be the first one checked,
4734  * because there are several blocks below that come back to this one
4735  * to implement certain optimizations; old_infomask might contain
4736  * other dirty bits in those cases, but we don't really care.
4737  */
4738  if (is_update)
4739  {
4740  new_xmax = add_to_xmax;
4741  if (mode == LockTupleExclusive)
4742  new_infomask2 |= HEAP_KEYS_UPDATED;
4743  }
4744  else
4745  {
4746  new_infomask |= HEAP_XMAX_LOCK_ONLY;
4747  switch (mode)
4748  {
4749  case LockTupleKeyShare:
4750  new_xmax = add_to_xmax;
4751  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4752  break;
4753  case LockTupleShare:
4754  new_xmax = add_to_xmax;
4755  new_infomask |= HEAP_XMAX_SHR_LOCK;
4756  break;
4757  case LockTupleNoKeyExclusive:
4758  new_xmax = add_to_xmax;
4759  new_infomask |= HEAP_XMAX_EXCL_LOCK;
4760  break;
4761  case LockTupleExclusive:
4762  new_xmax = add_to_xmax;
4763  new_infomask |= HEAP_XMAX_EXCL_LOCK;
4764  new_infomask2 |= HEAP_KEYS_UPDATED;
4765  break;
4766  default:
4767  new_xmax = InvalidTransactionId; /* silence compiler */
4768  elog(ERROR, "invalid lock mode");
4769  }
4770  }
4771  }
4772  else if (old_infomask & HEAP_XMAX_IS_MULTI)
4773  {
4774  MultiXactStatus new_status;
4775 
4776  /*
4777  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4778  * cross-check.
4779  */
4780  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4781 
4782  /*
4783  * A multixact together with LOCK_ONLY set but neither lock bit set
4784  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4785  * anymore. This check is critical for databases upgraded by
4786  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4787  * that such multis are never passed.
4788  */
4789  if (HEAP_LOCKED_UPGRADED(old_infomask))
4790  {
4791  old_infomask &= ~HEAP_XMAX_IS_MULTI;
4792  old_infomask |= HEAP_XMAX_INVALID;
4793  goto l5;
4794  }
4795 
4796  /*
4797  * If the XMAX is already a MultiXactId, then we need to expand it to
4798  * include add_to_xmax; but if all the members were lockers and are
4799  * all gone, we can do away with the IS_MULTI bit and just set
4800  * add_to_xmax as the only locker/updater. If all lockers are gone
4801  * and we have an updater that aborted, we can also do without a
4802  * multi.
4803  *
4804  * The cost of doing GetMultiXactIdMembers would be paid by
4805  * MultiXactIdExpand if we weren't to do this, so this check is not
4806  * incurring extra work anyhow.
4807  */
4808  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4809  {
4810  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4811  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
4812  old_infomask)))
4813  {
4814  /*
4815  * Reset these bits and restart; otherwise fall through to
4816  * create a new multi below.
4817  */
4818  old_infomask &= ~HEAP_XMAX_IS_MULTI;
4819  old_infomask |= HEAP_XMAX_INVALID;
4820  goto l5;
4821  }
4822  }
4823 
4824  new_status = get_mxact_status_for_lock(mode, is_update);
4825 
4826  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
4827  new_status);
4828  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4829  }
4830  else if (old_infomask & HEAP_XMAX_COMMITTED)
4831  {
4832  /*
4833  * It's a committed update, so we need to preserve it as the updater of
4834  * the tuple.
4835  */
4836  MultiXactStatus status;
4837  MultiXactStatus new_status;
4838 
4839  if (old_infomask2 & HEAP_KEYS_UPDATED)
4840  status = MultiXactStatusUpdate;
4841  else
4842  status = MultiXactStatusNoKeyUpdate;
4843 
4844  new_status = get_mxact_status_for_lock(mode, is_update);
4845 
4846  /*
4847  * since it's not running, it's obviously impossible for the old
4848  * updater to be identical to the current one, so we need not check
4849  * for that case as we do in the block above.
4850  */
4851  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4852  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4853  }
4854  else if (TransactionIdIsInProgress(xmax))
4855  {
4856  /*
4857  * If the XMAX is a valid, in-progress TransactionId, then we need to
4858  * create a new MultiXactId that includes both the old locker or
4859  * updater and our own TransactionId.
4860  */
4861  MultiXactStatus new_status;
4862  MultiXactStatus old_status;
4863  LockTupleMode old_mode;
4864 
4865  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
4866  {
4867  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
4868  old_status = MultiXactStatusForKeyShare;
4869  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
4870  old_status = MultiXactStatusForShare;
4871  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
4872  {
4873  if (old_infomask2 & HEAP_KEYS_UPDATED)
4874  old_status = MultiXactStatusForUpdate;
4875  else
4876  old_status = MultiXactStatusForNoKeyUpdate;
4877  }
4878  else
4879  {
4880  /*
4881  * LOCK_ONLY can be present alone only when a page has been
4882  * upgraded by pg_upgrade. But in that case,
4883  * TransactionIdIsInProgress() should have returned false. We
4884  * assume it's no longer locked in this case.
4885  */
4886  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
4887  old_infomask |= HEAP_XMAX_INVALID;
4888  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
4889  goto l5;
4890  }
4891  }
4892  else
4893  {
4894  /* it's an update, but which kind? */
4895  if (old_infomask2 & HEAP_KEYS_UPDATED)
4896  old_status = MultiXactStatusUpdate;
4897  else
4898  old_status = MultiXactStatusNoKeyUpdate;
4899  }
4900 
4901  old_mode = TUPLOCK_from_mxstatus(old_status);
4902 
4903  /*
4904  * If the lock to be acquired is for the same TransactionId as the
4905  * existing lock, there's an optimization possible: consider only the
4906  * strongest of both locks as the only one present, and restart.
4907  */
4908  if (xmax == add_to_xmax)
4909  {
4910  /*
4911  * Note that it's not possible for the original tuple to be
4912  * updated: we wouldn't be here because the tuple would have been
4913  * invisible and we wouldn't try to update it. As a subtlety,
4914  * this code can also run when traversing an update chain to lock
4915  * future versions of a tuple. But we wouldn't be here either,
4916  * because the add_to_xmax would be different from the original
4917  * updater.
4918  */
4919  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
4920 
4921  /* acquire the strongest of both */
4922  if (mode < old_mode)
4923  mode = old_mode;
4924  /* mustn't touch is_update */
4925 
4926  old_infomask |= HEAP_XMAX_INVALID;
4927  goto l5;
4928  }
4929 
4930  /* otherwise, just fall back to creating a new multixact */
4931  new_status = get_mxact_status_for_lock(mode, is_update);
4932  new_xmax = MultiXactIdCreate(xmax, old_status,
4933  add_to_xmax, new_status);
4934  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4935  }
4936  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
4937  TransactionIdDidCommit(xmax))
4938  {
4939  /*
4940  * It's a committed update, so we need to preserve it as the updater of
4941  * the tuple.
4942  */
4943  MultiXactStatus status;
4944  MultiXactStatus new_status;
4945 
4946  if (old_infomask2 & HEAP_KEYS_UPDATED)
4947  status = MultiXactStatusUpdate;
4948  else
4949  status = MultiXactStatusNoKeyUpdate;
4950 
4951  new_status = get_mxact_status_for_lock(mode, is_update);
4952 
4953  /*
4954  * since it's not running, it's obviously impossible for the old
4955  * updater to be identical to the current one, so we need not check
4956  * for that case as we do in the block above.
4957  */
4958  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4959  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4960  }
4961  else
4962  {
4963  /*
4964  * Can get here iff the locking/updating transaction was running when
4965  * the infomask was extracted from the tuple, but finished before
4966  * TransactionIdIsInProgress got to run. Deal with it as if there was
4967  * no locker at all in the first place.
4968  */
4969  old_infomask |= HEAP_XMAX_INVALID;
4970  goto l5;
4971  }
4972 
4973  *result_infomask = new_infomask;
4974  *result_infomask2 = new_infomask2;
4975  *result_xmax = new_xmax;
4976 }
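/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * The call shape used by heap_lock_tuple above: hand in the tuple's current
 * raw Xmax and infomask bits plus our own XID, and get back the Xmax and
 * infomask bits to store.  In the simplest case (HEAP_XMAX_INVALID, plain
 * exclusive row lock) the result is just our XID with the lock-only and
 * exclusive-lock bits set.
 */
#ifdef NOT_USED
static void
example_compute_xmax_for_lock(HeapTupleHeader htup)
{
	TransactionId new_xmax;
	uint16		new_infomask;
	uint16		new_infomask2;

	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(htup),
							  htup->t_infomask, htup->t_infomask2,
							  GetCurrentTransactionId(),
							  LockTupleExclusive, false /* not an update */ ,
							  &new_xmax, &new_infomask, &new_infomask2);
}
#endif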
4977 
4978 /*
4979  * Subroutine for heap_lock_updated_tuple_rec.
4980  *
4981  * Given a hypothetical multixact status held by the transaction identified
4982  * with the given xid, does the current transaction need to wait, fail, or can
4983  * it continue if it wanted to acquire a lock of the given mode? "needwait"
4984  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
4985  * returned. If the lock is already held by the current transaction, return
4986  * TM_SelfModified. In case of a conflict with another transaction, a
4987  * different HeapTupleSatisfiesUpdate return code is returned.
4988  *
4989  * The held status is said to be hypothetical because it might correspond to a
4990  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
4991  * way for simplicity of API.
4992  */
4993 static TM_Result
4994  test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
4995  LockTupleMode mode, HeapTuple tup,
4996  bool *needwait)
4997 {
4998  MultiXactStatus wantedstatus;
4999 
5000  *needwait = false;
5001  wantedstatus = get_mxact_status_for_lock(mode, false);
5002 
5003  /*
5004  * Note: we *must* check TransactionIdIsInProgress before
5005  * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5006  * for an explanation.
5007  */
5008  if (TransactionIdIsCurrentTransactionId(xid))
5009  {
5010  /*
5011  * The tuple has already been locked by our own transaction. This is
5012  * very rare but can happen if multiple transactions are trying to
5013  * lock an ancient version of the same tuple.
5014  */
5015  return TM_SelfModified;
5016  }
5017  else if (TransactionIdIsInProgress(xid))
5018  {
5019  /*
5020  * If the locking transaction is running, what we do depends on
5021  * whether the lock modes conflict: if they do, then we must wait for
5022  * it to finish; otherwise we can fall through to lock this tuple
5023  * version without waiting.
5024  */
5025  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5026  LOCKMODE_from_mxstatus(wantedstatus)))
5027  {
5028  *needwait = true;
5029  }
5030 
5031  /*
5032  * If we set needwait above, then this value doesn't matter;
5033  * otherwise, this value signals to caller that it's okay to proceed.
5034  */
5035  return TM_Ok;
5036  }
5037  else if (TransactionIdDidAbort(xid))
5038  return TM_Ok;
5039  else if (TransactionIdDidCommit(xid))
5040  {
5041  /*
5042  * The other transaction committed. If it was only a locker, then the
5043  * lock is completely gone now and we can return success; but if it
5044  * was an update, then what we do depends on whether the two lock
5045  * modes conflict. If they conflict, then we must report error to
5046  * caller. But if they don't, we can fall through to allow the current
5047  * transaction to lock the tuple.
5048  *
5049  * Note: the reason we worry about ISUPDATE here is because as soon as
5050  * a transaction ends, all its locks are gone and meaningless, and
5051  * thus we can ignore them; whereas its updates persist. In the
5052  * TransactionIdIsInProgress case, above, we don't need to check
5053  * because we know the lock is still "alive" and thus a conflict needs
5054  * always be checked.
5055  */
5056  if (!ISUPDATE_from_mxstatus(status))
5057  return TM_Ok;
5058 
5058 
5059  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5060  LOCKMODE_from_mxstatus(wantedstatus)))
5061  {
5062  /* bummer */
5063  if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid) ||
5064  HeapTupleHeaderIndicatesMovedPartitions(tup->t_data))
5065  return TM_Updated;
5066  else
5067  return TM_Deleted;
5068  }
5069 
5070  return TM_Ok;
5071  }
5072 
5073  /* Not in progress, not aborted, not committed -- must have crashed */
5074  return TM_Ok;
5075 }
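/*
 * [Editor's addition -- illustrative sketch, not part of heapam.c.]
 * The conflict test above reduces tuple lock strengths to regular lock modes
 * and asks DoLockModesConflict().  For instance, a FOR KEY SHARE request is
 * compatible with a no-key update of the same tuple, but conflicts with a
 * key-changing update.
 */
#ifdef NOT_USED
static void
example_tuple_lock_conflicts(void)
{
	Assert(!DoLockModesConflict(LOCKMODE_from_mxstatus(MultiXactStatusForKeyShare),
								LOCKMODE_from_mxstatus(MultiXactStatusNoKeyUpdate)));
	Assert(DoLockModesConflict(LOCKMODE_from_mxstatus(MultiXactStatusForKeyShare),
							   LOCKMODE_from_mxstatus(MultiXactStatusUpdate)));
}
#endif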
5076 
5077 
5078 /*
5079  * Recursive part of heap_lock_updated_tuple
5080  *
5081  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5082  * xid with the given mode; if this tuple is updated, recurse to lock the new
5083  * version as well.
5084  */
5085 static TM_Result
5086  heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5087  LockTupleMode mode)
5088 {
5089  TM_Result result;
5090  ItemPointerData tupid;
5091  HeapTupleData mytup;
5092  Buffer buf;
5093  uint16 new_infomask,
5094  new_infomask2,
5095  old_infomask,
5096  old_infomask2;
5097  TransactionId xmax,
5098  new_xmax;
5099  TransactionId priorXmax = InvalidTransactionId;
5100  bool cleared_all_frozen = false;
5101  bool pinned_desired_page;
5102  Buffer vmbuffer = InvalidBuffer;
5103  BlockNumber block;
5104 
5105  ItemPointerCopy(tid, &tupid);
5106 
5107  for (;;)
5108  {
5109  new_infomask = 0;
5110  new_xmax = InvalidTransactionId;
5111  block = ItemPointerGetBlockNumber(&tupid);
5112  ItemPointerCopy(&tupid, &(mytup.t_self));
5113 
5114  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5115  {
5116  /*
5117  * if we fail to find the updated version of the tuple, it's
5118  * because it was vacuumed/pruned away after its creator
5119  * transaction aborted. So behave as if we got to the end of the
5120  * chain, and there's no further tuple to lock: return success to
5121  * caller.
5122  */
5123  result = TM_Ok;
5124  goto out_unlocked;
5125  }
5126 
5127 l4:
5128  CHECK_FOR_INTERRUPTS();
5129 
5130  /*
5131  * Before locking the buffer, pin the visibility map page if it
5132  * appears to be necessary. Since we haven't got the lock yet,
5133  * someone else might be in the middle of changing this, so we'll need
5134  * to recheck after we have the lock.
5135  */
5136  if (PageIsAllVisible(BufferGetPage(buf)))
5137  {
5138  visibilitymap_pin(rel, block, &vmbuffer);
5139  pinned_desired_page = true;
5140  }
5141  else
5142  pinned_desired_page = false;
5143 
5143 
5144  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5145 
5146  /*
5147  * If we didn't pin the visibility map page and the page has become
5148  * all visible while we were busy locking the buffer, we'll have to
5149  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5150  * That's a bit unfortunate, but hopefully shouldn't happen often.
5151  *
5152  * Note: in some paths through this function, we will reach here
5153  * holding a pin on a vm page that may or may not be the one matching
5154  * this page. If this page isn't all-visible, we won't use the vm
5155  * page, but we hold onto such a pin till the end of the function.
5156  */
5157  if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5158  {
5159  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5160  visibilitymap_pin(rel, block, &vmbuffer);
5161  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5162  }
5163 
5164  /*
5165  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5166  * end of the chain, we're done, so return success.
5167  */
5168  if (TransactionIdIsValid(priorXmax) &&
5169  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5170  priorXmax))
5171  {
5172  result = TM_Ok;
5173  goto out_locked;
5174  }
5175 
5176  /*
5177  * Also check Xmin: if this tuple was created by an aborted
5178  * (sub)transaction, then we already locked the last live one in the
5179  * chain, thus we're done, so return success.
5180  */
5181  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5182  {
5183  result = TM_Ok;
5184  goto out_locked;
5185  }
5186 
5187  old_infomask = mytup.t_data->t_infomask;
5188  old_infomask2 = mytup.t_data->t_infomask2;
5189  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5190 
5191  /*
5192  * If this tuple version has been updated or locked by some concurrent
5193  * transaction(s), what we do depends on whether our lock mode
5194  * conflicts with what those other transactions hold, and also on the
5195  * status of them.
5196  */
5197  if (!(old_infomask & HEAP_XMAX_INVALID))
5198  {
5199  TransactionId rawxmax;
5200  bool needwait;
5201 
5202  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5203  if (old_infomask & HEAP_XMAX_IS_MULTI)
5204  {
5205  int nmembers;
5206  int i;
5207  MultiXactMember *members;
5208 
5209  /*
5210  * We don't need a test for pg_upgrade'd tuples: this is only
5211  * applied to tuples after the first in an update chain. Said
5212  * first tuple in the chain may well be locked-in-9.2-and-
5213  * pg_upgraded, but that one was already locked by our caller,
5214  * not us; and any subsequent ones cannot be because our
5215  * caller must necessarily have obtained a snapshot later than
5216  * the pg_upgrade itself.
5217  */
5218  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5219 
5220  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5221  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5222  for (i = 0; i < nmembers; i++)
5223  {
5224  result = test_lockmode_for_conflict(members[i].status,
5225  members[i].xid,
5226  mode,
5227  &mytup,
5228  &needwait);
5229 
5230  /*
5231  * If the tuple was already locked by ourselves in a
5232  * previous iteration of this (say heap_lock_tuple was
5233  * forced to restart the locking loop because of a change
5234  * in xmax), then we hold the lock already on this tuple
5235  * version and we don't need to do anything; and this is
5236  * not an error condition either. We just need to skip
5237  * this tuple and continue locking the next version in the
5238  * update chain.
5239  */
5240  if (result == TM_SelfModified)
5241  {
5242  pfree(members);
5243  goto next;
5244  }
5245 
5246  if (needwait)
5247  {
5248  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5249  XactLockTableWait(members[i].xid, rel,
5250  &mytup.t_self,
5251  XLTW_LockUpdated);
5252  pfree(members);
5253  goto l4;
5254  }
5255  if (result != TM_Ok)
5256  {
5257  pfree(members);
5258  goto out_locked;
5259  }
5260  }
5261  if (members)
5262  pfree(members);
5263  }
5264  else
5265  {
5266  MultiXactStatus status;
5267 
5268  /*
5269  * For a non-multi Xmax, we first need to compute the
5270  * corresponding MultiXactStatus by using the infomask bits.
5271  */
5272  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5273  {
5274  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5275  status = MultiXactStatusForKeyShare;
5276  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5277  status = MultiXactStatusForShare;
5278  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5279  {
5280  if (old_infomask2 & HEAP_KEYS_UPDATED)
5281  status = MultiXactStatusForUpdate;
5282  else
5283  status = MultiXactStatusForNoKeyUpdate;
5284  }
5285  else
5286  {
5287  /*
5288  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5289  * as share-locked in the old cluster) shouldn't be
5290  * seen in the middle of an update chain.
5291  */
5292  elog(ERROR, "invalid lock status in tuple");
5293  }
5294  }
5295  else
5296  {
5297  /* it's an update, but which kind? */
5298  if (old_infomask2 & HEAP_KEYS_UPDATED)
5299  status = MultiXactStatusUpdate;
5300  else
5301  status = MultiXactStatusNoKeyUpdate;
5302  }
5303 
5304  result = test_lockmode_for_conflict(status, rawxmax, mode,
5305  &mytup, &needwait);
5306 
5307  /*
5308  * If the tuple was already locked by ourselves in a previous
5309  * iteration of this (say heap_lock_tuple was forced to
5310  * restart the locking loop because of a change in xmax), then
5311  * we hold the lock already on this tuple version and we don't
5312  * need to do anything; and this is not an error condition
5313  * either. We just need to skip this tuple and continue
5314  * locking the next version in the update chain.
5315  */
5316  if (result == TM_SelfModified)
5317  goto next;
5318 
5319  if (needwait)
5320  {
5321  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5322  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5323  XLTW_LockUpdated);
5324  goto l4;
5325  }
5326  if (result != TM_Ok)
5327  {
5328  goto out_locked;
5329  }
5330  }
5331  }
5332 
5333  /* compute the new Xmax and infomask values for the tuple ... */
5334  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5335  xid, mode, false,
5336  &new_xmax, &new_infomask, &new_infomask2);
5337 
5338  if (PageIsAllVisible(BufferGetPage(buf)) &&
5339  visibilitymap_clear(rel, block, vmbuffer,
5341  cleared_all_frozen = true;
5342 
5343  START_CRIT_SECTION();
5344 
5345  /* ... and set them */
5346  HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5347  mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5348  mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5349  mytup.t_data->t_infomask |= new_infomask;
5350  mytup.t_data->t_infomask2 |= new_infomask2;
5351 
5352  MarkBufferDirty(buf);
5353 
5354  /* XLOG stuff */
5355  if (RelationNeedsWAL(rel))
5356  {
5357  xl_heap_lock_updated xlrec;
5358  XLogRecPtr recptr;
5359  Page page = BufferGetPage(buf);
5360 
5361  XLogBeginInsert();
5362  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5363 
5364  xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5365  xlrec.xmax = new_xmax;
5366  xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5367  xlrec.flags =
5368  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5369 
5370  XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5371 
5372  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5373 
5374  PageSetLSN(page, recptr);
5375  }
5376 
5377  END_CRIT_SECTION();
5378 
5379 next:
5380  /* if we find the end of update chain, we're done. */
5381  if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5382  HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5383  ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5384  HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5385  {
5386  result = TM_Ok;
5387  goto out_locked;
5388  }
5389 
5390  /* tail recursion */
5391  priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5392  ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5393  UnlockReleaseBuffer(buf);
5394  }
5395 
5396  result = TM_Ok;
5397 
5398 out_locked:
5399  UnlockReleaseBuffer(buf);
5400 
5401 out_unlocked:
5402  if (vmbuffer != InvalidBuffer)
5403  ReleaseBuffer(vmbuffer);
5404 
5405  return result;
5406 }
5407 
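/*
 * A minimal, hedged restatement of the infomask-to-MultiXactStatus mapping
 * used in the non-multi branch of heap_lock_updated_tuple_rec() above, as a
 * small pure helper.  heapam.c itself defines no such helper; this is only an
 * illustration of the decision table, relying on macros already available
 * through this file's includes.  The real code also errors out when none of
 * the lock-strength bits match a LOCK_ONLY xmax; that case is glossed over
 * here with a comment.
 */
static MultiXactStatus
xmax_status_sketch(uint16 infomask, uint16 infomask2)
{
	if (HEAP_XMAX_IS_LOCKED_ONLY(infomask))
	{
		/* locker only: strength comes from the lock-mode bits */
		if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
			return MultiXactStatusForKeyShare;
		if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
			return MultiXactStatusForShare;

		/* the real code verifies HEAP_XMAX_IS_EXCL_LOCKED and errors otherwise */
		return (infomask2 & HEAP_KEYS_UPDATED) ?
			MultiXactStatusForUpdate : MultiXactStatusForNoKeyUpdate;
	}

	/* a real update: split on whether key columns were changed */
	return (infomask2 & HEAP_KEYS_UPDATED) ?
		MultiXactStatusUpdate : MultiXactStatusNoKeyUpdate;
}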
5408 /*
5409  * heap_lock_updated_tuple
5410  * Follow update chain when locking an updated tuple, acquiring locks (row
5411  * marks) on the updated versions.
5412  *
5413  * The initial tuple is assumed to be already locked.
5414  *
5415  * This function doesn't check visibility, it just unconditionally marks the
5416  * tuple(s) as locked. If any tuple in the updated chain is being deleted
5417  * concurrently (or updated with the key being modified), sleep until the
5418  * transaction doing it is finished.
5419  *
5420  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5421  * when we have to wait for other transactions to release them, as opposed to
5422  * what heap_lock_tuple does. The reason is that having more than one
5423  * transaction walking the chain is probably uncommon enough that risk of
5424  * starvation is not likely: one of the preconditions for being here is that
5425  * the snapshot in use predates the update that created this tuple (because we
5426  * started at an earlier version of the tuple), but at the same time such a
5427  * transaction cannot be using repeatable read or serializable isolation
5428  * levels, because that would lead to a serializability failure.
5429  */
5430 static TM_Result
5431 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5432  TransactionId xid, LockTupleMode mode)
5433 {
5434  /*
5435  * If the tuple has not been updated, or has moved into another partition
5436  * (effectively a delete) stop here.
5437  */
5438  if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5439  !ItemPointerEquals(&tuple->t_self, ctid))
5440  {
5441  /*
5442  * If this is the first possibly-multixact-able operation in the
5443  * current transaction, set my per-backend OldestMemberMXactId
5444  * setting. We can be certain that the transaction will never become a
5445  * member of any older MultiXactIds than that. (We have to do this
5446  * even if we end up just using our own TransactionId below, since
5447  * some other backend could incorporate our XID into a MultiXact
5448  * immediately afterwards.)
5449  */
5450  MultiXactIdSetOldestMember();
5451 
5452  return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5453  }
5454 
5455  /* nothing to lock */
5456  return TM_Ok;
5457 }
5458 
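/*
 * For context, a hedged sketch of how the update-chain locking above is
 * driven from heap_lock_tuple(): once the tuple being locked turns out to
 * have been updated, every later version reachable through t_ctid must be
 * locked as well before the row lock can be considered acquired.  The
 * parameters below are assumed locals; the real call site differs in detail
 * (buffer handling, follow_updates flag, error recovery).
 */
static TM_Result
lock_rest_of_chain_sketch(Relation relation, HeapTuple tuple, LockTupleMode mode)
{
	/* lock every later version reachable through the t_ctid chain */
	return heap_lock_updated_tuple(relation, tuple, &tuple->t_data->t_ctid,
								   GetCurrentTransactionId(), mode);
}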
5459 /*
5460  * heap_finish_speculative - mark speculative insertion as successful
5461  *
5462  * To successfully finish a speculative insertion we have to clear the speculative
5463  * token from the tuple. To do so, the t_ctid field, which will contain a
5464  * speculative token value, is modified in place to point to the tuple itself,
5465  * which is characteristic of a newly inserted ordinary tuple.
5466  *
5467  * NB: It is not ok to commit without either finishing or aborting a
5468  * speculative insertion. We could treat speculative tuples of committed
5469  * transactions implicitly as completed, but then we would have to be prepared
5470  * to deal with speculative tokens on committed tuples. That wouldn't be
5471  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5472  * but clearing the token at completion isn't very expensive either.
5473  * An explicit confirmation WAL record also makes logical decoding simpler.
5474  */
5475 void
5476 heap_finish_speculative(Relation relation, ItemPointer tid)
5477 {
5478  Buffer buffer;
5479  Page page;
5480  OffsetNumber offnum;
5481  ItemId lp = NULL;
5482  HeapTupleHeader htup;
5483 
5484  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5485  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5486  page = (Page) BufferGetPage(buffer);
5487 
5488  offnum = ItemPointerGetOffsetNumber(tid);
5489  if (PageGetMaxOffsetNumber(page) >= offnum)
5490  lp = PageGetItemId(page, offnum);
5491 
5492  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5493  elog(ERROR, "invalid lp");
5494 
5495  htup = (HeapTupleHeader) PageGetItem(page, lp);
5496 
5497  /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5498  StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5499  "invalid speculative token constant");
5500 
5501  /* NO EREPORT(ERROR) from here till changes are logged */
5502  START_CRIT_SECTION();
5503 
5504  Assert(HeapTupleHeaderIsSpeculative(htup));
5505 
5506  MarkBufferDirty(buffer);
5507 
5508  /*
5509  * Replace the speculative insertion token with a real t_ctid, pointing to
5510  * itself like it does on regular tuples.
5511  */
5512  htup->t_ctid = *tid;
5513 
5514  /* XLOG stuff */
5515  if (RelationNeedsWAL(relation))
5516  {
5517  xl_heap_confirm xlrec;
5518  XLogRecPtr recptr;
5519 
5520  xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5521 
5522  XLogBeginInsert();
5523 
5524  /* We want the same filtering on this as on a plain insert */
5525  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5526 
5527  XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5528  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5529 
5530  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5531 
5532  PageSetLSN(page, recptr);
5533  }
5534 
5535  END_CRIT_SECTION();
5536 
5537  UnlockReleaseBuffer(buffer);
5538 }
5539 
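/*
 * A hedged, simplified sketch of the full speculative-insertion protocol that
 * heap_finish_speculative() above and heap_abort_speculative() below take
 * part in.  The real driver is the executor's ON CONFLICT path working
 * through the table AM; "conflict_found" stands in for the outcome of the
 * speculative index insertions, and all error handling is omitted.
 */
static void
speculative_insert_sketch(Relation relation, HeapTuple tup, bool conflict_found)
{
	uint32		specToken;

	/* take the token other backends will wait on instead of our whole xact */
	specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());

	/* stash the token in t_ctid and insert the not-yet-confirmed tuple */
	HeapTupleHeaderSetSpeculativeToken(tup->t_data, specToken);
	heap_insert(relation, tup, GetCurrentCommandId(true),
				HEAP_INSERT_SPECULATIVE, NULL);

	if (!conflict_found)
		heap_finish_speculative(relation, &tup->t_self);	/* t_ctid -> itself */
	else
		heap_abort_speculative(relation, &tup->t_self);		/* xmin -> invalid */

	/* wake up anyone who blocked on the speculative token */
	SpeculativeInsertionLockRelease(GetCurrentTransactionId());
}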
5540 /*
5541  * heap_abort_speculative - kill a speculatively inserted tuple
5542  *
5543  * Marks a tuple that was speculatively inserted in the same command as dead,
5544  * by setting its xmin as invalid. That makes it immediately appear as dead
5545  * to all transactions, including our own. In particular, it makes
5546  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5547  * inserting a duplicate key value won't unnecessarily wait for our whole
5548  * transaction to finish (it'll just wait for our speculative insertion to
5549  * finish).
5550  *
5551  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5552  * that arise due to a mutual dependency that is not user visible. By
5553  * definition, unprincipled deadlocks cannot be prevented by the user
5554  * reordering lock acquisition in client code, because the implementation level
5555  * lock acquisitions are not under the user's direct control. If speculative
5556  * inserters did not take this precaution, then under high concurrency they
5557  * could deadlock with each other, which would not be acceptable.
5558  *
5559  * This is somewhat redundant with heap_delete, but we prefer to have a
5560  * dedicated routine with stripped down requirements. Note that this is also
5561  * used to delete the TOAST tuples created during speculative insertion.
5562  *
5563  * This routine does not affect logical decoding as it only looks at
5564  * confirmation records.
5565  */
5566 void
5567 heap_abort_speculative(Relation relation, ItemPointer tid)
5568 {
5569  TransactionId xid = GetCurrentTransactionId();
5570  ItemId lp;
5571  HeapTupleData tp;
5572  Page page;
5573  BlockNumber block;
5574  Buffer buffer;
5575 
5576  Assert(ItemPointerIsValid(tid));
5577 
5578  block = ItemPointerGetBlockNumber(tid);
5579  buffer = ReadBuffer(relation, block);
5580  page = BufferGetPage(buffer);
5581 
5582  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5583 
5584  /*
5585  * Page can't be all visible, we just inserted into it, and are still
5586  * running.
5587  */
5588  Assert(!PageIsAllVisible(page));
5589 
5590  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5591  Assert(ItemIdIsNormal(lp));
5592 
5593  tp.t_tableOid = RelationGetRelid(relation);
5594  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5595  tp.t_len = ItemIdGetLength(lp);
5596  tp.t_self = *tid;
5597 
5598  /*
5599  * Sanity check that the tuple really is a speculatively inserted tuple,
5600  * inserted by us.
5601  */
5602  if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5603  elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5604  if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5605  elog(ERROR, "attempted to kill a non-speculative tuple");
5606  Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5607 
5608  /*
5609  * No need to check for serializable conflicts here. There is never a
5610  * need for a combocid, either. No need to extract replica identity, or
5611  * do anything special with infomask bits.
5612  */
5613 
5614  START_CRIT_SECTION();
5615 
5616  /*
5617  * The tuple will become DEAD immediately. Flag that this page
5618  * immediately is a candidate for pruning by setting xmin to
5619  * RecentGlobalXmin. That's not pretty, but it doesn't seem worth
5620  * inventing a nicer API for this.
5621  */
5622  Assert(TransactionIdIsValid(RecentGlobalXmin));
5623  PageSetPrunable(page, RecentGlobalXmin);
5624 
5625  /* store transaction information of xact deleting the tuple */
5626  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5627  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5628 
5629  /*
5630  * Set the tuple header xmin to InvalidTransactionId. This makes the
5631  * tuple immediately invisible to everyone. (In particular, to any
5632  * transactions waiting on the speculative token, woken up later.)
5633  */
5634  HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5635 
5636  /* Clear the speculative insertion token too */
5637  tp.t_data->t_ctid = tp.t_self;
5638 
5639  MarkBufferDirty(buffer);
5640 
5641  /*
5642  * XLOG stuff
5643  *
5644  * The WAL records generated here match heap_delete(). The same recovery
5645  * routines are used.
5646  */
5647  if (RelationNeedsWAL(relation))
5648  {
5649  xl_heap_delete xlrec;
5650  XLogRecPtr recptr;
5651 
5652  xlrec.flags = XLH_DELETE_IS_SUPER;
5653  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5654  tp.t_data->t_infomask2);
5655  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5656  xlrec.xmax = xid;
5657 
5658  XLogBeginInsert();
5659  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5660  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5661 
5662  /* No replica identity & replication origin logged */
5663 
5664  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5665 
5666  PageSetLSN(page, recptr);
5667  }
5668 
5669  END_CRIT_SECTION();
5670 
5671  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5672 
5673  if (HeapTupleHasExternal(&tp))
5674  {
5675  Assert(!IsToastRelation(relation));
5676  heap_toast_delete(relation, &tp, true);
5677  }
5678 
5679  /*
5680  * Never need to mark tuple for invalidation, since catalogs don't support
5681  * speculative insertion
5682  */
5683 
5684  /* Now we can release the buffer */
5685  ReleaseBuffer(buffer);
5686 
5687  /* count deletion, as we counted the insertion too */
5688  pgstat_count_heap_delete(relation);
5689 }
5690 
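/*
 * A hedged sketch of the waiter side that the two routines above unblock: a
 * backend that runs into a speculative tuple while checking uniqueness waits
 * only for the speculative insertion itself to be confirmed or aborted, not
 * for the inserter's entire transaction.  The real code of this shape lives
 * in the btree uniqueness check, not in heapam.c; the parameters here are
 * illustrative.
 */
static void
wait_for_speculative_or_xact_sketch(Relation rel, HeapTupleHeader htup,
									ItemPointer tid, TransactionId xwait)
{
	if (HeapTupleHeaderIsSpeculative(htup))
	{
		/* released as soon as the inserter finishes or aborts the insertion */
		uint32		token = HeapTupleHeaderGetSpeculativeToken(htup);

		SpeculativeInsertionWait(xwait, token);
	}
	else
		XactLockTableWait(xwait, rel, tid, XLTW_InsertIndex);
}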
5691 /*
5692  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5693  *
5694  * Overwriting violates both MVCC and transactional safety, so the uses
5695  * of this function in Postgres are extremely limited. Nonetheless we
5696  * find some places to use it.
5697  *
5698  * The tuple cannot change size, and therefore it's reasonable to assume
5699  * that its null bitmap (if any) doesn't change either. So we just
5700  * overwrite the data portion of the tuple without touching the null
5701  * bitmap or any of the header fields.
5702  *
5703  * tuple is an in-memory tuple structure containing the data to be written
5704  * over the target tuple. Also, tuple->t_self identifies the target tuple.
5705  */
5706 void
5707 heap_inplace_update(Relation relation, HeapTuple tuple)
5708 {
5709  Buffer buffer;
5710  Page page;
5711  OffsetNumber offnum;
5712  ItemId lp = NULL;
5713  HeapTupleHeader htup;
5714  uint32 oldlen;
5715  uint32 newlen;
5716 
5717  /*
5718  * For now, parallel operations are required to be strictly read-only.
5719  * Unlike a regular update, this should never create a combo CID, so it
5720  * might be possible to relax this restriction, but not without more
5721  * thought and testing. It's not clear that it would be useful, anyway.
5722  */
5723  if (IsInParallelMode())
5724  ereport(ERROR,
5725  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5726  errmsg("cannot update tuples during a parallel operation")));
5727 
5728  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
5729  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5730  page = (Page) BufferGetPage(buffer);
5731 
5732  offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
5733  if (PageGetMaxOffsetNumber(page) >= offnum)
5734  lp = PageGetItemId(page, offnum);
5735 
5736  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5737  elog(ERROR, "invalid lp");
5738 
5739  htup = (HeapTupleHeader) PageGetItem(page, lp);
5740 
5741  oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5742  newlen = tuple->t_len - tuple->t_data->t_hoff;
5743  if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5744  elog(ERROR, "wrong tuple length");
5745 
5746  /* NO EREPORT(ERROR) from here till changes are logged */
5747  START_CRIT_SECTION();
5748 
5749  memcpy((char *) htup + htup->t_hoff,
5750  (char *) tuple->t_data + tuple->t_data->t_hoff,
5751  newlen);
5752 
5753  MarkBufferDirty(buffer);
5754 
5755  /* XLOG stuff */
5756  if (RelationNeedsWAL(relation))
5757  {
5758  xl_heap_inplace xlrec;
5759  XLogRecPtr recptr;
5760 
5761  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5762 
5763  XLogBeginInsert();
5764  XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5765 
5766  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5767  XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5768 
5769  /* inplace updates aren't decoded atm, don't log the origin */
5770 
5771  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5772 
5773  PageSetLSN(page, recptr);
5774  }
5775 
5776  END_CRIT_SECTION();
5777 
5778  UnlockReleaseBuffer(buffer);
5779 
5780  /*
5781  * Send out shared cache inval if necessary. Note that because we only
5782  * pass the new version of the tuple, this mustn't be used for any
5783  * operations that could change catcache lookup keys. But we aren't
5784  * bothering with index updates either, so that's true a fortiori.
5785  */
5786  if (!IsBootstrapProcessingMode())
5787  CacheInvalidateHeapTuple(relation, tuple, NULL);
5788 }
5789 
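/*
 * A hedged sketch of the typical use of heap_inplace_update() above: VACUUM's
 * vac_update_relstats() overwrites fixed-width pg_class counters in place so
 * the new statistics become visible without waiting for commit.  This
 * fragment assumes the caller's usual headers (access/table.h,
 * catalog/pg_class.h, utils/syscache.h), which heapam.c itself does not
 * include, and is only an illustration of the pattern.
 */
static void
update_relpages_inplace_sketch(Oid relid, int32 num_pages)
{
	Relation	pgclass;
	HeapTuple	ctup;
	Form_pg_class pgcform;

	pgclass = table_open(RelationRelationId, RowExclusiveLock);

	/* fetch a copy of the pg_class tuple to scribble on */
	ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished", relid);
	pgcform = (Form_pg_class) GETSTRUCT(ctup);

	/* change only fixed-width fields, so the tuple length cannot change */
	pgcform->relpages = num_pages;

	/* overwrite the old tuple in place: no MVCC, no new version, no index work */
	heap_inplace_update(pgclass, ctup);

	table_close(pgclass, RowExclusiveLock);
}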
5790 #define FRM_NOOP 0x0001
5791 #define FRM_INVALIDATE_XMAX 0x0002
5792 #define FRM_RETURN_IS_XID 0x0004
5793 #define FRM_RETURN_IS_MULTI 0x0008
5794 #define FRM_MARK_COMMITTED 0x0010
5795 
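/*
 * A hedged sketch of how a caller interprets the FRM_* bits above, which
 * FreezeMultiXactId() (described next) hands back through its flags output
 * parameter.  The actual caller in this file is heap_prepare_freeze_tuple();
 * the output variables here are illustrative, and FRM_MARK_COMMITTED (which
 * accompanies FRM_RETURN_IS_XID) is glossed over.
 */
static void
apply_frm_flags_sketch(uint16 flags, bool *invalidate_xmax, bool *xmax_is_multi)
{
	if (flags & FRM_NOOP)
	{
		/* the existing multixact is still fine: leave xmax alone */
		*invalidate_xmax = false;
		*xmax_is_multi = true;
	}
	else if (flags & FRM_INVALIDATE_XMAX)
	{
		/* nothing of interest remains: clear xmax entirely */
		*invalidate_xmax = true;
		*xmax_is_multi = false;
	}
	else if (flags & FRM_RETURN_IS_XID)
	{
		/* a single member remains and is returned as a plain TransactionId */
		*invalidate_xmax = false;
		*xmax_is_multi = false;
	}
	else if (flags & FRM_RETURN_IS_MULTI)
	{
		/* a new, reduced multixact replaces the old one */
		*invalidate_xmax = false;
		*xmax_is_multi = true;
	}
}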
5796 /*
5797  * FreezeMultiXactId
5798  * Determine what to do during freezing when a tuple is marked by a
5799  * MultiXactId.